🧼 remove v4.44 deprecations (#34245)

* remove v4.44 deprecations

* PR comments

* deprecations scheduled for v4.50

* hub version update

* make fixup

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Joao Gante 2024-11-15 22:07:24 +00:00 committed by GitHub
parent 8d50fda644
commit 13493215ab
17 changed files with 42 additions and 175 deletions


@ -117,7 +117,7 @@ _deps = [
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"huggingface-hub>=0.23.2,<1.0",
"huggingface-hub>=0.24.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",


@ -24,7 +24,7 @@ deps = {
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
"huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",


@ -94,7 +94,7 @@ from .utils import (
replace_return_docstrings,
strtobool,
)
from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
ENV_VARS_TRUE_VALUES,
is_sagemaker_mp_enabled,
@ -381,92 +381,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
def shard_checkpoint(
state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
):
"""
Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
given size.
The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
[6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
<Tip warning={true}>
If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
have a size greater than `max_shard_size`.
</Tip>
Args:
state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
The name of the model save file.
"""
logger.warning(
"Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
"split_torch_state_dict_into_shards from huggingface_hub library"
)
max_shard_size = convert_file_size_to_int(max_shard_size)
sharded_state_dicts = [{}]
last_block_size = 0
total_size = 0
storage_id_to_block = {}
for key, weight in state_dict.items():
# when bnb serialization is used the weights in the state dict can be strings
# check: https://github.com/huggingface/transformers/pull/24416 for more details
if isinstance(weight, str):
continue
else:
storage_id = id_tensor_storage(weight)
# If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
block_id = storage_id_to_block[storage_id]
sharded_state_dicts[block_id][key] = weight
continue
weight_size = weight.numel() * dtype_byte_size(weight.dtype)
# If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
# weight in the current shard.
if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
sharded_state_dicts.append({})
last_block_size = 0
sharded_state_dicts[-1][key] = weight
last_block_size += weight_size
total_size += weight_size
storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
# If we only have one shard, we return it
if len(sharded_state_dicts) == 1:
return {weights_name: sharded_state_dicts[0]}, None
# Otherwise, let's build the index
weight_map = {}
shards = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
shard_file = shard_file.replace(
".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
)
shards[shard_file] = shard
for key in shard.keys():
weight_map[key] = shard_file
# Add the metadata
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
return shards, index
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
"""
This is the same as
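
Note: for callers that still import the removed `shard_checkpoint`, the replacement recommended by its old deprecation warning is `split_torch_state_dict_into_shards` from `huggingface_hub`. A minimal migration sketch, assuming a plain PyTorch state dict saved with `torch.save` (the output directory and toy state dict are illustrative):

import os

import torch
from huggingface_hub import split_torch_state_dict_into_shards

output_dir = "converted_model"  # illustrative path
state_dict = {"linear.weight": torch.randn(200, 100)}  # illustrative state dict

# The helper returns a split plan (shard filenames plus tensor assignment) rather than the shards themselves.
state_dict_split = split_torch_state_dict_into_shards(
    state_dict, max_shard_size="10GB", filename_pattern="pytorch_model{suffix}.bin"
)

os.makedirs(output_dir, exist_ok=True)
for filename, tensor_names in state_dict_split.filename_to_tensors.items():
    shard = {name: state_dict[name] for name in tensor_names}
    torch.save(shard, os.path.join(output_dir, filename))

# Only sharded checkpoints need an index; this mirrors the old second return value of shard_checkpoint.
if state_dict_split.is_sharded:
    index = {"metadata": state_dict_split.metadata, "weight_map": state_dict_split.tensor_to_filename}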


@ -2203,7 +2203,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -2326,7 +2326,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(


@ -153,7 +153,7 @@ class Blip2Processor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# cast to desired return tensors type


@ -1471,7 +1471,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -1610,7 +1610,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(


@ -148,7 +148,7 @@ class InstructBlipProcessor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# cast to desired return tensors type after concatenating


@ -485,7 +485,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:


@ -160,7 +160,7 @@ class LlavaProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
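
Note: the LLaVA-family warnings above now point at v4.50; the fix they ask for is to attach the two attributes to the processor and save it back. A sketch, where the checkpoint name and attribute values are illustrative assumptions and must match the model's vision config:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")  # illustrative checkpoint
processor.patch_size = 14  # must match the vision tower's patch size
processor.vision_feature_select_strategy = "default"  # must match the model config
processor.save_pretrained("llava-1.5-7b-hf-updated")  # reload from here and the warning goes away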


@ -868,7 +868,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
if input_ids.shape[1] != 1:
inputs_embeds = inputs_embeds.to(image_features.dtype)


@ -143,7 +143,7 @@ class LlavaNextProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
else:
image_sizes = iter(image_inputs["image_sizes"])


@ -21,10 +21,11 @@ import sys
import types
import torch
from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from transformers import AutoTokenizer, GPT2Config
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
def add_checkpointing_args(parser):
@ -571,7 +572,15 @@ def convert_checkpoint_from_megatron_to_transformers(args):
# Store the state_dict to file.
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size)
state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size)
shards = {}
index = None
# Group tensors by shard file so the save loop below iterates over filename -> shard pairs.
for filename, tensors in state_dict_split.filename_to_tensors.items():
shards[filename] = {tensor: output_state_dict[tensor] for tensor in tensors}
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
# Save the model
for shard_file, shard in shards.items():


@ -21,10 +21,10 @@ import os
import re
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub import hf_hub_download, split_torch_state_dict_into_shards
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint
from transformers.modeling_utils import WEIGHTS_INDEX_NAME
NUM_HIDDEN_LAYERS_MAPPING = {
@ -116,7 +116,16 @@ def convert_rmkv_checkpoint_to_hf_format(
state_dict = convert_state_dict(state_dict)
# 4. Split in shards and save
shards, index = shard_checkpoint(state_dict)
state_dict_split = split_torch_state_dict_into_shards(state_dict)
shards = {}
index = None
# Group tensors by shard file so the save loop below iterates over filename -> shard pairs.
for filename, tensors in state_dict_split.filename_to_tensors.items():
shards[filename] = {tensor: state_dict[tensor] for tensor in tensors}
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
for shard_file, shard in shards.items():
torch.save(shard, os.path.join(output_dir, shard_file))
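
Note: conversion scripts that only need "shard and save" can also skip the manual loop above. Assuming the pinned hub version (>=0.24.0) provides it, `huggingface_hub.save_torch_state_dict` applies the same split and writes the shard files plus the index in one call; a sketch, assuming safetensors output is acceptable (pass `safe_serialization=False` to keep `.bin` files):

import os

import torch
from huggingface_hub import save_torch_state_dict

state_dict = {"linear.weight": torch.randn(8, 8)}  # illustrative state dict
os.makedirs("output_dir", exist_ok=True)
# Writes the shards and, when sharding is needed, the accompanying *.index.json.
save_torch_state_dict(state_dict, save_directory="output_dir", max_shard_size="2GB")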


@ -578,7 +578,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
if input_ids.shape[1] != 1:
for features, frames in ((image_features, 1), (video_features, num_frames)):


@ -149,9 +149,10 @@ class VideoLlavaProcessor(ProcessorMixin):
if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None):
logger.warning_once(
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set "
"directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = "
"{{vision_feature_select_strategy}}`. Using processors without these attributes in the config is "
"deprecated and will throw an error in v4.50."
)
# Replace the image/video tokens with the expanded token sequence
elif encoded_images is not None:


@ -476,7 +476,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
logger.warning_once(
"Expanding inputs for image tokens in VipLLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
)
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:


@ -105,7 +105,6 @@ if is_torch_available():
_find_disjoint,
_find_identical,
dtype_byte_size,
shard_checkpoint,
)
from transformers.pytorch_utils import isin_mps_friendly
@ -668,71 +667,6 @@ class ModelUtilsTest(TestCasePlus):
for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
def test_shard_checkpoint(self):
# This is the model we will use, total size 340,000 bytes.
model = torch.nn.Sequential(
torch.nn.Linear(100, 200, bias=False), # size 80,000
torch.nn.Linear(200, 200, bias=False), # size 160,000
torch.nn.Linear(200, 100, bias=False), # size 80,000
torch.nn.Linear(100, 50, bias=False), # size 20,000
)
state_dict = model.state_dict()
with self.subTest("No shard when max size is bigger than model size"):
shards, index = shard_checkpoint(state_dict)
self.assertIsNone(index)
self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict})
with self.subTest("Test sharding, no weights bigger than max size"):
shards, index = shard_checkpoint(state_dict, max_shard_size="300kB")
# Split is first two layers then last two.
self.assertDictEqual(
index,
{
"metadata": {"total_size": 340000},
"weight_map": {
"0.weight": "pytorch_model-00001-of-00002.bin",
"1.weight": "pytorch_model-00001-of-00002.bin",
"2.weight": "pytorch_model-00002-of-00002.bin",
"3.weight": "pytorch_model-00002-of-00002.bin",
},
},
)
shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]}
shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
self.assertDictEqual(
shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2}
)
with self.subTest("Test sharding with weights bigger than max size"):
shards, index = shard_checkpoint(state_dict, max_shard_size="100kB")
# Split is first layer, second layer then last 2.
self.assertDictEqual(
index,
{
"metadata": {"total_size": 340000},
"weight_map": {
"0.weight": "pytorch_model-00001-of-00003.bin",
"1.weight": "pytorch_model-00002-of-00003.bin",
"2.weight": "pytorch_model-00003-of-00003.bin",
"3.weight": "pytorch_model-00003-of-00003.bin",
},
},
)
shard1 = {"0.weight": state_dict["0.weight"]}
shard2 = {"1.weight": state_dict["1.weight"]}
shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
self.assertDictEqual(
shards,
{
"pytorch_model-00001-of-00003.bin": shard1,
"pytorch_model-00002-of-00003.bin": shard2,
"pytorch_model-00003-of-00003.bin": shard3,
},
)
def test_checkpoint_sharding_local_bin(self):
model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
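
Note: the removed `test_shard_checkpoint` above exercised the greedy packing behaviour directly. An equivalent spot check against the hub helper, kept here as a sketch (it assumes `split_torch_state_dict_into_shards` packs greedily in key order, as `shard_checkpoint` did):

import torch
from huggingface_hub import split_torch_state_dict_into_shards

# Same toy model as the removed test: 340,000 bytes of float32 weights in total.
model = torch.nn.Sequential(
    torch.nn.Linear(100, 200, bias=False),  # 80,000 bytes
    torch.nn.Linear(200, 200, bias=False),  # 160,000 bytes
    torch.nn.Linear(200, 100, bias=False),  # 80,000 bytes
    torch.nn.Linear(100, 50, bias=False),  # 20,000 bytes
)
state_dict = model.state_dict()

split = split_torch_state_dict_into_shards(state_dict, max_shard_size=300_000)
assert split.is_sharded
assert len(split.filename_to_tensors) == 2  # first two layers in one shard, last two in the other
assert split.metadata["total_size"] == 340000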