From 8e5d1619b3e57367701d74647e87b95f8dba5409 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 27 Jun 2023 14:45:40 -0400 Subject: [PATCH] Clean load keys (#24505) * Preliminary work on some models * Fix test load missing and make sure nonpersistent buffers are tested * Always ignore nonpersistent buffers if in state_dict * Treat models * More models * Treat remaining models * Fix quality * Fix tests * Remove draft * This test is not needed anymore * Fix copies * Fix last test * Newly added models * Fix last tests * Address review comments --- src/transformers/modeling_utils.py | 32 +++++-- .../models/albert/modeling_albert.py | 20 +--- .../models/align/modeling_align.py | 5 +- .../models/altclip/modeling_altclip.py | 7 +- src/transformers/models/bart/modeling_bart.py | 13 +-- src/transformers/models/beit/modeling_beit.py | 2 +- src/transformers/models/bert/modeling_bert.py | 14 +-- .../modeling_bert_generation.py | 6 +- .../models/big_bird/modeling_big_bird.py | 13 +-- .../modeling_bigbird_pegasus.py | 11 +-- .../models/biogpt/modeling_biogpt.py | 1 - .../models/blenderbot/modeling_blenderbot.py | 11 +-- .../modeling_blenderbot_small.py | 11 +-- src/transformers/models/blip/modeling_blip.py | 7 +- .../models/blip/modeling_blip_text.py | 8 +- .../models/blip_2/modeling_blip_2.py | 6 -- .../models/bloom/modeling_bloom.py | 13 --- .../bridgetower/modeling_bridgetower.py | 8 +- .../models/camembert/modeling_camembert.py | 36 +------ .../models/canine/modeling_canine.py | 5 +- .../chinese_clip/modeling_chinese_clip.py | 7 +- src/transformers/models/clap/modeling_clap.py | 7 +- src/transformers/models/clip/modeling_clip.py | 7 +- .../models/clipseg/modeling_clipseg.py | 7 +- .../models/codegen/modeling_codegen.py | 2 +- .../models/convbert/modeling_convbert.py | 17 +--- .../models/cpmant/modeling_cpmant.py | 2 - src/transformers/models/ctrl/modeling_ctrl.py | 1 - .../data2vec/modeling_data2vec_audio.py | 1 - .../models/data2vec/modeling_data2vec_text.py | 37 +------ .../data2vec/modeling_data2vec_vision.py | 2 +- .../models/deberta/modeling_deberta.py | 11 +-- .../models/deberta_v2/modeling_deberta_v2.py | 11 +-- .../modeling_decision_transformer.py | 4 - .../modeling_deformable_detr.py | 1 - src/transformers/models/deta/modeling_deta.py | 1 - .../models/distilbert/modeling_distilbert.py | 1 - src/transformers/models/dpr/modeling_dpr.py | 5 - .../models/electra/modeling_electra.py | 8 +- .../models/ernie/modeling_ernie.py | 14 +-- .../models/ernie_m/modeling_ernie_m.py | 1 - src/transformers/models/esm/modeling_esm.py | 14 +-- .../models/flaubert/modeling_flaubert.py | 4 - .../models/flava/modeling_flava.py | 10 +- src/transformers/models/fnet/modeling_fnet.py | 7 +- src/transformers/models/fsmt/modeling_fsmt.py | 10 -- .../models/funnel/modeling_funnel.py | 1 - src/transformers/models/git/modeling_git.py | 7 +- src/transformers/models/gpt2/modeling_gpt2.py | 13 --- .../gpt_bigcode/modeling_gpt_bigcode.py | 5 - .../models/gpt_neo/modeling_gpt_neo.py | 14 +-- .../models/gpt_neox/modeling_gpt_neox.py | 8 +- .../modeling_gpt_neox_japanese.py | 1 - src/transformers/models/gptj/modeling_gptj.py | 5 - .../modeling_gptsan_japanese.py | 1 - .../models/graphormer/modeling_graphormer.py | 1 - .../models/groupvit/modeling_groupvit.py | 5 +- .../models/hubert/modeling_hubert.py | 1 - .../models/ibert/modeling_ibert.py | 18 +--- .../models/imagegpt/modeling_imagegpt.py | 8 +- .../models/jukebox/modeling_jukebox.py | 3 - 
.../models/layoutlm/modeling_layoutlm.py | 10 +- .../models/layoutlmv2/modeling_layoutlmv2.py | 10 +- .../models/layoutlmv3/modeling_layoutlmv3.py | 14 +-- src/transformers/models/led/modeling_led.py | 12 +-- .../models/levit/modeling_levit.py | 8 +- src/transformers/models/lilt/modeling_lilt.py | 23 +---- .../models/llama/modeling_llama.py | 3 - .../models/longformer/modeling_longformer.py | 9 -- .../models/longt5/modeling_longt5.py | 10 -- src/transformers/models/luke/modeling_luke.py | 13 --- .../models/lxmert/modeling_lxmert.py | 1 - .../models/m2m_100/modeling_m2m_100.py | 21 +--- .../models/marian/modeling_marian.py | 12 +-- .../models/markuplm/modeling_markuplm.py | 7 +- .../models/mbart/modeling_mbart.py | 13 +-- .../models/mctct/modeling_mctct.py | 5 +- src/transformers/models/mega/modeling_mega.py | 33 ------- .../megatron_bert/modeling_megatron_bert.py | 16 +--- .../models/mobilebert/modeling_mobilebert.py | 20 +--- .../models/mpnet/modeling_mpnet.py | 18 +--- src/transformers/models/mt5/modeling_mt5.py | 40 +------- src/transformers/models/mvp/modeling_mvp.py | 10 +- .../models/nezha/modeling_nezha.py | 10 +- .../models/nllb_moe/modeling_nllb_moe.py | 21 +--- .../nystromformer/modeling_nystromformer.py | 6 +- .../models/open_llama/modeling_open_llama.py | 3 - .../models/openai/modeling_openai.py | 9 +- src/transformers/models/opt/modeling_opt.py | 6 -- .../models/owlvit/modeling_owlvit.py | 7 +- .../models/pegasus/modeling_pegasus.py | 12 +-- .../models/pegasus_x/modeling_pegasus_x.py | 9 -- .../models/pix2struct/modeling_pix2struct.py | 8 -- .../models/plbart/modeling_plbart.py | 12 +-- .../models/prophetnet/modeling_prophetnet.py | 7 -- .../models/qdqbert/modeling_qdqbert.py | 13 +-- src/transformers/models/rag/modeling_rag.py | 1 - .../models/realm/modeling_realm.py | 9 +- .../models/reformer/modeling_reformer.py | 13 ++- .../models/rembert/modeling_rembert.py | 6 +- .../models/roberta/modeling_roberta.py | 37 +------ .../modeling_roberta_prelayernorm.py | 37 +------ .../models/roc_bert/modeling_roc_bert.py | 14 +-- .../models/roformer/modeling_roformer.py | 7 -- src/transformers/models/sam/modeling_sam.py | 1 - src/transformers/models/sew/modeling_sew.py | 1 - .../models/sew_d/modeling_sew_d.py | 1 - .../speech_to_text/modeling_speech_to_text.py | 11 --- .../modeling_speech_to_text_2.py | 1 - .../models/speecht5/modeling_speecht5.py | 21 +--- .../models/splinter/modeling_splinter.py | 5 +- .../squeezebert/modeling_squeezebert.py | 10 +- .../modeling_switch_transformers.py | 7 -- src/transformers/models/t5/modeling_t5.py | 23 +---- .../models/tapas/modeling_tapas.py | 1 - .../modeling_trajectory_transformer.py | 1 + .../models/transfo_xl/modeling_transfo_xl.py | 3 - .../models/trocr/modeling_trocr.py | 1 - .../models/unispeech/modeling_unispeech.py | 1 - .../unispeech_sat/modeling_unispeech_sat.py | 1 - src/transformers/models/vilt/modeling_vilt.py | 7 +- .../visual_bert/modeling_visual_bert.py | 7 +- .../models/wav2vec2/modeling_wav2vec2.py | 1 - .../modeling_wav2vec2_conformer.py | 1 - .../models/wavlm/modeling_wavlm.py | 1 - .../models/whisper/modeling_whisper.py | 10 -- .../models/x_clip/modeling_x_clip.py | 7 +- src/transformers/models/xglm/modeling_xglm.py | 8 -- src/transformers/models/xlm/modeling_xlm.py | 7 +- .../xlm_prophetnet/modeling_xlm_prophetnet.py | 7 -- .../xlm_roberta/modeling_xlm_roberta.py | 37 +------ .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 37 +------ .../models/xlnet/modeling_xlnet.py | 1 - src/transformers/models/xmod/modeling_xmod.py | 38 
+------- src/transformers/models/yoso/modeling_yoso.py | 10 +- tests/models/roberta/test_modeling_roberta.py | 21 ---- tests/test_modeling_common.py | 96 +++++++++++++------ tests/test_modeling_utils.py | 8 +- 138 files changed, 320 insertions(+), 1140 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2f4e755131e..c6a8661af41 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -320,8 +320,9 @@ def shard_checkpoint( weight_size = weight.numel() * dtype_byte_size(weight.dtype) - # If this weight is going to tip up over the maximal size, we split. - if last_block_size + weight_size > max_shard_size: + # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one + # weight in the current shard. + if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0: sharded_state_dicts.append({}) last_block_size = 0 @@ -3044,15 +3045,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix expected_keys = [".".join([prefix, s]) for s in expected_keys] missing_keys = list(set(expected_keys) - set(loaded_keys)) - unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + unexpected_keys = set(loaded_keys) - set(expected_keys) + # Remove nonpersistent buffers from unexpected keys: they are not in the state dict but will be in the model + # buffers + model_buffers = {n for n, _ in model.named_buffers()} + if remove_prefix_from_model: + model_buffers = {key[len(_prefix) :] if key.startswith(_prefix) else key for key in model_buffers} + elif add_prefix_to_model: + model_buffers = {".".join([prefix, key]) for key in model_buffers} + unexpected_keys = list(unexpected_keys - model_buffers) - if is_accelerate_available(): - model.tie_weights() - tied_params = find_tied_parameters(model) - else: - tied_params = [] + model.tie_weights() + ptrs = collections.defaultdict(list) + for name, tensor in model.state_dict().items(): + id_tensor = id_tensor_storage(tensor) if tensor.device != torch.device("meta") else id(tensor) + ptrs[id_tensor].append(name) + + # These are all the pointers of shared tensors. 
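The modeling_utils.py hunk above (continued just below, where `tied_params` is built from `ptrs`) rests on two PyTorch facts: buffers registered with `persistent=False` appear in `named_buffers()` but never in `state_dict()`, and tied parameters are literally the same tensor, so grouping `state_dict()` entries by storage pointer recovers the tied groups. A minimal sketch with a made-up toy module (`TinyModel` is hypothetical, and `data_ptr()` stands in for the library's `id_tensor_storage`):

```python
import collections
import torch
from torch import nn


class TinyModel(nn.Module):
    """Toy stand-in for a tied-LM-head model with a non-persistent buffer."""

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(10, 4)
        self.lm_head = nn.Linear(4, 10, bias=False)
        self.lm_head.weight = self.embed.weight  # weight tying: same Parameter object
        self.register_buffer("position_ids", torch.arange(8).expand(1, -1), persistent=False)


model = TinyModel()

# 1) Non-persistent buffers exist on the module but are absent from state_dict(),
#    which is why the patch filters their names out of unexpected_keys using
#    named_buffers().
print("position_ids" in dict(model.named_buffers()))  # True
print("position_ids" in model.state_dict())           # False

# 2) Tied weights share storage; grouping state_dict() names by data pointer
#    (a simplification of transformers' id_tensor_storage) yields the tied groups.
ptrs = collections.defaultdict(list)
for name, tensor in model.state_dict().items():
    ptrs[tensor.data_ptr()].append(name)
print([names for names in ptrs.values() if len(names) > 1])
# [['embed.weight', 'lm_head.weight']]
```

This is why the hunk can drop the `find_tied_parameters` fallback here and still find every shared-tensor group directly from the state dict.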
+ tied_params = [names for _, names in ptrs.items() if len(names) > 1] for group in tied_params: + if remove_prefix_from_model: + group = [key[len(_prefix) :] if key.startswith(_prefix) else key for key in group] + elif add_prefix_to_model: + group = [".".join([prefix, key]) for key in group] missing_in_group = [k for k in missing_keys if k in group] if len(missing_in_group) > 0 and len(missing_in_group) < len(group): missing_keys = [k for k in missing_keys if k not in missing_in_group] diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 2a2f6d6ef53..7196e14be29 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -208,7 +208,9 @@ class AlbertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -507,7 +509,6 @@ class AlbertPreTrainedModel(PreTrainedModel): config_class = AlbertConfig load_tf_weights = load_tf_weights_in_albert base_model_prefix = "albert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights.""" @@ -760,11 +761,6 @@ class AlbertModel(AlbertPreTrainedModel): ) class AlbertForPreTraining(AlbertPreTrainedModel): _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] - _keys_to_ignore_on_load_missing = [ - "predictions.decoder.weight", - "predictions.decoder.bias", - "embeddings.position_ids", - ] def __init__(self, config: AlbertConfig): super().__init__(config) @@ -912,13 +908,7 @@ class AlbertSOPHead(nn.Module): ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] - _keys_to_ignore_on_load_missing = [ - "predictions.decoder.weight", - "predictions.decoder.bias", - "embeddings.position_ids", - ] def __init__(self, config): super().__init__(config) @@ -1133,8 +1123,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config: AlbertConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1218,8 +1206,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel): ALBERT_START_DOCSTRING, ) class AlbertForQuestionAnswering(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config: AlbertConfig): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 09ee6eca626..a7d31775cf4 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -687,7 +687,9 @@ class AlignTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position 
emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1176,7 +1178,6 @@ class AlignPreTrainedModel(PreTrainedModel): config_class = AlignConfig base_model_prefix = "align" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 26b3f592808..fe2754cac80 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -216,7 +216,9 @@ class AltRobertaEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1016,7 +1018,7 @@ class AltCLIPVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -1038,7 +1040,6 @@ class AltCLIPPreTrainedModel(PreTrainedModel): config_class = AltCLIPConfig base_model_prefix = "altclip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index f426956594d..ad4d4ab9c98 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -506,7 +506,7 @@ class BartPretrainedModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"] + _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"] _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -1170,7 +1170,6 @@ class BartDecoder(BartPretrainedModel): BART_START_DOCSTRING, ) class BartModel(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BartConfig): @@ -1300,12 +1299,7 @@ class 
BartModel(BartPretrainedModel): class BartForConditionalGeneration(BartPretrainedModel): base_model_prefix = "model" _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - _keys_to_ignore_on_load_missing = [ - "final_logits_bias", - "lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] def __init__(self, config: BartConfig): super().__init__(config) @@ -1478,7 +1472,6 @@ class BartForConditionalGeneration(BartPretrainedModel): BART_START_DOCSTRING, ) class BartForSequenceClassification(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BartConfig, **kwargs): @@ -1609,7 +1602,6 @@ class BartForSequenceClassification(BartPretrainedModel): BART_START_DOCSTRING, ) class BartForQuestionAnswering(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -1748,7 +1740,6 @@ class BartDecoderWrapper(BartPretrainedModel): BART_START_DOCSTRING, ) class BartForCausalLM(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index b17721fb2bc..d698cff88b1 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -459,7 +459,7 @@ class BeitRelativePositionBias(nn.Module): relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) + self.register_buffer("relative_position_index", relative_position_index, persistent=False) def forward(self) -> torch.Tensor: relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index fb92a0e84cc..17667e8443d 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -192,7 +192,9 @@ class BertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -743,7 +745,6 @@ class BertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1053,7 +1054,6 @@ class BertModel(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): 
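The BertEmbeddings change above is the pattern repeated across every model in this patch: `position_ids` becomes a non-persistent buffer, so the blanket `_keys_to_ignore_on_load_missing = [r"position_ids"]` entries are no longer needed (the `token_type_ids` buffer was already registered with `persistent=False`). In plain-PyTorch terms, with a hypothetical stripped-down `Embeddings` module rather than the real class, new checkpoints simply omit the buffer and old checkpoints that still carry it load cleanly:

```python
import torch
from torch import nn


class Embeddings(nn.Module):
    """Hypothetical minimal embeddings module, not the real BertEmbeddings."""

    def __init__(self, max_position_embeddings=512):
        super().__init__()
        self.register_buffer(
            "position_ids",
            torch.arange(max_position_embeddings).expand(1, -1),
            persistent=False,
        )


module = Embeddings()

# New checkpoints no longer serialize the buffer at all.
print(list(module.state_dict().keys()))  # []

# Old checkpoints that still contain it are merely reported as unexpected,
# and the patched loading code drops such buffer names from that report.
old_checkpoint = {"position_ids": torch.arange(512).expand(1, -1)}
result = module.load_state_dict(old_checkpoint, strict=False)
print(result.missing_keys, result.unexpected_keys)  # [] ['position_ids']
```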
- _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1160,8 +1160,6 @@ class BertForPreTraining(BertPreTrainedModel): """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1301,8 +1299,6 @@ class BertLMHeadModel(BertPreTrainedModel): @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1715,8 +1711,6 @@ class BertForMultipleChoice(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1800,8 +1794,6 @@ class BertForTokenClassification(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index f92b7a0633e..3f4a26da459 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -556,7 +556,9 @@ class BertGenerationEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0): if input_ids is not None: @@ -588,7 +590,6 @@ class BertGenerationPreTrainedModel(PreTrainedModel): config_class = BertGenerationConfig base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -860,7 +861,6 @@ class BertGenerationOnlyLMHead(nn.Module): BERT_GENERATION_START_DOCSTRING, ) class BertGenerationDecoder(BertGenerationPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder.weight", "lm_head.decoder.bias", "embeddings.position_ids"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index e1346a23c9d..a2db2e2638f 100755 --- 
a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -257,7 +257,9 @@ class BigBirdEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1765,7 +1767,6 @@ class BigBirdPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_big_bird base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -2261,7 +2262,6 @@ class BigBirdModel(BigBirdPreTrainedModel): class BigBirdForPreTraining(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -2368,7 +2368,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel): @add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) class BigBirdForMaskedLM(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -2513,12 +2512,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel): """BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING ) class BigBirdForCausalLM(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"predictions.decoder.bias", - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index d7683d6fcf8..fe43c1e68e2 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2358,7 +2358,6 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BigBirdPegasusConfig): @@ -2491,12 +2490,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel): base_model_prefix = "model" _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - _keys_to_ignore_on_load_missing = [ - "final_logits_bias", - "lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + 
_keys_to_ignore_on_load_missing = ["final_logits_bias"] def __init__(self, config: BigBirdPegasusConfig): super().__init__(config) @@ -2669,7 +2663,6 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel): BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BigBirdPegasusConfig, **kwargs): @@ -2799,7 +2792,6 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -2932,7 +2924,6 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel): class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 3e925917cff..7f6d44502c0 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -646,7 +646,6 @@ class BioGptModel(BioGptPreTrainedModel): """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING ) class BioGptForCausalLM(BioGptPreTrainedModel): - _keys_to_ignore_on_load_missing = ["output_projection.weight"] _tied_weights_keys = ["output_projection.weight"] def __init__(self, config): diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 8e582c4fa33..a3aaf6b4a81 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1102,7 +1102,6 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): BLENDERBOT_START_DOCSTRING, ) class BlenderbotModel(BlenderbotPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: BlenderbotConfig): @@ -1244,14 +1243,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel): ) class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: BlenderbotConfig): @@ -1441,7 +1433,6 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill class BlenderbotForCausalLM(BlenderbotPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, 
config): diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 890b47373e7..70794e80a43 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1096,7 +1096,6 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): BLENDERBOT_SMALL_START_DOCSTRING, ) class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: BlenderbotSmallConfig): @@ -1226,14 +1225,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): ) class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: BlenderbotSmallConfig): @@ -1408,7 +1400,6 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 0e70333c340..115aa14e83f 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -255,7 +255,9 @@ class BlipTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -419,7 +421,6 @@ class BlipPreTrainedModel(PreTrainedModel): config_class = BlipConfig base_model_prefix = "blip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -927,7 +928,6 @@ class BlipModel(BlipPreTrainedModel): ) class BlipForConditionalGeneration(BlipPreTrainedModel): config_class = BlipConfig - _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] main_input_name = "pixel_values" @@ -1100,7 +1100,6 @@ class BlipForConditionalGeneration(BlipPreTrainedModel): ) class BlipForQuestionAnswering(BlipPreTrainedModel): config_class = BlipConfig - _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] def __init__(self, config: BlipConfig): diff --git a/src/transformers/models/blip/modeling_blip_text.py 
b/src/transformers/models/blip/modeling_blip_text.py index 1f269cf852e..444a7a22b6b 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -56,7 +56,9 @@ class BlipTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.config = config @@ -552,7 +554,6 @@ class BlipTextPreTrainedModel(PreTrainedModel): config_class = BlipTextConfig base_model_prefix = "bert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -808,9 +809,6 @@ class BlipTextModel(BlipTextPreTrainedModel): # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 class BlipTextLMHeadModel(BlipTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index b52a58d97f4..5856df2c257 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -273,12 +273,6 @@ class Blip2PreTrainedModel(PreTrainedModel): config_class = Blip2Config base_model_prefix = "blip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"language_model.encoder.embed_tokens.weight", - r"language_model.decoder.embed_tokens.weight", - r"language_model.lm_head.weight", - ] _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] _skip_keys_device_placement = "past_key_values" _keep_in_fp32_modules = ["wo"] diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 4f6de49a144..d37972a429f 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -471,12 +471,6 @@ class BloomBlock(nn.Module): class BloomPreTrainedModel(PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - config_class = BloomConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True @@ -826,7 +820,6 @@ class BloomModel(BloomPreTrainedModel): BLOOM_START_DOCSTRING, ) class BloomForCausalLM(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: BloomConfig): @@ -995,8 +988,6 @@ class BloomForCausalLM(BloomPreTrainedModel): BLOOM_START_DOCSTRING, ) class BloomForSequenceClassification(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config: BloomConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1123,8 +1114,6 @@ class BloomForSequenceClassification(BloomPreTrainedModel): BLOOM_START_DOCSTRING, ) class BloomForTokenClassification(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config: BloomConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1226,8 +1215,6 @@ class BloomForTokenClassification(BloomPreTrainedModel): BLOOM_START_DOCSTRING, ) class BloomForQuestionAnswering(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.transformer = BloomModel(config) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 4290241fbc0..1fb3cc131bc 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -280,7 +280,7 @@ class BridgeTowerVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -880,7 +880,9 @@ class BridgeTowerTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1038,8 +1040,6 @@ class BridgeTowerTextModel(BridgeTowerPreTrainedModel): config_class = BridgeTowerTextConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index e98840fbc6d..ed3afab11aa 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ 
b/src/transformers/models/camembert/modeling_camembert.py @@ -94,7 +94,9 @@ class CamembertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -627,15 +629,6 @@ class CamembertPreTrainedModel(PreTrainedModel): if isinstance(module, CamembertEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - CAMEMBERT_INPUTS_DOCSTRING = r""" Args: @@ -762,7 +755,6 @@ class CamembertModel(CamembertPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] _no_split_modules = [] # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert @@ -935,9 +927,6 @@ class CamembertModel(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForMaskedLM(CamembertPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -952,9 +941,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel): self.roberta = CamembertModel(config, add_pooling_layer=False) self.lm_head = CamembertLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1042,8 +1028,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForSequenceClassification(CamembertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1144,8 +1128,6 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForMultipleChoice(CamembertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1241,9 +1223,6 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with 
Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForTokenClassification(CamembertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1330,9 +1309,6 @@ class CamembertForTokenClassification(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForQuestionAnswering(CamembertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1431,9 +1407,6 @@ class CamembertForQuestionAnswering(CamembertPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, roberta-base->camembert-base class CamembertForCausalLM(CamembertPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1445,9 +1418,6 @@ class CamembertForCausalLM(CamembertPreTrainedModel): self.roberta = CamembertModel(config, add_pooling_layer=False) self.lm_head = CamembertLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index a91d42f0395..b863e294bdd 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -216,7 +216,9 @@ class CanineEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int): @@ -900,7 +902,6 @@ class CaninePreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_canine base_model_prefix = "canine" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 0adf5cfdcb1..86da1c7b6a8 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -121,7 +121,9 @@ class ChineseCLIPTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -190,7 +192,7 @@ class ChineseCLIPVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -689,7 +691,6 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel): config_class = ChineseCLIPConfig base_model_prefix = "chinese_clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c4dbcb03f34..0f3986ada0c 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1166,7 +1166,9 @@ class ClapTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True ) @@ -1677,7 +1679,6 @@ class ClapPreTrainedModel(PreTrainedModel): config_class = ClapConfig base_model_prefix = "clap" supports_gradient_checkpointing = False - _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] def _init_weights(self, module): """Initialize the weights""" @@ -1781,7 +1782,6 @@ class ClapTextModel(ClapPreTrainedModel): """ config_class = ClapTextConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText def __init__(self, config, add_pooling_layer=True): @@ -1936,7 +1936,6 @@ class ClapTextModel(ClapPreTrainedModel): @add_start_docstrings(CLAP_START_DOCSTRING) class ClapModel(ClapPreTrainedModel): config_class = ClapConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config: ClapConfig): super().__init__(config) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ee9d660ef71..487f756d3ff 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -188,7 +188,7 @@ class CLIPVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", 
torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -210,7 +210,9 @@ class CLIPTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -410,7 +412,6 @@ class CLIPPreTrainedModel(PreTrainedModel): config_class = CLIPConfig base_model_prefix = "clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 85b11965306..b1d120e365a 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -181,7 +181,7 @@ class CLIPSegVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def interpolate_position_embeddings(self, new_size): if len(new_size) != 2: @@ -230,7 +230,9 @@ class CLIPSegTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -433,7 +435,6 @@ class CLIPSegPreTrainedModel(PreTrainedModel): config_class = CLIPSegConfig base_model_prefix = "clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 8b1d34f59e7..4b87800cb1b 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -83,6 +83,7 @@ class CodeGenAttention(nn.Module): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -600,7 +601,6 @@ class CodeGenModel(CodeGenPreTrainedModel): CODEGEN_START_DOCSTRING, ) class CodeGenForCausalLM(CodeGenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index bbdba210c23..a3910e20dbe 100755 --- 
a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -191,7 +191,9 @@ class ConvBertEmbeddings(nn.Module): self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -245,8 +247,6 @@ class ConvBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_convbert base_model_prefix = "convbert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"] def _init_weights(self, module): """Initialize the weights""" @@ -765,8 +765,6 @@ CONVBERT_INPUTS_DOCSTRING = r""" CONVBERT_START_DOCSTRING, ) class ConvBertModel(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.embeddings = ConvBertEmbeddings(config) @@ -880,7 +878,6 @@ class ConvBertGeneratorPredictions(nn.Module): @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING) class ConvBertForMaskedLM(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids", "generator.lm_head.weight"] _tied_weights_keys = ["generator.lm_head.weight"] def __init__(self, config): @@ -992,8 +989,6 @@ class ConvBertClassificationHead(nn.Module): CONVBERT_START_DOCSTRING, ) class ConvBertForSequenceClassification(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1089,8 +1084,6 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel): CONVBERT_START_DOCSTRING, ) class ConvBertForMultipleChoice(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) @@ -1184,8 +1177,6 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel): CONVBERT_START_DOCSTRING, ) class ConvBertForTokenClassification(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1267,8 +1258,6 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel): CONVBERT_START_DOCSTRING, ) class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 33ead6a1046..808a341ac99 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -537,7 +537,6 @@ class CpmAntPreTrainedModel(PreTrainedModel): config_class = CpmAntConfig base_model_prefix = "cpmant" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] 
def _init_weights(self, module): """Initialize the weights""" @@ -749,7 +748,6 @@ class CpmAntModel(CpmAntPreTrainedModel): CPMANT_START_DOCSTRING, ) class CpmAntForCausalLM(CpmAntPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: CpmAntConfig): diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index dadcbb494cf..7cf5168e74b 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -509,7 +509,6 @@ class CTRLModel(CTRLPreTrainedModel): CTRL_START_DOCSTRING, ) class CTRLLMHeadModel(CTRLPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 76b6b4d485f..a42fb5eb067 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -689,7 +689,6 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel): config_class = Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 206fe1603b0..4c07acd1107 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -80,7 +80,9 @@ class Data2VecTextForTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -615,15 +617,6 @@ class Data2VecTextPreTrainedModel(PreTrainedModel): if isinstance(module, Data2VecTextEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - DATA2VECTEXT_START_DOCSTRING = r""" Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and @@ -714,8 +707,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -883,9 +874,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel): """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING ) class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -897,9 +885,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1038,9 +1023,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): @add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING) class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1055,9 +1037,6 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel): self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1174,8 +1153,6 @@ class Data2VecTextLMHead(nn.Module): DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1273,8 +1250,6 @@ class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel): DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1369,9 +1344,6 @@ class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = 
[r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1478,9 +1450,6 @@ class Data2VecTextClassificationHead(nn.Module): DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 77b42435489..f8fe59587af 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -470,7 +470,7 @@ class Data2VecVisionRelativePositionBias(nn.Module): relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) + self.register_buffer("relative_position_index", relative_position_index, persistent=False) def forward(self) -> torch.Tensor: relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 9a0d43db3a0..c946592730e 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -764,7 +764,9 @@ class DebertaEmbeddings(nn.Module): self.config = config # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: @@ -821,7 +823,6 @@ class DebertaPreTrainedModel(PreTrainedModel): config_class = DebertaConfig base_model_prefix = "deberta" - _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True @@ -1020,8 +1021,6 @@ class DebertaModel(DebertaPreTrainedModel): @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaForMaskedLM(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1277,8 +1276,6 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel): DEBERTA_START_DOCSTRING, ) class DebertaForTokenClassification(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1352,8 +1349,6 @@ class DebertaForTokenClassification(DebertaPreTrainedModel): DEBERTA_START_DOCSTRING, ) class DebertaForQuestionAnswering(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git 
a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 1596ad4ffad..608bca00958 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -862,7 +862,9 @@ class DebertaV2Embeddings(nn.Module): self.config = config # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: @@ -920,7 +922,6 @@ class DebertaV2PreTrainedModel(PreTrainedModel): config_class = DebertaV2Config base_model_prefix = "deberta" - _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True @@ -1120,8 +1121,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel): @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1380,8 +1379,6 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): ) # Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1455,8 +1452,6 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): DEBERTA_START_DOCSTRING, ) class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 926947b1617..064b3cb0ad7 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -476,8 +476,6 @@ class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel): class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config): super().__init__(config) @@ -747,8 +745,6 @@ class DecisionTransformerPreTrainedModel(PreTrainedModel): base_model_prefix = "decision_transformer" main_input_name = "states" supports_gradient_checkpointing = False - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 
6469cf7a65d..cdeb3c79622 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1823,7 +1823,6 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): ) class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] def __init__(self, config: DeformableDetrConfig): diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index bee84a5bf72..b5fe0ea8a8e 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1775,7 +1775,6 @@ class DetaModel(DetaPreTrainedModel): ) class DetaForObjectDetection(DetaPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] _tied_weights_keys = [r"bbox_embed\.\d+"] # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 8b71c086bbc..97300dec2d6 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -595,7 +595,6 @@ class DistilBertModel(DistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["vocab_projector.weight"] _tied_weights_keys = ["vocab_projector.weight"] def __init__(self, config: PretrainedConfig): diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index a551e507300..588440d4a6c 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -296,8 +296,6 @@ class DPRPretrainedContextEncoder(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "ctx_encoder" - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): @@ -309,8 +307,6 @@ class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "question_encoder" - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] class DPRPretrainedReader(DPRPreTrainedModel): @@ -322,7 +318,6 @@ class DPRPretrainedReader(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "span_predictor" - _keys_to_ignore_on_load_missing = [r"position_ids"] ############### diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index a7ee4ec9320..23ca78e8e06 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -161,7 +161,9 @@ class ElectraEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -672,8 +674,6 @@ class ElectraPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_electra base_model_prefix = "electra" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): @@ -1166,7 +1166,6 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ELECTRA_START_DOCSTRING, ) class ElectraForMaskedLM(ElectraPreTrainedModel): - _keys_to_ignore_on_load_missing = ["generator_lm_head.weight"] _tied_weights_keys = ["generator_lm_head.weight"] def __init__(self, config): @@ -1534,7 +1533,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): """ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING ) class ElectraForCausalLM(ElectraPreTrainedModel): - _keys_to_ignore_on_load_missing = ["generator_lm_head.weight"] _tied_weights_keys = ["generator_lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index b8df1b2d503..79b3c00280b 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -89,7 +89,9 @@ class ErnieEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -661,7 +663,6 @@ class ErniePreTrainedModel(PreTrainedModel): config_class = ErnieConfig base_model_prefix = "ernie" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -983,7 +984,6 @@ class ErnieModel(ErniePreTrainedModel): ERNIE_START_DOCSTRING, ) class ErnieForPreTraining(ErniePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie @@ -1095,8 +1095,6 @@ class ErnieForPreTraining(ErniePreTrainedModel): """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING ) class ErnieForCausalLM(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = 
["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie @@ -1243,8 +1241,6 @@ class ErnieForCausalLM(ErniePreTrainedModel): @add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING) class ErnieForMaskedLM(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie @@ -1665,8 +1661,6 @@ class ErnieForMultipleChoice(ErniePreTrainedModel): ERNIE_START_DOCSTRING, ) class ErnieForTokenClassification(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1746,8 +1740,6 @@ class ErnieForTokenClassification(ErniePreTrainedModel): ERNIE_START_DOCSTRING, ) class ErnieForQuestionAnswering(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/ernie_m/modeling_ernie_m.py b/src/transformers/models/ernie_m/modeling_ernie_m.py index 6d995cf84cb..82e40239491 100755 --- a/src/transformers/models/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/ernie_m/modeling_ernie_m.py @@ -412,7 +412,6 @@ class ErnieMPreTrainedModel(PreTrainedModel): config_class = ErnieMConfig base_model_prefix = "ernie_m" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index e0b26e0f781..43ff7d7b52b 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -96,7 +96,7 @@ class RotaryEmbedding(torch.nn.Module): # Generate and save the inverse frequency buffer (non trainable) inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) inv_freq = inv_freq - self.register_buffer("inv_freq", inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent=False) self._seq_len_cached = None self._cos_cached = None @@ -178,7 +178,9 @@ class EsmEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -783,7 +785,6 @@ class EsmModel(EsmPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
""" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = False def __init__(self, config, add_pooling_layer=True): @@ -960,8 +961,6 @@ class EsmModel(EsmPreTrainedModel): @add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING) class EsmForMaskedLM(EsmPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", "lm_head.decoder.weight"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight"] def __init__(self, config): @@ -1081,8 +1080,6 @@ class EsmLMHead(nn.Module): ESM_START_DOCSTRING, ) class EsmForSequenceClassification(EsmPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1177,9 +1174,6 @@ class EsmForSequenceClassification(EsmPreTrainedModel): ESM_START_DOCSTRING, ) class EsmForTokenClassification(EsmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 1b04da24103..318e9bfd471 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -378,8 +378,6 @@ class FlaubertPreTrainedModel(PreTrainedModel): class FlaubertModel(FlaubertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) @@ -448,7 +446,6 @@ class FlaubertModel(FlaubertPreTrainedModel): # Initialize weights and apply final processing self.post_init() - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) @@ -654,7 +651,6 @@ class FlaubertModel(FlaubertPreTrainedModel): ) # Copied transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert class FlaubertWithLMHeadModel(FlaubertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"] _tied_weights_keys = ["pred_layer.proj.weight"] def __init__(self, config): diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 5d49197f8ca..d986a17b750 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -387,7 +387,9 @@ class FlavaTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1724,12 +1726,6 @@ class FlavaGlobalContrastiveHead(nn.Module): ) class FlavaForPreTraining(FlavaPreTrainedModel): # Those are linked to xxx.bias - _keys_to_ignore_on_load_missing = [ - "mmm_text_head.decoder.bias", - 
"mmm_image_head.decoder.bias", - "mlm_head.decoder.bias", - "mim_head.decoder.bias", - ] _tied_weights_keys = [ "mmm_text_head.decoder.bias", "mmm_image_head.decoder.bias", diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 6bc526eeebc..45042147761 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -114,7 +114,9 @@ class FNetEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -411,7 +413,6 @@ class FNetPreTrainedModel(PreTrainedModel): config_class = FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -621,7 +622,6 @@ class FNetModel(FNetPreTrainedModel): FNET_START_DOCSTRING, ) class FNetForPreTraining(FNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -716,7 +716,6 @@ class FNetForPreTraining(FNetPreTrainedModel): @add_start_docstrings("""FNet Model with a `language modeling` head on top.""", FNET_START_DOCSTRING) class FNetForMaskedLM(FNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 255cf91df76..608efabf788 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1034,7 +1034,6 @@ def _get_shape(t): FSMT_START_DOCSTRING, ) class FSMTModel(PretrainedFSMTModel): - _keys_to_ignore_on_load_missing = ["decoder.output_projection.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight"] def __init__(self, config: FSMTConfig): @@ -1172,15 +1171,6 @@ class FSMTModel(PretrainedFSMTModel): ) class FSMTForConditionalGeneration(PretrainedFSMTModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - "decoder.output_projection.weight", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] _tied_weights_keys = ["model.decoder.embed_tokens.weight"] def __init__(self, config: FSMTConfig): diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 805b651f212..0ee9ed587ed 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -1190,7 +1190,6 @@ class FunnelForPreTraining(FunnelPreTrainedModel): @add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING) class 
FunnelForMaskedLM(FunnelPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: FunnelConfig) -> None: diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 23ae6d64962..89696694ff4 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -109,7 +109,9 @@ class GitEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -510,7 +512,6 @@ class GitPreTrainedModel(PreTrainedModel): config_class = GitConfig base_model_prefix = "git" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -623,7 +624,7 @@ class GitVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b9a8568f00e..58b419897a7 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -668,9 +668,6 @@ DEPARALLELIZE_DOCSTRING = r""" GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] - def __init__(self, config): super().__init__(config) @@ -957,8 +954,6 @@ class GPT2Model(GPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class GPT2LMHeadModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1151,8 +1146,6 @@ input sequence). 
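The GPT-2 family hunks below drop the ignore regexes for attn.bias and attn.masked_bias; the matching change elsewhere in the patch (CodeGen, GPT-Neo, GPT-NeoX, ImageGPT) registers those buffers with persistent=False instead. A short sketch of that pattern, again with illustrative names rather than the real transformers classes:

    import torch
    from torch import nn


    class ToyCausalSelfAttention(nn.Module):
        """Illustrative attention stub holding only the masking buffers."""

        def __init__(self, max_positions: int = 8):
            super().__init__()
            causal_mask = torch.tril(torch.ones(max_positions, max_positions, dtype=torch.bool))
            # Non-persistent: the mask is deterministic, so it is rebuilt in
            # __init__ rather than saved in (and loaded from) every checkpoint.
            self.register_buffer(
                "bias", causal_mask.view(1, 1, max_positions, max_positions), persistent=False
            )
            self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)

        def forward(self, attn_scores: torch.Tensor) -> torch.Tensor:
            seq_len = attn_scores.size(-1)
            mask = self.bias[:, :, :seq_len, :seq_len]
            return torch.where(mask, attn_scores, self.masked_bias.to(attn_scores.dtype))


    attn = ToyCausalSelfAttention()
    print(attn(torch.zeros(1, 1, 4, 4)))  # upper triangle filled with -1e9
    print(attn.state_dict())              # empty: neither buffer is serialized

With the buffers absent from checkpoints on both the save and load side, the old missing/unexpected-key regexes for them have nothing left to suppress.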
GPT2_START_DOCSTRING, ) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1381,9 +1374,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class GPT2ForSequenceClassification(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1605,9 +1595,6 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class GPT2ForQuestionAnswering(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 705d07b1da2..a45b9bd4b26 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -500,8 +500,6 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r""" GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeModel(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config): super().__init__(config) self.multi_query = config.multi_query @@ -722,7 +720,6 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel): GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -876,8 +873,6 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel): GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index b67f4ddbfac..66471b6eac2 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -145,8 +145,8 @@ class GPTNeoSelfAttention(nn.Module): if attention_type == "local": bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size)) - self.register_buffer("bias", bias) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("bias", bias, persistent=False) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.attn_dropout = nn.Dropout(float(config.attention_dropout)) self.resid_dropout = nn.Dropout(float(config.resid_dropout)) @@ -663,12 +663,6 @@ class GPTNeoModel(GPTNeoPreTrainedModel): GPT_NEO_START_DOCSTRING, ) class GPTNeoForCausalLM(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"h\.\d+\.attn\.masked_bias", - r"lm_head.weight", - r"h\.\d+\.attn\.attention\.bias", - ] - _keys_to_ignore_on_save = 
[r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -820,8 +814,6 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel): GPT_NEO_START_DOCSTRING, ) class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1025,8 +1017,6 @@ class GPTNeoForTokenClassification(GPTNeoPreTrainedModel): GPT_NEO_START_DOCSTRING, ) class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 7c3bfd1035f..841cbe1aa8f 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -100,8 +100,9 @@ class GPTNeoXAttention(nn.Module): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_emb = RotaryEmbedding( self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) @@ -600,7 +601,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel): """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING ) class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["embed_out.weight"] def __init__(self, config): @@ -775,8 +775,6 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): GPT_NEOX_START_DOCSTRING, ) class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -971,8 +969,6 @@ class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel): GPT_NEOX_START_DOCSTRING, ) class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 1671e5916ef..e7cb510e622 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -591,7 +591,6 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel): GPT_NEOX_JAPANESE_START_DOCSTRING, ) class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"] _tied_weights_keys = ["embed_out.weight"] def __init__(self, config): diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index de120167989..e9a9045a6d2 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ 
-734,7 +734,6 @@ class GPTJModel(GPTJPreTrainedModel): GPTJ_START_DOCSTRING, ) class GPTJForCausalLM(GPTJPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -933,8 +932,6 @@ class GPTJForCausalLM(GPTJPreTrainedModel): GPTJ_START_DOCSTRING, ) class GPTJForSequenceClassification(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1059,8 +1056,6 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel): GPTJ_START_DOCSTRING, ) class GPTJForQuestionAnswering(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py index 8c1cdd0b1a5..f02aa2dc839 100644 --- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py @@ -1111,7 +1111,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel): GPTSAN_JAPANESE_START_DOCSTRING, ) class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: GPTSanJapaneseConfig): diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/graphormer/modeling_graphormer.py index 2dd86b7b55f..82ffd4b1637 100755 --- a/src/transformers/models/graphormer/modeling_graphormer.py +++ b/src/transformers/models/graphormer/modeling_graphormer.py @@ -714,7 +714,6 @@ class GraphormerPreTrainedModel(PreTrainedModel): config_class = GraphormerConfig base_model_prefix = "graphormer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] main_input_name_nodes = "input_nodes" main_input_name_edges = "input_edges" diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index c19ebd13b91..9c312c0ff81 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -450,7 +450,9 @@ class GroupViTTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -767,7 +769,6 @@ class GroupViTPreTrainedModel(PreTrainedModel): config_class = GroupViTConfig base_model_prefix = "groupvit" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index af3d4e2d0ac..8228520dfd5 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ 
b/src/transformers/models/hubert/modeling_hubert.py @@ -869,7 +869,6 @@ class HubertPreTrainedModel(PreTrainedModel): base_model_prefix = "hubert" main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 7f300e01ae4..6cf484d96f7 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -80,7 +80,9 @@ class IBertEmbeddings(nn.Module): ) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy @@ -740,8 +742,6 @@ class IBertModel(IBertPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -854,8 +854,6 @@ class IBertModel(IBertPreTrainedModel): @add_start_docstrings("""I-BERT Model with a `language modeling` head on top.""", IBERT_START_DOCSTRING) class IBertForMaskedLM(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.bias", "lm_head.decoder.weight"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"] def __init__(self, config): @@ -969,8 +967,6 @@ class IBertLMHead(nn.Module): IBERT_START_DOCSTRING, ) class IBertForSequenceClassification(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1064,8 +1060,6 @@ class IBertForSequenceClassification(IBertPreTrainedModel): IBERT_START_DOCSTRING, ) class IBertForMultipleChoice(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1156,9 +1150,6 @@ class IBertForMultipleChoice(IBertPreTrainedModel): IBERT_START_DOCSTRING, ) class IBertForTokenClassification(IBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1256,9 +1247,6 @@ class IBertClassificationHead(nn.Module): IBERT_START_DOCSTRING, ) class IBertForQuestionAnswering(IBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 539119fabf2..f24cf7ae713 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -183,8 +183,9 @@ class ImageGPTAttention(nn.Module): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) + 
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -613,8 +614,6 @@ IMAGEGPT_INPUTS_DOCSTRING = r""" IMAGEGPT_START_DOCSTRING, ) class ImageGPTModel(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config: ImageGPTConfig): super().__init__(config) @@ -893,7 +892,6 @@ class ImageGPTModel(ImageGPTPreTrainedModel): IMAGEGPT_START_DOCSTRING, ) class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: ImageGPTConfig): @@ -1085,8 +1083,6 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel): IMAGEGPT_START_DOCSTRING, ) class ImageGPTForImageClassification(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config: ImageGPTConfig): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py index f7be47c0058..236d1f4ff37 100755 --- a/src/transformers/models/jukebox/modeling_jukebox.py +++ b/src/transformers/models/jukebox/modeling_jukebox.py @@ -602,7 +602,6 @@ Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/ class JukeboxVQVAE(PreTrainedModel): config_class = JukeboxVQVAEConfig base_model_prefix = "vqvae" - _keys_to_ignore_on_load_unexpected = [r"priors"] def _init_weights(self, module): if isinstance(module, nn.Embedding): # embed_tokens @@ -1792,7 +1791,6 @@ class JukeboxPrior(PreTrainedModel): """ config_class = JukeboxPriorConfig - _keys_to_ignore_on_load_unexpected = ["vqvae"] def _init_weights(self, module): init_scale = self.config.init_scale @@ -1832,7 +1830,6 @@ class JukeboxPrior(PreTrainedModel): self.level = level if level is not None else config.level self.base_model_prefix = f"priors.{self.level}" - self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." 
% self.level] self.n_ctx = config.n_ctx diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 410f7650942..26c4cd92d6e 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -68,7 +68,9 @@ class LayoutLMEmbeddings(nn.Module): self.LayerNorm = LayoutLMLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -619,7 +621,6 @@ class LayoutLMPreTrainedModel(PreTrainedModel): pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "layoutlm" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -857,11 +858,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel): @add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING) class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 5a6f39ce31a..18927fb1fde 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -77,7 +77,9 @@ class LayoutLMv2Embeddings(nn.Module): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def _calc_spatial_position_embeddings(self, bbox): try: @@ -506,7 +508,6 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel): config_class = LayoutLMv2Config pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "layoutlmv2" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -567,8 +568,11 @@ class LayoutLMv2VisualBackbone(nn.Module): self.register_buffer( "pixel_mean", torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1), + persistent=False, + ) + self.register_buffer( + "pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1), persistent=False ) - self.register_buffer("pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)) self.out_feature_key = "p2" if torch.are_deterministic_algorithms_enabled(): logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`") diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index db6618caaea..1648016b574 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -245,7 +245,9 @@ class 
LayoutLMv3TextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -750,8 +752,6 @@ class LayoutLMv3Output(nn.Module): LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3Model(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.config = config @@ -1038,9 +1038,6 @@ class LayoutLMv3ClassificationHead(nn.Module): LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1153,9 +1150,6 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel): LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1286,8 +1280,6 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 8de14242bfc..d98c8d29672 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2209,7 +2209,6 @@ class LEDDecoder(LEDPreTrainedModel): LED_START_DOCSTRING, ) class LEDModel(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: LEDConfig): @@ -2335,14 +2334,7 @@ class LEDModel(LEDPreTrainedModel): ) class LEDForConditionalGeneration(LEDPreTrainedModel): base_model_prefix = "led" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: LEDConfig): @@ -2530,7 +2522,6 @@ class LEDForConditionalGeneration(LEDPreTrainedModel): LED_START_DOCSTRING, ) class LEDForSequenceClassification(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: LEDConfig, **kwargs): @@ -2667,7 +2658,6 @@ class LEDForSequenceClassification(LEDPreTrainedModel): LED_START_DOCSTRING, ) class LEDForQuestionAnswering(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", 
"encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config): diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index e45ffa05b15..0accc28391b 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -195,7 +195,9 @@ class LevitAttention(nn.Module): self.attention_bias_cache = {} self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points)) + self.register_buffer( + "attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points), persistent=False + ) @torch.no_grad() def train(self, mode=True): @@ -271,7 +273,9 @@ class LevitAttentionSubsample(nn.Module): indices.append(attention_offsets[offset]) self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + self.register_buffer( + "attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points), persistent=False + ) @torch.no_grad() def train(self, mode=True): diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index 74454d244e8..e5783b970f8 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -59,7 +59,9 @@ class LiltTextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy @@ -610,15 +612,6 @@ class LiltPreTrainedModel(PreTrainedModel): if isinstance(module, LiltEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - LILT_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the @@ -697,8 +690,6 @@ LILT_INPUTS_DOCSTRING = r""" LILT_START_DOCSTRING, ) class LiltModel(LiltPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -847,8 +838,6 @@ class LiltModel(LiltPreTrainedModel): LILT_START_DOCSTRING, ) class LiltForSequenceClassification(LiltPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) @@ -967,9 +956,6 @@ class LiltForSequenceClassification(LiltPreTrainedModel): LILT_START_DOCSTRING, ) class LiltForTokenClassification(LiltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) @@ -1096,9 +1082,6 @@ class LiltClassificationHead(nn.Module): LILT_START_DOCSTRING, ) class LiltForQuestionAnswering(LiltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index c9debdd252d..24231c3f777 100755 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -344,7 +344,6 @@ class LlamaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -784,8 +783,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel): LLAMA_START_DOCSTRING, ) class LlamaForSequenceClassification(LlamaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 665e2cb5642..994157daa87 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1421,7 +1421,6 @@ class LongformerPreTrainedModel(PreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"position_ids"] _no_split_modules = ["LongformerSelfAttention"] def _init_weights(self, module): @@ -1770,8 +1769,6 @@ class LongformerModel(LongformerPreTrainedModel): @add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING) class LongformerForMaskedLM(LongformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder"] def __init__(self, config): @@ -1886,8 +1883,6 @@ class 
LongformerForMaskedLM(LongformerPreTrainedModel): LONGFORMER_START_DOCSTRING, ) class LongformerForSequenceClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2015,8 +2010,6 @@ class LongformerClassificationHead(nn.Module): LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2154,8 +2147,6 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel): LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 1a49444e8a5..303755ae433 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1763,10 +1763,6 @@ num_heads)`. LONGT5_START_DOCSTRING, ) class LongT5Model(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] _keys_to_ignore_on_load_unexpected = [ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] @@ -1917,11 +1913,6 @@ class LongT5Model(LongT5PreTrainedModel): @add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING) class LongT5ForConditionalGeneration(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _keys_to_ignore_on_load_unexpected = [ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] @@ -2160,7 +2151,6 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel): LONGT5_START_DOCSTRING, ) class LongT5EncoderModel(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: LongT5Config): diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index ba21d3deb32..8a3ceb14d50 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -1022,8 +1022,6 @@ LUKE_INPUTS_DOCSTRING = r""" LUKE_START_DOCSTRING, ) class LukeModel(LukePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config: LukeConfig, add_pooling_layer: bool = True): super().__init__(config) self.config = config @@ -1278,17 +1276,6 @@ class LukeLMHead(nn.Module): LUKE_START_DOCSTRING, ) class LukeForMaskedLM(LukePreTrainedModel): - _keys_to_ignore_on_save = [ - r"lm_head.decoder.weight", - r"lm_head.decoder.bias", - r"entity_predictions.decoder.weight", - ] - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"lm_head.decoder.weight", - r"lm_head.decoder.bias", - r"entity_predictions.decoder.weight", - ] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 21a279ec29c..2a1a21282ec 100644 
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -1018,7 +1018,6 @@ class LxmertModel(LxmertPreTrainedModel):
     LXMERT_START_DOCSTRING,
 )
 class LxmertForPreTraining(LxmertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.weight"]
     def __init__(self, config):
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index a9cde571f7d..20db884c636 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -131,7 +131,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
         # in forward put the weights on the correct dtype and device of the param
         emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
-        self.register_buffer("weights", emb_weights)
+        self.register_buffer("weights", emb_weights, persistent=False)
     @staticmethod
     def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
@@ -1137,14 +1137,6 @@ class M2M100Decoder(M2M100PreTrainedModel):
     M2M_100_START_DOCSTRING,
 )
 class M2M100Model(M2M100PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        "encoder.embed_tokens.weight",
-        "decoder.embed_tokens.weight",
-        "encoder.embed_positions.weights",
-        "encoder.embed_positions.bias",
-        "decoder.embed_positions.weights",
-        "decoder.embed_positions.bias",
-    ]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     def __init__(self, config: M2M100Config):
@@ -1258,17 +1250,6 @@ class M2M100Model(M2M100PreTrainedModel):
 )
 class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.version",
-        r"decoder.version",
-        r"lm_head.weight",
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-        r"encoder.embed_positions.weights",
-        r"encoder.embed_positions.bias",
-        r"decoder.embed_positions.weights",
-        r"decoder.embed_positions.bias",
-    ]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
     def __init__(self, config: M2M100Config):
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index 1d1cbe125e6..d25d1ed4bc2 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -1103,7 +1103,6 @@ class MarianDecoder(MarianPreTrainedModel):
     "The bare Marian Model outputting raw hidden-states without any specific head on top.", MARIAN_START_DOCSTRING
 )
 class MarianModel(MarianPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     def __init__(self, config: MarianConfig):
@@ -1292,13 +1291,9 @@ class MarianModel(MarianPreTrainedModel):
 class MarianMTModel(MarianPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
-        r"final_logits_bias",
-        r"encoder.version",
-        r"decoder.version",
-        r"lm_head.weight",
-        r"embed_positions",
-        "encoder.embed_tokens.weight",
-        "decoder.embed_tokens.weight",
+        "final_logits_bias",
+        "encoder.embed_positions.weight",
+        "decoder.embed_positions.weight",
     ]
     _keys_to_ignore_on_save = ["model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight"]
     _tied_weights_keys =
["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"] @@ -1561,7 +1556,6 @@ class MarianDecoderWrapper(MarianPreTrainedModel): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Marian, facebook/bart-base->Helsinki-NLP/opus-mt-fr-en class MarianForCausalLM(MarianPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 0c6847b4781..34435b898fc 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -143,7 +143,9 @@ class MarkupLMEmbeddings(nn.Module): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -713,7 +715,6 @@ class MarkupLMPreTrainedModel(PreTrainedModel): config_class = MarkupLMConfig pretrained_model_archive_map = MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "markuplm" - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with Bert->MarkupLM def _init_weights(self, module): @@ -971,8 +972,6 @@ class MarkupLMModel(MarkupLMPreTrainedModel): MARKUPLM_START_DOCSTRING, ) class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with bert->markuplm, Bert->MarkupLM def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 7bf6b1b37e9..577d950c932 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1156,7 +1156,6 @@ class MBartDecoder(MBartPreTrainedModel): MBART_START_DOCSTRING, ) class MBartModel(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MBartConfig): @@ -1277,14 +1276,7 @@ class MBartModel(MBartPreTrainedModel): ) class MBartForConditionalGeneration(MBartPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: MBartConfig): @@ -1452,7 +1444,6 @@ class MBartForConditionalGeneration(MBartPreTrainedModel): MBART_START_DOCSTRING, ) class MBartForSequenceClassification(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", 
"model.decoder.embed_tokens.weight"] def __init__(self, config: MBartConfig, **kwargs): @@ -1582,7 +1573,6 @@ class MBartForSequenceClassification(MBartPreTrainedModel): MBART_START_DOCSTRING, ) class MBartForQuestionAnswering(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] def __init__(self, config): @@ -1716,7 +1706,6 @@ class MBartDecoderWrapper(MBartPreTrainedModel): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25 class MBartForCausalLM(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/mctct/modeling_mctct.py b/src/transformers/models/mctct/modeling_mctct.py index 7f2de9f952a..4b965b27ec1 100755 --- a/src/transformers/models/mctct/modeling_mctct.py +++ b/src/transformers/models/mctct/modeling_mctct.py @@ -149,7 +149,9 @@ class MCTCTEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -443,7 +445,6 @@ class MCTCTPreTrainedModel(PreTrainedModel): config_class = MCTCTConfig base_model_prefix = "mctct" main_input_name = "input_features" - _keys_to_ignore_on_load_missing = ["position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/mega/modeling_mega.py b/src/transformers/models/mega/modeling_mega.py index 19e1cc10750..9381e60905c 100644 --- a/src/transformers/models/mega/modeling_mega.py +++ b/src/transformers/models/mega/modeling_mega.py @@ -1387,15 +1387,6 @@ class MegaPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - MEGA_START_DOCSTRING = r""" @@ -1474,8 +1465,6 @@ class MegaModel(MegaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [] - def __init__(self, config: MegaConfig, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1656,9 +1645,6 @@ class MegaModel(MegaPreTrainedModel): """MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING ) class MegaForCausalLM(MegaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.weight", r"lm_head.bias"] - _keys_to_ignore_on_load_missing = [r"lm_head.weight", r"lm_head.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: MegaConfig): @@ -1678,9 +1664,6 @@ class MegaForCausalLM(MegaPreTrainedModel): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1821,9 +1804,6 @@ class MegaForCausalLM(MegaPreTrainedModel): @add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING) class MegaForMaskedLM(MegaPreTrainedModel): - _keys_to_ignore_on_save = [r"mlm_head.weight", r"mlm_head.bias"] - _keys_to_ignore_on_load_missing = [r"mlm_head.weight", r"mlm_head.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["mlm_head.weight"] def __init__(self, config: MegaConfig): @@ -1845,9 +1825,6 @@ class MegaForMaskedLM(MegaPreTrainedModel): self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size) self.dropout = nn.Dropout(config.dropout_prob) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["mlm_head.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1931,8 +1908,6 @@ class MegaForMaskedLM(MegaPreTrainedModel): MEGA_START_DOCSTRING, ) class MegaForSequenceClassification(MegaPreTrainedModel): - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2024,8 +1999,6 @@ class MegaForSequenceClassification(MegaPreTrainedModel): MEGA_START_DOCSTRING, ) class MegaForMultipleChoice(MegaPreTrainedModel): - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) @@ -2111,9 +2084,6 @@ class MegaForMultipleChoice(MegaPreTrainedModel): MEGA_START_DOCSTRING, ) class MegaForTokenClassification(MegaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2214,9 +2184,6 @@ class MegaClassificationHead(nn.Module): MEGA_START_DOCSTRING, ) class MegaForQuestionAnswering(MegaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index bba7e7369cb..c28b681326c 100755 
--- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
@@ -149,7 +149,9 @@ class MegatronBertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
     def forward(
@@ -713,7 +715,6 @@ class MegatronBertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_megatron_bert
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1014,7 +1015,6 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
     MEGATRON_BERT_START_DOCSTRING,
 )
 class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
     _tied_weights_keys = ["cls.predictions.decoder"]
     def __init__(self, config, add_binary_head=True):
@@ -1121,8 +1121,6 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
     MEGATRON_BERT_START_DOCSTRING,
 )
 class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"]
     _tied_weights_keys = ["cls.predictions.decoder"]
     def __init__(self, config):
@@ -1267,8 +1265,6 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
 @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING)
 class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"]
     _tied_weights_keys = ["cls.predictions.decoder"]
     def __init__(self, config):
@@ -1376,8 +1372,6 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
     MEGATRON_BERT_START_DOCSTRING,
 )
 class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"predictions"]
-
     def __init__(self, config):
         super().__init__(config)
@@ -1672,8 +1666,6 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
     MEGATRON_BERT_START_DOCSTRING,
 )
 class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1752,8 +1744,6 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
     MEGATRON_BERT_START_DOCSTRING,
 )
 class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index fcd49a8f8cf..06318679fae 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -191,7 +191,9 @@ class MobileBertEmbeddings(nn.Module):
         self.dropout =
nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -686,7 +688,6 @@ class MobileBertPreTrainedModel(PreTrainedModel): pretrained_model_archive_map = MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST load_tf_weights = load_tf_weights_in_mobilebert base_model_prefix = "mobilebert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -923,11 +924,6 @@ class MobileBertModel(MobileBertPreTrainedModel): MOBILEBERT_START_DOCSTRING, ) class MobileBertForPreTraining(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1036,12 +1032,6 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) class MobileBertForMaskedLM(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1350,8 +1340,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): ) # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1553,8 +1541,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): ) # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing class MobileBertForTokenClassification(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 93e5abe72a4..68bdad1c9fd 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -83,7 +83,9 @@ class MPNetEmbeddings(nn.Module): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs): if position_ids is None: @@ -479,8 +481,6 @@ MPNET_INPUTS_DOCSTRING = r""" MPNET_START_DOCSTRING, ) class MPNetModel(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config 
@@ -570,8 +570,6 @@ class MPNetModel(MPNetPreTrainedModel): class MPNetForMaskedLM(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder"] def __init__(self, config): @@ -679,8 +677,6 @@ class MPNetLMHead(nn.Module): MPNET_START_DOCSTRING, ) class MPNetForSequenceClassification(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -773,8 +769,6 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel): MPNET_START_DOCSTRING, ) class MPNetForMultipleChoice(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -863,9 +857,6 @@ class MPNetForMultipleChoice(MPNetPreTrainedModel): MPNET_START_DOCSTRING, ) class MPNetForTokenClassification(MPNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -962,9 +953,6 @@ class MPNetClassificationHead(nn.Module): MPNET_START_DOCSTRING, ) class MPNetForQuestionAnswering(MPNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index dfeb3c10a91..03e3581cf05 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -1316,18 +1316,8 @@ class MT5Model(MT5PreTrainedModel): ```""" model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] - _keys_to_ignore_on_save = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5 @@ -1552,15 +1542,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_save = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 @@ -1897,13 +1879,6 @@ class MT5EncoderModel(MT5PreTrainedModel): model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_save = [ - 
r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.__init__ with T5->MT5 @@ -2029,14 +2004,7 @@ class MT5EncoderModel(MT5PreTrainedModel): MT5_START_DOCSTRING, ) class MT5ForQuestionAnswering(MT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5 diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index d135ee558d1..92e393b39e2 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -551,7 +551,6 @@ class MvpPreTrainedModel(PreTrainedModel): config_class = MvpConfig base_model_prefix = "model" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"] def _init_weights(self, module): std = self.config.init_std @@ -1300,8 +1299,7 @@ class MvpDecoder(MvpPreTrainedModel): MVP_START_DOCSTRING, ) class MvpModel(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + _keys_to_ignore_on_load_unexpected = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MvpConfig): @@ -1438,7 +1436,6 @@ class MvpModel(MvpPreTrainedModel): "The MVP Model with a language modeling head. 
Can be used for various text generation tasks.", MVP_START_DOCSTRING ) class MvpForConditionalGeneration(MvpPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: MvpConfig): @@ -1611,8 +1608,6 @@ class MvpForConditionalGeneration(MvpPreTrainedModel): MVP_START_DOCSTRING, ) class MvpForSequenceClassification(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MvpConfig, **kwargs): @@ -1740,8 +1735,6 @@ class MvpForSequenceClassification(MvpPreTrainedModel): MVP_START_DOCSTRING, ) class MvpForQuestionAnswering(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -1873,7 +1866,6 @@ class MvpDecoderWrapper(MvpPreTrainedModel): class MvpForCausalLM(MvpPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index 97c5b5a90ec..8d66bfe41fa 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -163,7 +163,7 @@ class NezhaRelativePositionsEncoding(nn.Module): my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) - self.register_buffer("positions_encoding", positions_encoding) + self.register_buffer("positions_encoding", positions_encoding, persistent=False) def forward(self, length): return self.positions_encoding[:length, :length, :] @@ -735,7 +735,6 @@ class NezhaPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "nezha" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"positions_encoding"] def _init_weights(self, module): """Initialize the weights""" @@ -1037,7 +1036,6 @@ class NezhaModel(NezhaPreTrainedModel): NEZHA_START_DOCSTRING, ) class NezhaForPreTraining(NezhaPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1140,8 +1138,6 @@ class NezhaForPreTraining(NezhaPreTrainedModel): @add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING) class NezhaForMaskedLM(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1542,8 +1538,6 @@ class NezhaForMultipleChoice(NezhaPreTrainedModel): NEZHA_START_DOCSTRING, ) class NezhaForTokenClassification(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1623,8 +1617,6 @@ class 
NezhaForTokenClassification(NezhaPreTrainedModel): NEZHA_START_DOCSTRING, ) class NezhaForQuestionAnswering(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 3585b1d3b62..21731455584 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -183,7 +183,7 @@ class NllbMoeSinusoidalPositionalEmbedding(nn.Module): # in forward put the weights on the correct dtype and device of the param emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) - self.register_buffer("weights", emb_weights) + self.register_buffer("weights", emb_weights, persistent=False) @staticmethod def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): @@ -1500,14 +1500,6 @@ class NllbMoeDecoder(NllbMoePreTrainedModel): NLLB_MOE_START_DOCSTRING, ) class NllbMoeModel(NllbMoePreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - "encoder.embed_positions.weights", - "encoder.embed_positions.bias", - "decoder.embed_positions.weights", - "decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: NllbMoeConfig): @@ -1641,17 +1633,6 @@ class NllbMoeModel(NllbMoePreTrainedModel): ) class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"encoder.embed_positions.weights", - r"encoder.embed_positions.bias", - r"decoder.embed_positions.weights", - r"decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: NllbMoeConfig): diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index b859b0db1d4..607deb7b0ab 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -64,7 +64,9 @@ class NystromformerEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -458,7 +460,6 @@ class NystromformerPreTrainedModel(PreTrainedModel): config_class = NystromformerConfig base_model_prefix = "nystromformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -658,7 +659,6 @@ class NystromformerModel(NystromformerPreTrainedModel): @add_start_docstrings("""Nyströmformer Model with a `language modeling` head on top.""", NYSTROMFORMER_START_DOCSTRING) class 
NystromformerForMaskedLM(NystromformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): diff --git a/src/transformers/models/open_llama/modeling_open_llama.py b/src/transformers/models/open_llama/modeling_open_llama.py index 16ad554dc31..84d5c6e78fa 100644 --- a/src/transformers/models/open_llama/modeling_open_llama.py +++ b/src/transformers/models/open_llama/modeling_open_llama.py @@ -368,7 +368,6 @@ class OpenLlamaPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OpenLlamaDecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -825,8 +824,6 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): ) # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->OPEN_LLAMA,Llama->OpenLlama class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 0949b2f7dac..23f8fc8bc7d 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -141,7 +141,9 @@ class Attention(nn.Module): if n_state % config.n_head != 0: raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}") self.register_buffer( - "bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions) + "bias", + torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions), + persistent=False, ) self.n_head = config.n_head self.split_size = n_state @@ -274,7 +276,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): config_class = OpenAIGPTConfig load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights.""" @@ -407,7 +408,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)]) - self.register_buffer("position_ids", torch.arange(config.n_positions)) + self.register_buffer("position_ids", torch.arange(config.n_positions), persistent=False) # Initialize weights and apply final processing self.post_init() @@ -529,7 +530,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -621,7 +621,6 @@ input sequence). 
OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 5ad783b92da..a473d9bd5b6 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -399,7 +399,6 @@ class OPTPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OPTDecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.init_std @@ -817,7 +816,6 @@ class OPTModel(OPTPreTrainedModel): class OPTForCausalLM(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1025,8 +1023,6 @@ class OPTForCausalLM(OPTPreTrainedModel): OPT_START_DOCSTRING, ) class OPTForSequenceClassification(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config: OPTConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1147,8 +1143,6 @@ class OPTForSequenceClassification(OPTPreTrainedModel): OPT_START_DOCSTRING, ) class OPTForQuestionAnswering(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config: OPTConfig): super().__init__(config) self.model = OPTModel(config) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index f65a0688578..34ee828a740 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -304,7 +304,7 @@ class OwlViTVisionEmbeddings(nn.Module): self.num_patches = (config.image_size // config.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -325,7 +325,9 @@ class OwlViTTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -530,7 +532,6 @@ class OwlViTPreTrainedModel(PreTrainedModel): config_class = OwlViTConfig base_model_prefix = "owlvit" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] _no_split_modules = ["OwlViTEncoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 3eac50b327f..e9121655d13 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1156,7 +1156,6 @@ class PegasusDecoder(PegasusPreTrainedModel): PEGASUS_START_DOCSTRING, ) class 
PegasusModel(PegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PegasusConfig): @@ -1309,15 +1308,7 @@ class PegasusModel(PegasusPreTrainedModel): ) class PegasusForConditionalGeneration(PegasusPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"embed_positions.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PegasusConfig): @@ -1518,7 +1509,6 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel): class PegasusForCausalLM(PegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 0763aec360f..caf736ba3ad 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1391,7 +1391,6 @@ class PegasusXDecoder(PegasusXPreTrainedModel): PEGASUS_X_START_DOCSTRING, ) class PegasusXModel(PegasusXPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PegasusXConfig): @@ -1536,14 +1535,6 @@ class PegasusXModel(PegasusXPreTrainedModel): @add_start_docstrings("The PEGASUS-X for conditional generation (e.g. 
summarization).", PEGASUS_X_START_DOCSTRING) class PegasusXForConditionalGeneration(PegasusXPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"embed_positions.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PegasusXConfig): diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 2db104a5a11..b9cfff26a26 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1597,14 +1597,6 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel): class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel): config_class = Pix2StructConfig main_input_name = "flattened_patches" - - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.layer.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] _tied_weights_keys = ["decoder.lm_head.weight"] def __init__(self, config: Pix2StructConfig): diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 30d9bd0ddc3..cf2901d43d2 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1132,7 +1132,6 @@ class PLBartDecoder(PLBartPreTrainedModel): PLBART_START_DOCSTRING, ) class PLBartModel(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PLBartConfig): @@ -1251,14 +1250,7 @@ class PLBartModel(PLBartPreTrainedModel): ) class PLBartForConditionalGeneration(PLBartPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PLBartConfig): @@ -1423,7 +1415,6 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel): PLBART_START_DOCSTRING, ) class PLBartForSequenceClassification(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PLBartConfig, **kwargs): @@ -1562,7 +1553,6 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->PLBart, facebook/bart-base->uclanlp/plbart-base class PLBartForCausalLM(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 9160d5e1eb4..1b771705ab7 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ 
b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -1744,7 +1744,6 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel): PROPHETNET_START_DOCSTRING, ) class ProphetNetModel(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"] def __init__(self, config: ProphetNetConfig): @@ -1874,11 +1873,6 @@ class ProphetNetModel(ProphetNetPreTrainedModel): PROPHETNET_START_DOCSTRING, ) class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "decoder.word_embeddings.weight", - "encoder.word_embeddings.weight", - "lm_head.weight", - ] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"] def __init__(self, config: ProphetNetConfig): @@ -2091,7 +2085,6 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel): PROPHETNET_START_DOCSTRING, ) class ProphetNetForCausalLM(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: ProphetNetConfig): diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index 47a34e95907..da60b8efea1 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -164,7 +164,9 @@ class QDQBertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -738,7 +740,6 @@ class QDQBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_qdqbert base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1012,8 +1013,6 @@ class QDQBertModel(QDQBertPreTrainedModel): """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING ) class QDQBertLMHeadModel(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] def __init__(self, config): @@ -1166,8 +1165,6 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel): @add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING) class QDQBertForMaskedLM(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] def __init__(self, config): @@ -1570,8 +1567,6 @@ class QDQBertForMultipleChoice(QDQBertPreTrainedModel): QDQBERT_START_DOCSTRING, ) class QDQBertForTokenClassification(QDQBertPreTrainedModel): - 
_keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1650,8 +1645,6 @@ class QDQBertForTokenClassification(QDQBertPreTrainedModel): QDQBERT_START_DOCSTRING, ) class QDQBertForQuestionAnswering(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 019b26ef08e..1e615512c91 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -231,7 +231,6 @@ class RagPreTrainedModel(PreTrainedModel): """ config_class = RagConfig base_model_prefix = "rag" - _keys_to_ignore_on_load_missing = [r"position_ids"] @classmethod def from_pretrained(cls, *args, **kwargs): diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index f68fc04105d..2e675a4d342 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -178,7 +178,9 @@ class RealmEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -968,7 +970,6 @@ class RealmPreTrainedModel(PreTrainedModel): config_class = RealmConfig load_tf_weights = load_tf_weights_in_realm base_model_prefix = "realm" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1147,7 +1148,6 @@ class RealmBertModel(RealmPreTrainedModel): REALM_START_DOCSTRING, ) class RealmEmbedder(RealmPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.bias"] def __init__(self, config): @@ -1378,7 +1378,6 @@ class RealmScorer(RealmPreTrainedModel): REALM_START_DOCSTRING, ) class RealmKnowledgeAugEncoder(RealmPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1529,8 +1528,6 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel): @add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING) class RealmReader(RealmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 98b4577b67d..7f3979ad21e 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -352,10 +352,10 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin): self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) # save mask value here. 
Need fp32 and fp16 mask values - self.register_buffer("self_mask_value_float16", torch.tensor(-1e3)) - self.register_buffer("self_mask_value_float32", torch.tensor(-1e5)) - self.register_buffer("mask_value_float16", torch.tensor(-1e4)) - self.register_buffer("mask_value_float32", torch.tensor(-1e9)) + self.register_buffer("self_mask_value_float16", torch.tensor(-1e3), persistent=False) + self.register_buffer("self_mask_value_float32", torch.tensor(-1e5), persistent=False) + self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False) + self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False) def forward( self, @@ -1049,8 +1049,8 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin): self.dropout = config.local_attention_probs_dropout_prob # save mask value here - self.register_buffer("mask_value_float16", torch.tensor(-1e4)) - self.register_buffer("mask_value_float32", torch.tensor(-1e9)) + self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False) + self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False) def forward( self, @@ -2185,7 +2185,6 @@ class ReformerModel(ReformerPreTrainedModel): @add_start_docstrings("""Reformer Model with a `language modeling` head on top.""", REFORMER_START_DOCSTRING) class ReformerModelWithLMHead(ReformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder.bias"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index da4ad960851..e0ab18088aa 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -158,7 +158,9 @@ class RemBertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -654,7 +656,6 @@ class RemBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_rembert base_model_prefix = "rembert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1016,7 +1017,6 @@ class RemBertForMaskedLM(RemBertPreTrainedModel): """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING ) class RemBertForCausalLM(RemBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index b0f13692460..cf71ceba7c4 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -80,7 +80,9 @@ class RobertaEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -614,15 +616,6 @@ class RobertaPreTrainedModel(PreTrainedModel): if isinstance(module, RobertaEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - ROBERTA_START_DOCSTRING = r""" @@ -711,8 +704,6 @@ class RobertaModel(RobertaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -881,9 +872,6 @@ class RobertaModel(RobertaPreTrainedModel): """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING ) class RobertaForCausalLM(RobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -895,9 +883,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel): self.roberta = RobertaModel(config, add_pooling_layer=False) self.lm_head = RobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1036,9 +1021,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel): @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING) class RobertaForMaskedLM(RobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1053,9 +1035,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel): self.roberta = RobertaModel(config, add_pooling_layer=False) self.lm_head = RobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1173,8 +1152,6 @@ class RobertaLMHead(nn.Module): ROBERTA_START_DOCSTRING, ) class RobertaForSequenceClassification(RobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1274,8 +1251,6 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel): ROBERTA_START_DOCSTRING, ) class 
RobertaForMultipleChoice(RobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1368,9 +1343,6 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel): ROBERTA_START_DOCSTRING, ) class RobertaForTokenClassification(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1478,9 +1450,6 @@ class RobertaClassificationHead(nn.Module): ROBERTA_START_DOCSTRING, ) class RobertaForQuestionAnswering(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index b1e02e27f13..c9b455716fc 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -83,7 +83,9 @@ class RobertaPreLayerNormEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -617,15 +619,6 @@ class RobertaPreLayerNormPreTrainedModel(PreTrainedModel): if isinstance(module, RobertaPreLayerNormEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - ROBERTA_PRELAYERNORM_START_DOCSTRING = r""" @@ -714,8 +707,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -886,9 +877,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -902,9 +890,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel): self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) self.lm_head = RobertaPreLayerNormLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1045,9 +1030,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel): """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING ) class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm @@ -1063,9 +1045,6 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel): self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) self.lm_head = RobertaPreLayerNormLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1185,8 +1164,6 @@ class RobertaPreLayerNormLMHead(nn.Module): ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1286,8 +1263,6 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrained ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with 
ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1382,9 +1357,6 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel): ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1492,9 +1464,6 @@ class RobertaPreLayerNormClassificationHead(nn.Module): ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 7647c14a9ea..c57537ecf3e 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -190,7 +190,9 @@ class RoCBertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -777,7 +779,6 @@ class RoCBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_roc_bert base_model_prefix = "roc_bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1081,7 +1082,6 @@ class RoCBertModel(RoCBertPreTrainedModel): ROC_BERT_START_DOCSTRING, ) class RoCBertForPreTraining(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1267,8 +1267,6 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel): @add_start_docstrings("""RoCBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) class RoCBertForMaskedLM(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RoCBert,bert->roc_bert @@ -1409,8 +1407,6 @@ class RoCBertForMaskedLM(RoCBertPreTrainedModel): """RoCBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING ) class RoCBertForCausalLM(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", 
r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RoCBertForCausalLM,Bert->RoCBert,bert->roc_bert @@ -1804,8 +1800,6 @@ class RoCBertForMultipleChoice(RoCBertPreTrainedModel): ROC_BERT_START_DOCSTRING, ) class RoCBertForTokenClassification(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) @@ -1892,8 +1886,6 @@ class RoCBertForTokenClassification(RoCBertPreTrainedModel): ROC_BERT_START_DOCSTRING, ) class RoCBertForQuestionAnswering(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index b966bf4490a..ad91766f966 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -696,11 +696,6 @@ class RoFormerPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_roformer base_model_prefix = "roformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [] - _keys_to_ignore_on_load_unexpected = [ - r"roformer.embeddings_project.weight", - r"roformer.embeddings_project.bias", - ] def _init_weights(self, module): """Initialize the weights""" @@ -952,7 +947,6 @@ class RoFormerModel(RoFormerPreTrainedModel): @add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING) class RoFormerForMaskedLM(RoFormerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1055,7 +1049,6 @@ class RoFormerForMaskedLM(RoFormerPreTrainedModel): """RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING ) class RoFormerForCausalLM(RoFormerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index c3cbaa9176f..43d88232e36 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -1190,7 +1190,6 @@ SAM_INPUTS_DOCSTRING = r""" SAM_START_DOCSTRING, ) class SamModel(SamPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"prompt_encoder.shared_embedding.positional_embedding"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] def __init__(self, config): diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 6b0869c87ad..67b4bf1a0c6 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -723,7 +723,6 @@ class SEWPreTrainedModel(PreTrainedModel): 
base_model_prefix = "sew" main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 7f7c1977d69..6ae717d9a28 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1257,7 +1257,6 @@ class SEWDPreTrainedModel(PreTrainedModel): config_class = SEWDConfig base_model_prefix = "sew-d" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 862dcac2ce7..1af805a1790 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1266,17 +1266,6 @@ class Speech2TextModel(Speech2TextPreTrainedModel): ) class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"model.encoder.embed_positions.weights", - r"model.decoder.embed_positions.weights", - r"lm_head.weight", - ] - _keys_to_ignore_on_save = [ - r"model.encoder.embed_positions.weights", - r"model.decoder.embed_positions.weights", - ] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: Speech2TextConfig): diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index a04fd82d4b8..822025e40ae 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -756,7 +756,6 @@ class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel): SPEECH_TO_TEXT_2_START_DOCSTRING, ) class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 301ed54af4d..b77d775714a 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -441,7 +441,7 @@ class SpeechT5ScaledPositionalEncoding(nn.Module): pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0) super().__init__() - self.register_buffer("pe", pe) + self.register_buffer("pe", pe, persistent=False) self.dropout = nn.Dropout(p=dropout) self.dim = dim self.alpha = torch.nn.Parameter(torch.tensor(1.0)) @@ -1251,8 +1251,6 @@ class SpeechT5PreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - def _init_weights(self, module): """Initialize the weights""" if isinstance(module, SpeechT5PositionalConvEmbedding): @@ -2326,13 +2324,6 @@ class SpeechT5Model(SpeechT5PreTrainedModel): SPEECHT5_START_DOCSTRING, ) class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - r"text_decoder_postnet.lm_head.weight", - ] - 
_keys_to_ignore_on_save = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"] def __init__(self, config: SpeechT5Config): @@ -2638,9 +2629,6 @@ def _generate_speech( SPEECHT5_START_DOCSTRING, ) class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [] - _keys_to_ignore_on_save = [] - main_input_name = "input_ids" def __init__(self, config: SpeechT5Config): @@ -2859,13 +2847,6 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel): SPEECHT5_START_DOCSTRING, ) class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] - _keys_to_ignore_on_save = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] - def __init__(self, config: SpeechT5Config): super().__init__(config) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 6e636fb695d..193481e57f2 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -61,7 +61,9 @@ class SplinterEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( @@ -524,7 +526,6 @@ class SplinterPreTrainedModel(PreTrainedModel): config_class = SplinterConfig base_model_prefix = "splinter" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 3264d16ebbb..b82de3a0b06 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -64,7 +64,9 @@ class SqueezeBertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: @@ -425,7 +427,6 @@ class SqueezeBertPreTrainedModel(PreTrainedModel): config_class = SqueezeBertConfig base_model_prefix = "transformer" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -643,11 +644,6 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel): @add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING) class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] 
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 008e23531ac..98899af150a 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -1337,7 +1337,6 @@ num_heads)`. SWITCH_TRANSFORMERS_START_DOCSTRING, ) class SwitchTransformersModel(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight", r"decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: SwitchTransformersConfig): @@ -1506,11 +1505,6 @@ class SwitchTransformersModel(SwitchTransformersPreTrainedModel): """SWITCH_TRANSFORMERS Model with a `language modeling` head on top.""", SWITCH_TRANSFORMERS_START_DOCSTRING ) class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: SwitchTransformersConfig): @@ -1819,7 +1813,6 @@ class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedMod SWITCH_TRANSFORMERS_START_DOCSTRING, ) class SwitchTransformersEncoderModel(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: SwitchTransformersConfig): diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index f1f7b17c7bf..7934b10b0a2 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1326,12 +1326,8 @@ num_heads)`. 
T5_START_DOCSTRING, ) class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] @@ -1530,13 +1526,8 @@ class T5Model(T5PreTrainedModel): @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -1845,7 +1836,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel): T5_START_DOCSTRING, ) class T5EncoderModel(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: T5Config): @@ -1963,14 +1953,7 @@ class T5EncoderModel(T5PreTrainedModel): T5_START_DOCSTRING, ) class T5ForQuestionAnswering(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: T5Config): diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 1621653f3ee..832a731b5bf 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -998,7 +998,6 @@ class TapasModel(TapasPreTrainedModel): @add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING) class TapasForMaskedLM(TapasPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] config_class = TapasConfig base_model_prefix = "tapas" diff --git a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py index e8ecedccb5e..1f634a9893d 100644 --- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -284,6 +284,7 @@ class CausalSelfAttention(nn.Module): torch.tril(torch.ones(config.block_size, config.block_size)).view( 1, 1, config.block_size, config.block_size ), + persistent=False, ) # mask previous value estimates diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index d0f6cc029fb..8ba96905242 100644 --- 
a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -1002,7 +1002,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel): TRANSFO_XL_START_DOCSTRING, ) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"] _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"] def __init__(self, config): @@ -1191,8 +1190,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): TRANSFO_XL_START_DOCSTRING, ) class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 3ad4ff1bac5..cd4e522bf54 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -788,7 +788,6 @@ class TrOCRDecoderWrapper(TrOCRPreTrainedModel): TROCR_START_DOCSTRING, ) class TrOCRForCausalLM(TrOCRPreTrainedModel): - _keys_to_ignore_on_load_missing = ["output_projection.weight"] _tied_weights_keys = ["output_projection.weight"] def __init__(self, config): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 16c08bbbf3e..9737433089f 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -974,7 +974,6 @@ class UniSpeechPreTrainedModel(PreTrainedModel): config_class = UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index b57369ea6f7..4c4ab4b90f3 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -988,7 +988,6 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): config_class = UniSpeechSatConfig base_model_prefix = "unispeech_sat" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 6ee1e396a62..4d5283bae60 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -249,7 +249,9 @@ class TextEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -886,7 +888,6 @@ class ViltPooler(nn.Module): VILT_START_DOCSTRING, ) class 
ViltForMaskedLM(ViltPreTrainedModel): - _keys_to_ignore_on_load_missing = ["mlm_score.decoder.bias"] _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"] def __init__(self, config): @@ -1419,8 +1420,6 @@ class ViltForImagesAndTextClassification(ViltPreTrainedModel): VILT_START_DOCSTRING, ) class ViltForTokenClassification(ViltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 0bef6e4af9d..0706eb1f1c4 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -78,7 +78,9 @@ class VisualBertEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) # For Visual Features # Token type and position embedding for image features @@ -531,7 +533,6 @@ class VisualBertPreTrainedModel(PreTrainedModel): config_class = VisualBertConfig base_model_prefix = "visual_bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -871,7 +872,6 @@ class VisualBertModel(VisualBertPreTrainedModel): VISUAL_BERT_START_DOCSTRING, ) class VisualBertForPreTraining(VisualBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1462,7 +1462,6 @@ class VisualBertRegionToPhraseAttention(nn.Module): VISUAL_BERT_START_DOCSTRING, ) class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 1c8965c9600..3e48dc530de 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1089,7 +1089,6 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): config_class = Wav2Vec2Config base_model_prefix = "wav2vec2" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 7a757d0a51f..d5836de3394 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1087,7 +1087,6 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): config_class = Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git 
a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index d782a47402f..d573ee601b4 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -974,7 +974,6 @@ class WavLMPreTrainedModel(PreTrainedModel): config_class = WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index cffb2810838..fa9eae4c479 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1225,8 +1225,6 @@ class WhisperDecoder(WhisperPreTrainedModel): WHISPER_START_DOCSTRING, ) class WhisperModel(WhisperPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"proj_out.weight"] - def __init__(self, config: WhisperConfig): super().__init__(config) @@ -1396,14 +1394,6 @@ class WhisperModel(WhisperPreTrainedModel): ) class WhisperForConditionalGeneration(WhisperPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"proj_out.weight", - ] - _keys_to_ignore_on_save = [ - r"proj_out.weight", - ] _tied_weights_keys = ["proj_out.weight"] def __init__(self, config: WhisperConfig): diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 8db4ee0fd19..bcf91b0b51d 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -139,7 +139,7 @@ class XCLIPVisionEmbeddings(nn.Module): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -162,7 +162,9 @@ class XCLIPTextEmbeddings(nn.Module): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -481,7 +483,6 @@ class XCLIPPreTrainedModel(PreTrainedModel): config_class = XCLIPConfig base_model_prefix = "x_clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index b7172127d90..9e578cebf19 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -749,14 +749,6 @@ class XGLMModel(XGLMPreTrainedModel): ) class XGLMForCausalLM(XGLMPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"model.embed_positions.weights", - r"embed_positions.weights", - r"lm_head.weight", - ] - 
_keys_to_ignore_on_save = [ - r"model.embed_positions.weights", - ] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index a448b4b1163..d342cde80d3 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -391,8 +391,6 @@ XLM_INPUTS_DOCSTRING = r""" XLM_START_DOCSTRING, ) class XLMModel(XLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -461,7 +459,9 @@ class XLMModel(XLMPreTrainedModel): # Initialize weights and apply final processing self.post_init() - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def get_input_embeddings(self): return self.embeddings @@ -670,7 +670,6 @@ class XLMPredLayer(nn.Module): XLM_START_DOCSTRING, ) class XLMWithLMHeadModel(XLMPreTrainedModel): - _keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"] _tied_weights_keys = ["pred_layer.proj.weight"] def __init__(self, config): diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index 2d14bfb6a7b..c84e3fac5ae 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -1768,7 +1768,6 @@ class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel): ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetModel with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class XLMProphetNetModel(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"] def __init__(self, config: XLMProphetNetConfig): @@ -1899,11 +1898,6 @@ class XLMProphetNetModel(XLMProphetNetPreTrainedModel): ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "decoder.word_embeddings.weight", - "encoder.word_embeddings.weight", - "lm_head.weight", - ] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"] def __init__(self, config: XLMProphetNetConfig): @@ -2119,7 +2113,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel): ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForCausalLM with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: XLMProphetNetConfig): diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py 
b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index ae8d51a3f8e..881f60875db 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -81,7 +81,9 @@ class XLMRobertaEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -616,15 +618,6 @@ class XLMRobertaPreTrainedModel(PreTrainedModel): if isinstance(module, XLMRobertaEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - XLM_ROBERTA_START_DOCSTRING = r""" @@ -713,8 +706,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -885,9 +876,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -899,9 +887,6 @@ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel): self.roberta = XLMRobertaModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1044,9 +1029,6 @@ class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1061,9 +1043,6 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel): self.roberta = XLMRobertaModel(config, add_pooling_layer=False) self.lm_head = 
XLMRobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1183,8 +1162,6 @@ class XLMRobertaLMHead(nn.Module): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1285,8 +1262,6 @@ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1382,9 +1357,6 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1494,9 +1466,6 @@ class XLMRobertaClassificationHead(nn.Module): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index fb86717e1d7..4299880e0c4 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -73,7 +73,9 @@ class XLMRobertaXLEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -599,15 +601,6 @@ class XLMRobertaXLPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - XLM_ROBERTA_XL_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the @@ -679,8 +672,6 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel): an input to the forward pass. .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRobertaXL def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -850,9 +841,6 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForCausalLM(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -864,9 +852,6 @@ class XLMRobertaXLForCausalLM(XLMRobertaXLPreTrainedModel): self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaXLLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() def get_output_embeddings(self): @@ -1001,9 +986,6 @@ class XLMRobertaXLForCausalLM(XLMRobertaXLPreTrainedModel): """XLM-RoBERTa-xlarge Model with a `language modeling` head on top.""", XLM_ROBERTA_XL_START_DOCSTRING ) class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1018,9 +1000,6 @@ class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel): self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaXLLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() def get_output_embeddings(self): @@ -1129,8 +1108,6 @@ class XLMRobertaXLLMHead(nn.Module): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForSequenceClassification(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1225,8 +1202,6 @@ class XLMRobertaXLForSequenceClassification(XLMRobertaXLPreTrainedModel): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForMultipleChoice(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1318,9 +1293,6 @@ class XLMRobertaXLForMultipleChoice(XLMRobertaXLPreTrainedModel): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForTokenClassification(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - 
_keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1432,9 +1404,6 @@ class XLMRobertaXLClassificationHead(nn.Module): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForQuestionAnswering(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index bea8ab643b1..87bf48d61ed 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1292,7 +1292,6 @@ class XLNetModel(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetLMHeadModel(XLNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_loss.weight"] _tied_weights_keys = ["lm_loss.weight"] def __init__(self, config): diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index d99b77fedda..c44cded4995 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -74,7 +74,9 @@ class XmodEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -682,16 +684,6 @@ class XmodPreTrainedModel(PreTrainedModel): if isinstance(module, XmodEncoder): module.gradient_checkpointing = value - # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel.update_keys_to_ignore - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - def set_default_language(self, language: str): """ Set the default language code for the model. This is used when the language is not specified in the input. 
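The `persistent=False` argument added to `register_buffer` in each of the embedding modules above is what makes the removed `_keys_to_ignore_on_load_missing = [r"position_ids"]` entries redundant: a non-persistent buffer is rebuilt in `__init__` and never written to the `state_dict`, so it can neither show up as a missing key nor as an unexpected one when a checkpoint is loaded. A minimal PyTorch sketch, separate from the patch and with made-up names, illustrating the difference:

```python
import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    """Hypothetical module showing persistent vs. non-persistent buffers."""

    def __init__(self, max_position_embeddings: int = 8):
        super().__init__()
        # Persistent buffer (the default): serialized into the state_dict, so a
        # checkpoint that lacks it is reported as a missing key on load.
        self.register_buffer("saved_ids", torch.arange(max_position_embeddings).expand((1, -1)))
        # Non-persistent buffer: recreated here every time and never serialized,
        # hence never "missing" and never "unexpected" at load time.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )


module = ToyEmbeddings()
print(sorted(module.state_dict()))           # ['saved_ids'] -- position_ids is not serialized
print(sorted(dict(module.named_buffers())))  # ['position_ids', 'saved_ids'] -- both usable at runtime
```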
@@ -811,8 +803,6 @@ class XmodModel(XmodPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Xmod def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -989,9 +979,6 @@ class XmodModel(XmodPreTrainedModel): XMOD_START_DOCSTRING, ) class XmodForCausalLM(XmodPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.__init__ with Roberta->Xmod @@ -1004,9 +991,6 @@ class XmodForCausalLM(XmodPreTrainedModel): self.roberta = XmodModel(config, add_pooling_layer=False) self.lm_head = XmodLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1152,9 +1136,6 @@ class XmodForCausalLM(XmodPreTrainedModel): XMOD_START_DOCSTRING, ) class XmodForMaskedLM(XmodPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with Roberta->Xmod @@ -1170,9 +1151,6 @@ class XmodForMaskedLM(XmodPreTrainedModel): self.roberta = XmodModel(config, add_pooling_layer=False) self.lm_head = XmodLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1285,8 +1263,6 @@ class XmodLMHead(nn.Module): XMOD_START_DOCSTRING, ) class XmodForSequenceClassification(XmodPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1380,8 +1356,6 @@ class XmodForSequenceClassification(XmodPreTrainedModel): XMOD_START_DOCSTRING, ) class XmodForMultipleChoice(XmodPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1471,9 +1445,6 @@ class XmodForMultipleChoice(XmodPreTrainedModel): XMOD_START_DOCSTRING, ) class XmodForTokenClassification(XmodPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1576,9 +1547,6 @@ class XmodClassificationHead(nn.Module): XMOD_START_DOCSTRING, ) class XmodForQuestionAnswering(XmodPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = 
[r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 8c2ff9fa4e0..4d4ef9a4f50 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -252,7 +252,9 @@ class YosoEmbeddings(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -649,7 +651,6 @@ class YosoPreTrainedModel(PreTrainedModel): config_class = YosoConfig base_model_prefix = "yoso" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -849,11 +850,6 @@ class YosoModel(YosoPreTrainedModel): @add_start_docstrings("""YOSO Model with a `language modeling` head on top.""", YOSO_START_DOCSTRING) class YosoForMaskedLM(YosoPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 49caa67d4f6..7ca78e23b7f 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -15,7 +15,6 @@ import unittest -from copy import deepcopy from transformers import RobertaConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device @@ -579,23 +578,3 @@ class RobertaModelIntegrationTest(TestCasePlus): # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - - # XXX: this might be a candidate for common tests if we have many of those - def test_lm_head_ignore_keys(self): - keys_to_ignore_on_save_tied = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - keys_to_ignore_on_save_untied = [r"lm_head.decoder.bias"] - config = RobertaConfig.from_pretrained(ROBERTA_TINY) - config_tied = deepcopy(config) - config_tied.tie_word_embeddings = True - config_untied = deepcopy(config) - config_untied.tie_word_embeddings = False - for cls in [RobertaForMaskedLM, RobertaForCausalLM]: - model = cls(config_tied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_tied, cls) - - # the keys should be different when embeddings aren't tied - model = cls(config_untied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_untied, cls) - - # test that saving works with updated ignore keys - just testing that it doesn't fail - model.save_pretrained(self.get_auto_remove_tmp_dir()) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 07a8b16bfef..878e3c64730 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ 
-1562,7 +1562,7 @@ class ModelTesterMixin: @require_safetensors def test_can_use_safetensors(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model_tied = model_class(config) with tempfile.TemporaryDirectory() as d: @@ -1579,6 +1579,8 @@ class ModelTesterMixin: torch.testing.assert_close( v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}" ) + # Checking there was no complaint of missing weights + self.assertEqual(infos["missing_keys"], []) # Checking the tensor sharing is correct ptrs = defaultdict(list) @@ -1595,6 +1597,25 @@ class ModelTesterMixin: f"The shared pointers are incorrect, found different pointers for keys {shared_names}", ) + def test_load_save_without_tied_weights(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.tie_word_embeddings = False + for model_class in self.all_model_classes: + model = model_class(config) + with tempfile.TemporaryDirectory() as d: + model.save_pretrained(d) + + model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) + # Checking the state dicts are correct + reloaded_state = model_reloaded.state_dict() + for k, v in model.state_dict().items(): + self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") + torch.testing.assert_close( + v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}" + ) + # Checking there was no complaint of missing weights + self.assertEqual(infos["missing_keys"], []) + def test_tied_weights_keys(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() config.tie_word_embeddings = True @@ -1620,55 +1641,72 @@ class ModelTesterMixin: tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] tied_params = [group for group in tied_params if len(group) > 1] - self.assertListEqual(tied_params, []) + self.assertListEqual( + tied_params, + [], + f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", + ) - def test_tied_model_weights_key_ignore(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_model_weights_reload_no_missing_tied_weights(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - model_tied = model_class(config) - with tempfile.TemporaryDirectory() as d: - model_tied.save_pretrained(d) + model = model_class(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) # We are nuking ALL weights on file, so every parameter should # yell on load. We're going to detect if we yell too much, or too little. - with open(os.path.join(d, "pytorch_model.bin"), "wb") as f: + with open(os.path.join(tmp_dir, "pytorch_model.bin"), "wb") as f: torch.save({}, f) - model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) - - # ! Actually we could use `state_dict()` and check iteratively the tensors which are the same (for instance using `tensor.data_ptr()`). to detect the duplicates.
- # ```python - # model = GPT2LMHeadModel.from_pretrained("gpt2") - # "lm_head.weight" in model.state_dict().keys() # True - # "lm_head.weight" in model.named_parameters() # False - # In [6]: model.lm_head.weight.data_ptr() - # Out[6]: 139901378371648 - # In [9]: model.transformer.wte.weight.data_ptr() - # Out[9]: 139901378371648 # Same PTR, it's the same DATA ! we would need to check for stride too to be 100% accurate. - # ``` + model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True) prefix = f"{model_reloaded.base_model_prefix}." params = dict(model_reloaded.named_parameters()) params.update(dict(model_reloaded.named_buffers())) - # param_names = set(k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()) param_names = {k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()} missing_keys = set(infos["missing_keys"]) extra_missing = missing_keys - param_names - # missed_missing = param_names - missing_keys + # Remove tied weights from extra missing: they are normally not warned as missing if their tied + # counterpart is present but here there are no weights at all so we do get the warning. + ptrs = collections.defaultdict(list) + for name, tensor in model_reloaded.state_dict().items(): + ptrs[id_tensor_storage(tensor)].append(name) + tied_params = [names for _, names in ptrs.items() if len(names) > 1] + for group in tied_params: + group = {k[len(prefix) :] if k.startswith(prefix) else k for k in group} + # We remove the group from extra_missing if not all weights from group are in it + if len(group - extra_missing) > 0: + extra_missing = extra_missing - set(group) self.assertEqual( extra_missing, set(), - f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}", + f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}. " + f"For debugging, tied parameters are {tied_params}", ) - # self.assertEqual( - # missed_missing, - # set(), - # f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real" - # " parameters", - # ) + missed_missing = param_names - missing_keys + # Remove nonpersistent buffers from missed_missing + buffers = [n for n, _ in model_reloaded.named_buffers()] + nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()} + nonpersistent_buffers = { + k[len(prefix) :] if k.startswith(prefix) else k for k in nonpersistent_buffers + } + missed_missing = missed_missing - nonpersistent_buffers + + if model_reloaded._keys_to_ignore_on_load_missing is None: + expected_missing = set() + else: + expected_missing = set(model_reloaded._keys_to_ignore_on_load_missing) + self.assertEqual( + missed_missing, + expected_missing, + f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real" + " parameters. 
If they are non persistent buffers make sure to instantiate them with" + " `persistent=False`", + ) def test_model_outputs_equivalence(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 3b441ec7e58..17ddf1963a2 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -500,8 +500,8 @@ class ModelUtilsTest(TestCasePlus): self.assertTrue(os.path.isfile(weights_index_file)) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME))) - for i in range(1, 6): - weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["bin"]) + for i in range(1, 5): + weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00005"] + ["bin"]) weights_name_file = os.path.join(tmp_dir, weights_name) self.assertTrue(os.path.isfile(weights_name_file)) @@ -546,8 +546,8 @@ class ModelUtilsTest(TestCasePlus): self.assertTrue(os.path.isfile(weights_index_file)) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) - for i in range(1, 6): - weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["safetensors"]) + for i in range(1, 5): + weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00005"] + ["safetensors"]) weights_name_file = os.path.join(tmp_dir, weights_name) self.assertTrue(os.path.isfile(weights_name_file))