Clean load keys (#24505)

* Preliminary work on some models

* Fix test load missing and make sure nonpersistent buffers are tested

* Always ignore nonpersistent buffers if in state_dict

* Treat models

* More models

* Treat remaining models

* Fix quality

* Fix tests

* Remove draft

* This test is not needed anymore

* Fix copies

* Fix last test

* Newly added models

* Fix last tests

* Address review comments
Sylvain Gugger 2023-06-27 14:45:40 -04:00 committed by GitHub
parent 53194991e9
commit 8e5d1619b3
138 changed files with 320 additions and 1140 deletions

View File

@ -320,8 +320,9 @@ def shard_checkpoint(
weight_size = weight.numel() * dtype_byte_size(weight.dtype)
# If this weight is going to tip up over the maximal size, we split.
if last_block_size + weight_size > max_shard_size:
# If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
# weight in the current shard.
if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
sharded_state_dicts.append({})
last_block_size = 0
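
For context, here is a minimal standalone sketch (not the library implementation; `dtype_byte_size` is simplified) of the rule the hunk above introduces: a new shard is only opened once the current one holds at least one weight, so a tensor larger than `max_shard_size` becomes its own shard instead of leaving an empty one behind.

```python
import torch

def dtype_byte_size(dtype: torch.dtype) -> float:
    # Simplified: bytes per element (e.g. 4 for float32, 2 for float16).
    bits = torch.finfo(dtype).bits if dtype.is_floating_point else torch.iinfo(dtype).bits
    return bits / 8

def shard_by_size(state_dict, max_shard_size=5 * 1024**3):
    shards, current_size = [{}], 0
    for name, weight in state_dict.items():
        weight_size = weight.numel() * dtype_byte_size(weight.dtype)
        # Split only if this weight would overflow AND the current shard already holds
        # something; otherwise an oversized first weight would leave an empty shard behind.
        if current_size + weight_size > max_shard_size and len(shards[-1]) > 0:
            shards.append({})
            current_size = 0
        shards[-1][name] = weight
        current_size += weight_size
    return shards

shards = shard_by_size({"big": torch.zeros(1024, 1024), "small": torch.zeros(4)}, max_shard_size=1024)
print([list(s) for s in shards])  # [['big'], ['small']] -- no empty leading shard
```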
@ -3044,15 +3045,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
expected_keys = [".".join([prefix, s]) for s in expected_keys]
missing_keys = list(set(expected_keys) - set(loaded_keys))
unexpected_keys = list(set(loaded_keys) - set(expected_keys))
unexpected_keys = set(loaded_keys) - set(expected_keys)
# Remove nonpersistent buffers from unexpected keys: they are not in the state dict but will be in the model
# buffers
model_buffers = {n for n, _ in model.named_buffers()}
if remove_prefix_from_model:
model_buffers = {key[len(_prefix) :] if key.startswith(_prefix) else key for key in model_buffers}
elif add_prefix_to_model:
model_buffers = {".".join([prefix, key]) for key in model_buffers}
unexpected_keys = list(unexpected_keys - model_buffers)
if is_accelerate_available():
model.tie_weights()
tied_params = find_tied_parameters(model)
else:
tied_params = []
model.tie_weights()
ptrs = collections.defaultdict(list)
for name, tensor in model.state_dict().items():
id_tensor = id_tensor_storage(tensor) if tensor.device != torch.device("meta") else id(tensor)
ptrs[id_tensor].append(name)
# These are all the pointers of shared tensors.
tied_params = [names for _, names in ptrs.items() if len(names) > 1]
for group in tied_params:
if remove_prefix_from_model:
group = [key[len(_prefix) :] if key.startswith(_prefix) else key for key in group]
elif add_prefix_to_model:
group = [".".join([prefix, key]) for key in group]
missing_in_group = [k for k in missing_keys if k in group]
if len(missing_in_group) > 0 and len(missing_in_group) < len(group):
missing_keys = [k for k in missing_keys if k not in missing_in_group]
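
The hunk above swaps accelerate's `find_tied_parameters` for a direct scan of the model's state dict: tensors that share the same underlying storage are treated as tied, and non-persistent buffers are filtered out of the unexpected keys. A minimal sketch of the shared-storage idea, assuming a recent PyTorch (`Tensor.untyped_storage()`) and a toy module rather than `PreTrainedModel`:

```python
import collections
import torch
import torch.nn as nn

def find_shared_tensors(model: nn.Module):
    """Group state-dict entries that point at the same underlying storage."""
    ptrs = collections.defaultdict(list)
    for name, tensor in model.state_dict().items():
        # Meta tensors have no real storage, so fall back to the Python id, as above.
        if tensor.device == torch.device("meta"):
            key = id(tensor)
        else:
            key = (tensor.device, tensor.untyped_storage().data_ptr())
        ptrs[key].append(name)
    # Groups with more than one name are tied (shared) weights.
    return [names for names in ptrs.values() if len(names) > 1]

class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(10, 4)
        self.lm_head = nn.Linear(4, 10, bias=False)
        self.lm_head.weight = self.embed.weight  # tie the output head to the embeddings

print(find_shared_tensors(TinyLM()))  # [['embed.weight', 'lm_head.weight']]
```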

View File

@ -208,7 +208,9 @@ class AlbertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
@ -507,7 +509,6 @@ class AlbertPreTrainedModel(PreTrainedModel):
config_class = AlbertConfig
load_tf_weights = load_tf_weights_in_albert
base_model_prefix = "albert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights."""
@ -760,11 +761,6 @@ class AlbertModel(AlbertPreTrainedModel):
)
class AlbertForPreTraining(AlbertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
_keys_to_ignore_on_load_missing = [
"predictions.decoder.weight",
"predictions.decoder.bias",
"embeddings.position_ids",
]
def __init__(self, config: AlbertConfig):
super().__init__(config)
@ -912,13 +908,7 @@ class AlbertSOPHead(nn.Module):
ALBERT_START_DOCSTRING,
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
_keys_to_ignore_on_load_missing = [
"predictions.decoder.weight",
"predictions.decoder.bias",
"embeddings.position_ids",
]
def __init__(self, config):
super().__init__(config)
@ -1133,8 +1123,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING,
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config: AlbertConfig):
super().__init__(config)
self.num_labels = config.num_labels
@ -1218,8 +1206,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config: AlbertConfig):
super().__init__(config)
self.num_labels = config.num_labels
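
The recurring change in this and the following model files is `register_buffer(..., persistent=False)`. A non-persistent buffer still lives on the module but is never written to, or expected in, the state dict, which is why entries such as `_keys_to_ignore_on_load_missing = [r"position_ids"]` can be dropped. A small illustration with a toy module (not the Albert code):

```python
import torch
import torch.nn as nn

class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings=512):
        super().__init__()
        # Non-persistent: available at runtime, excluded from serialization.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )

module = ToyEmbeddings()
print("position_ids" in module.state_dict())  # False -- never saved
print(module.position_ids.shape)              # torch.Size([1, 512]) -- still usable in forward
missing, unexpected = module.load_state_dict({}, strict=False)
print(missing, unexpected)                    # [] [] -- nothing to ignore at load time
```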

View File

@ -687,7 +687,9 @@ class AlignTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -1176,7 +1178,6 @@ class AlignPreTrainedModel(PreTrainedModel):
config_class = AlignConfig
base_model_prefix = "align"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -216,7 +216,9 @@ class AltRobertaEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -1016,7 +1018,7 @@ class AltCLIPVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -1038,7 +1040,6 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
config_class = AltCLIPConfig
base_model_prefix = "altclip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -506,7 +506,7 @@ class BartPretrainedModel(PreTrainedModel):
config_class = BartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]
_keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
_no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
@ -1170,7 +1170,6 @@ class BartDecoder(BartPretrainedModel):
BART_START_DOCSTRING,
)
class BartModel(BartPretrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BartConfig):
@ -1300,12 +1299,7 @@ class BartModel(BartPretrainedModel):
class BartForConditionalGeneration(BartPretrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_keys_to_ignore_on_load_missing = [
"final_logits_bias",
"lm_head.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
def __init__(self, config: BartConfig):
super().__init__(config)
@ -1478,7 +1472,6 @@ class BartForConditionalGeneration(BartPretrainedModel):
BART_START_DOCSTRING,
)
class BartForSequenceClassification(BartPretrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BartConfig, **kwargs):
@ -1609,7 +1602,6 @@ class BartForSequenceClassification(BartPretrainedModel):
BART_START_DOCSTRING,
)
class BartForQuestionAnswering(BartPretrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
@ -1748,7 +1740,6 @@ class BartDecoderWrapper(BartPretrainedModel):
BART_START_DOCSTRING,
)
class BartForCausalLM(BartPretrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
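
Likewise, keys listed in `_tied_weights_keys` no longer have to be repeated in `_keys_to_ignore_on_load_missing`: once the head is tied to the shared embeddings, a checkpoint that only stores the embedding weight also fills the head, and the loading logic shown earlier drops such partially missing tied groups from the missing keys. A hedged toy sketch with a plain `nn.Module` and hypothetical names:

```python
import torch
import torch.nn as nn

class TinySeq2SeqHead(nn.Module):
    _tied_weights_keys = ["lm_head.weight"]  # convention used in the diff above

    def __init__(self, vocab=10, dim=4):
        super().__init__()
        self.shared = nn.Embedding(vocab, dim)
        self.lm_head = nn.Linear(dim, vocab, bias=False)
        self.lm_head.weight = self.shared.weight  # tie

model = TinySeq2SeqHead()
checkpoint = {"shared.weight": torch.randn(10, 4)}  # checkpoint without the tied head
result = model.load_state_dict(checkpoint, strict=False)
print(result.missing_keys)  # ['lm_head.weight'] -- tied, so the loader above can discard it
print(torch.equal(model.lm_head.weight, model.shared.weight))  # True: the tie filled it in
```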

View File

@ -459,7 +459,7 @@ class BeitRelativePositionBias(nn.Module):
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index)
self.register_buffer("relative_position_index", relative_position_index, persistent=False)
def forward(self) -> torch.Tensor:
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(

View File

@ -192,7 +192,9 @@ class BertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -743,7 +745,6 @@ class BertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_bert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1053,7 +1054,6 @@ class BertModel(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
@ -1160,8 +1160,6 @@ class BertForPreTraining(BertPreTrainedModel):
"""Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
)
class BertLMHeadModel(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
@ -1301,8 +1299,6 @@ class BertLMHeadModel(BertPreTrainedModel):
@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
@ -1715,8 +1711,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1800,8 +1794,6 @@ class BertForTokenClassification(BertPreTrainedModel):
BERT_START_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -556,7 +556,9 @@ class BertGenerationEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0):
if input_ids is not None:
@ -588,7 +590,6 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
config_class = BertGenerationConfig
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -860,7 +861,6 @@ class BertGenerationOnlyLMHead(nn.Module):
BERT_GENERATION_START_DOCSTRING,
)
class BertGenerationDecoder(BertGenerationPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.decoder.weight", "lm_head.decoder.bias", "embeddings.position_ids"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):

View File

@ -257,7 +257,9 @@ class BigBirdEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -1765,7 +1767,6 @@ class BigBirdPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_big_bird
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -2261,7 +2262,6 @@ class BigBirdModel(BigBirdPreTrainedModel):
class BigBirdForPreTraining(BigBirdPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -2368,7 +2368,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING)
class BigBirdForMaskedLM(BigBirdPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -2513,12 +2512,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
"""BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING
)
class BigBirdForCausalLM(BigBirdPreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"position_ids",
r"predictions.decoder.bias",
"cls.predictions.decoder.weight",
"cls.predictions.decoder.bias",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):

View File

@ -2358,7 +2358,6 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BigBirdPegasusConfig):
@ -2491,12 +2490,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_keys_to_ignore_on_load_missing = [
"final_logits_bias",
"lm_head.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
def __init__(self, config: BigBirdPegasusConfig):
super().__init__(config)
@ -2669,7 +2663,6 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BigBirdPegasusConfig, **kwargs):
@ -2799,7 +2792,6 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
BIGBIRD_PEGASUS_START_DOCSTRING,
)
class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
@ -2932,7 +2924,6 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -646,7 +646,6 @@ class BioGptModel(BioGptPreTrainedModel):
"""BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
)
class BioGptForCausalLM(BioGptPreTrainedModel):
_keys_to_ignore_on_load_missing = ["output_projection.weight"]
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):

View File

@ -1102,7 +1102,6 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
BLENDERBOT_START_DOCSTRING,
)
class BlenderbotModel(BlenderbotPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: BlenderbotConfig):
@ -1244,14 +1243,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
)
class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: BlenderbotConfig):
@ -1441,7 +1433,6 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill
class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -1096,7 +1096,6 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
BLENDERBOT_SMALL_START_DOCSTRING,
)
class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: BlenderbotSmallConfig):
@ -1226,14 +1225,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
)
class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: BlenderbotSmallConfig):
@ -1408,7 +1400,6 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M
class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -255,7 +255,9 @@ class BlipTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -419,7 +421,6 @@ class BlipPreTrainedModel(PreTrainedModel):
config_class = BlipConfig
base_model_prefix = "blip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -927,7 +928,6 @@ class BlipModel(BlipPreTrainedModel):
)
class BlipForConditionalGeneration(BlipPreTrainedModel):
config_class = BlipConfig
_keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
_tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
main_input_name = "pixel_values"
@ -1100,7 +1100,6 @@ class BlipForConditionalGeneration(BlipPreTrainedModel):
)
class BlipForQuestionAnswering(BlipPreTrainedModel):
config_class = BlipConfig
_keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
_tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
def __init__(self, config: BlipConfig):

View File

@ -56,7 +56,9 @@ class BlipTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.config = config
@ -552,7 +554,6 @@ class BlipTextPreTrainedModel(PreTrainedModel):
config_class = BlipTextConfig
base_model_prefix = "bert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -808,9 +809,6 @@ class BlipTextModel(BlipTextPreTrainedModel):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class BlipTextLMHeadModel(BlipTextPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)

View File

@ -273,12 +273,6 @@ class Blip2PreTrainedModel(PreTrainedModel):
config_class = Blip2Config
base_model_prefix = "blip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [
r"position_ids",
r"language_model.encoder.embed_tokens.weight",
r"language_model.decoder.embed_tokens.weight",
r"language_model.lm_head.weight",
]
_no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_keep_in_fp32_modules = ["wo"]

View File

@ -471,12 +471,6 @@ class BloomBlock(nn.Module):
class BloomPreTrainedModel(PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BloomConfig
base_model_prefix = "transformer"
supports_gradient_checkpointing = True
@ -826,7 +820,6 @@ class BloomModel(BloomPreTrainedModel):
BLOOM_START_DOCSTRING,
)
class BloomForCausalLM(BloomPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: BloomConfig):
@ -995,8 +988,6 @@ class BloomForCausalLM(BloomPreTrainedModel):
BLOOM_START_DOCSTRING,
)
class BloomForSequenceClassification(BloomPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
def __init__(self, config: BloomConfig):
super().__init__(config)
self.num_labels = config.num_labels
@ -1123,8 +1114,6 @@ class BloomForSequenceClassification(BloomPreTrainedModel):
BLOOM_START_DOCSTRING,
)
class BloomForTokenClassification(BloomPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
def __init__(self, config: BloomConfig):
super().__init__(config)
self.num_labels = config.num_labels
@ -1226,8 +1215,6 @@ class BloomForTokenClassification(BloomPreTrainedModel):
BLOOM_START_DOCSTRING,
)
class BloomForQuestionAnswering(BloomPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = BloomModel(config)

View File

@ -280,7 +280,7 @@ class BridgeTowerVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -880,7 +880,9 @@ class BridgeTowerTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -1038,8 +1040,6 @@ class BridgeTowerTextModel(BridgeTowerPreTrainedModel):
config_class = BridgeTowerTextConfig
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config

View File

@ -94,7 +94,9 @@ class CamembertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -627,15 +629,6 @@ class CamembertPreTrainedModel(PreTrainedModel):
if isinstance(module, CamembertEncoder):
module.gradient_checkpointing = value
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
CAMEMBERT_INPUTS_DOCSTRING = r"""
Args:
@ -762,7 +755,6 @@ class CamembertModel(CamembertPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
_no_split_modules = []
# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert
@ -935,9 +927,6 @@ class CamembertModel(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMaskedLM(CamembertPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
@ -952,9 +941,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
self.roberta = CamembertModel(config, add_pooling_layer=False)
self.lm_head = CamembertLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
@ -1042,8 +1028,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForSequenceClassification(CamembertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1144,8 +1128,6 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForMultipleChoice(CamembertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
@ -1241,9 +1223,6 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForTokenClassification(CamembertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1330,9 +1309,6 @@ class CamembertForTokenClassification(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1431,9 +1407,6 @@ class CamembertForQuestionAnswering(CamembertPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, roberta-base->camembert-base
class CamembertForCausalLM(CamembertPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
@ -1445,9 +1418,6 @@ class CamembertForCausalLM(CamembertPreTrainedModel):
self.roberta = CamembertModel(config, add_pooling_layer=False)
self.lm_head = CamembertLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
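
The deleted `update_keys_to_ignore` helper rebuilt the ignore lists instead of mutating them because `_keys_to_ignore_on_save` and `_keys_to_ignore_on_load_missing` are class attributes shared by every instance; with those lists gone, the helper goes too. A small, hypothetical illustration of the pitfall its comment ("must make a new list, or the class variable gets modified!") guarded against:

```python
class SharedList:
    keys_to_ignore = ["lm_head.decoder.weight"]

a, b = SharedList(), SharedList()
a.keys_to_ignore.remove("lm_head.decoder.weight")  # in place: mutates the class-level list
print(b.keys_to_ignore)                            # [] -- every instance is affected

class RebuiltList:
    keys_to_ignore = ["lm_head.decoder.weight"]

c, d = RebuiltList(), RebuiltList()
c.keys_to_ignore = [k for k in c.keys_to_ignore if k != "lm_head.decoder.weight"]
print(c.keys_to_ignore, d.keys_to_ignore)          # [] ['lm_head.decoder.weight'] -- only c changes
```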

View File

@ -216,7 +216,9 @@ class CanineEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
@ -900,7 +902,6 @@ class CaninePreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_canine
base_model_prefix = "canine"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -121,7 +121,9 @@ class ChineseCLIPTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -190,7 +192,7 @@ class ChineseCLIPVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -689,7 +691,6 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel):
config_class = ChineseCLIPConfig
base_model_prefix = "chinese_clip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -1166,7 +1166,9 @@ class ClapTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
)
@ -1677,7 +1679,6 @@ class ClapPreTrainedModel(PreTrainedModel):
config_class = ClapConfig
base_model_prefix = "clap"
supports_gradient_checkpointing = False
_keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1781,7 +1782,6 @@ class ClapTextModel(ClapPreTrainedModel):
"""
config_class = ClapTextConfig
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText
def __init__(self, config, add_pooling_layer=True):
@ -1936,7 +1936,6 @@ class ClapTextModel(ClapPreTrainedModel):
@add_start_docstrings(CLAP_START_DOCSTRING)
class ClapModel(ClapPreTrainedModel):
config_class = ClapConfig
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config: ClapConfig):
super().__init__(config)

View File

@ -188,7 +188,7 @@ class CLIPVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -210,7 +210,9 @@ class CLIPTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -410,7 +412,6 @@ class CLIPPreTrainedModel(PreTrainedModel):
config_class = CLIPConfig
base_model_prefix = "clip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -181,7 +181,7 @@ class CLIPSegVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def interpolate_position_embeddings(self, new_size):
if len(new_size) != 2:
@ -230,7 +230,9 @@ class CLIPSegTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -433,7 +435,6 @@ class CLIPSegPreTrainedModel(PreTrainedModel):
config_class = CLIPSegConfig
base_model_prefix = "clip"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -83,6 +83,7 @@ class CodeGenAttention(nn.Module):
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
@ -600,7 +601,6 @@ class CodeGenModel(CodeGenPreTrainedModel):
CODEGEN_START_DOCSTRING,
)
class CodeGenForCausalLM(CodeGenPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -191,7 +191,9 @@ class ConvBertEmbeddings(nn.Module):
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -245,8 +247,6 @@ class ConvBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_convbert
base_model_prefix = "convbert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
_keys_to_ignore_on_load_unexpected = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -765,8 +765,6 @@ CONVBERT_INPUTS_DOCSTRING = r"""
CONVBERT_START_DOCSTRING,
)
class ConvBertModel(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
def __init__(self, config):
super().__init__(config)
self.embeddings = ConvBertEmbeddings(config)
@ -880,7 +878,6 @@ class ConvBertGeneratorPredictions(nn.Module):
@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class ConvBertForMaskedLM(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids", "generator.lm_head.weight"]
_tied_weights_keys = ["generator.lm_head.weight"]
def __init__(self, config):
@ -992,8 +989,6 @@ class ConvBertClassificationHead(nn.Module):
CONVBERT_START_DOCSTRING,
)
class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1089,8 +1084,6 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
CONVBERT_START_DOCSTRING,
)
class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
def __init__(self, config):
super().__init__(config)
@ -1184,8 +1177,6 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
CONVBERT_START_DOCSTRING,
)
class ConvBertForTokenClassification(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1267,8 +1258,6 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
CONVBERT_START_DOCSTRING,
)
class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
def __init__(self, config):
super().__init__(config)

View File

@ -537,7 +537,6 @@ class CpmAntPreTrainedModel(PreTrainedModel):
config_class = CpmAntConfig
base_model_prefix = "cpmant"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -749,7 +748,6 @@ class CpmAntModel(CpmAntPreTrainedModel):
CPMANT_START_DOCSTRING,
)
class CpmAntForCausalLM(CpmAntPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: CpmAntConfig):

View File

@ -509,7 +509,6 @@ class CTRLModel(CTRLPreTrainedModel):
CTRL_START_DOCSTRING,
)
class CTRLLMHeadModel(CTRLPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -689,7 +689,6 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel):
config_class = Data2VecAudioConfig
base_model_prefix = "data2vec_audio"
main_input_name = "input_values"
_keys_to_ignore_on_load_missing = [r"position_ids"]
supports_gradient_checkpointing = True
def _init_weights(self, module):

View File

@ -80,7 +80,9 @@ class Data2VecTextForTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -615,15 +617,6 @@ class Data2VecTextPreTrainedModel(PreTrainedModel):
if isinstance(module, Data2VecTextEncoder):
module.gradient_checkpointing = value
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
DATA2VECTEXT_START_DOCSTRING = r"""
Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
@ -714,8 +707,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -883,9 +874,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
"""Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
)
class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
@ -897,9 +885,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.lm_head = Data2VecTextLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
@ -1038,9 +1023,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING)
class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
@ -1055,9 +1037,6 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.lm_head = Data2VecTextLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
@ -1174,8 +1153,6 @@ class Data2VecTextLMHead(nn.Module):
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1273,8 +1250,6 @@ class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
@ -1369,9 +1344,6 @@ class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1478,9 +1450,6 @@ class Data2VecTextClassificationHead(nn.Module):
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -470,7 +470,7 @@ class Data2VecVisionRelativePositionBias(nn.Module):
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index)
self.register_buffer("relative_position_index", relative_position_index, persistent=False)
def forward(self) -> torch.Tensor:
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(

View File

@ -764,7 +764,9 @@ class DebertaEmbeddings(nn.Module):
self.config = config
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
if input_ids is not None:
@ -821,7 +823,6 @@ class DebertaPreTrainedModel(PreTrainedModel):
config_class = DebertaConfig
base_model_prefix = "deberta"
_keys_to_ignore_on_load_missing = ["position_ids"]
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
supports_gradient_checkpointing = True
@ -1020,8 +1021,6 @@ class DebertaModel(DebertaPreTrainedModel):
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaForMaskedLM(DebertaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -1277,8 +1276,6 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
DEBERTA_START_DOCSTRING,
)
class DebertaForTokenClassification(DebertaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1352,8 +1349,6 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
DEBERTA_START_DOCSTRING,
)
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -862,7 +862,9 @@ class DebertaV2Embeddings(nn.Module):
self.config = config
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
if input_ids is not None:
@ -920,7 +922,6 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
config_class = DebertaV2Config
base_model_prefix = "deberta"
_keys_to_ignore_on_load_missing = ["position_ids"]
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
supports_gradient_checkpointing = True
@ -1120,8 +1121,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -1380,8 +1379,6 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
)
# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1455,8 +1452,6 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
DEBERTA_START_DOCSTRING,
)
class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -476,8 +476,6 @@ class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
def __init__(self, config):
super().__init__(config)
@ -747,8 +745,6 @@ class DecisionTransformerPreTrainedModel(PreTrainedModel):
base_model_prefix = "decision_transformer"
main_input_name = "states"
supports_gradient_checkpointing = False
_keys_to_ignore_on_load_missing = [r"position_ids"]
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -1823,7 +1823,6 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):
)
class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
_tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
def __init__(self, config: DeformableDetrConfig):

View File

@ -1775,7 +1775,6 @@ class DetaModel(DetaPreTrainedModel):
)
class DetaForObjectDetection(DetaPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
_tied_weights_keys = [r"bbox_embed\.\d+"]
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta

View File

@ -595,7 +595,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["vocab_projector.weight"]
_tied_weights_keys = ["vocab_projector.weight"]
def __init__(self, config: PretrainedConfig):

View File

@ -296,8 +296,6 @@ class DPRPretrainedContextEncoder(DPRPreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "ctx_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
@ -309,8 +307,6 @@ class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "question_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
class DPRPretrainedReader(DPRPreTrainedModel):
@ -322,7 +318,6 @@ class DPRPretrainedReader(DPRPreTrainedModel):
config_class = DPRConfig
load_tf_weights = None
base_model_prefix = "span_predictor"
_keys_to_ignore_on_load_missing = [r"position_ids"]
###############

View File

@ -161,7 +161,9 @@ class ElectraEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
@ -672,8 +674,6 @@ class ElectraPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_electra
base_model_prefix = "electra"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
_keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"]
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
@ -1166,7 +1166,6 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
ELECTRA_START_DOCSTRING,
)
class ElectraForMaskedLM(ElectraPreTrainedModel):
_keys_to_ignore_on_load_missing = ["generator_lm_head.weight"]
_tied_weights_keys = ["generator_lm_head.weight"]
def __init__(self, config):
@ -1534,7 +1533,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
"""ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING
)
class ElectraForCausalLM(ElectraPreTrainedModel):
_keys_to_ignore_on_load_missing = ["generator_lm_head.weight"]
_tied_weights_keys = ["generator_lm_head.weight"]
def __init__(self, config):
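
Note: the Electra hunks above show the pattern this commit applies across models: buffers such as position_ids are registered with persistent=False, so they never enter the state dict and the per-model _keys_to_ignore_on_load_missing entries for them become unnecessary. A minimal sketch of that behaviour in plain PyTorch, with a made-up ToyEmbeddings class rather than the real Electra module:

import torch
from torch import nn

class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings=512, hidden_size=16):
        super().__init__()
        self.word_embeddings = nn.Embedding(100, hidden_size)
        # Non-persistent: usable in forward(), rebuilt from the config at init, never serialized.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )

module = ToyEmbeddings()
print("position_ids" in dict(module.named_buffers()))  # True: the buffer still exists on the module
print("position_ids" in module.state_dict())            # False: nothing to ignore on load or save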

View File

@ -89,7 +89,9 @@ class ErnieEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -661,7 +663,6 @@ class ErniePreTrainedModel(PreTrainedModel):
config_class = ErnieConfig
base_model_prefix = "ernie"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -983,7 +984,6 @@ class ErnieModel(ErniePreTrainedModel):
ERNIE_START_DOCSTRING,
)
class ErnieForPreTraining(ErniePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie
@ -1095,8 +1095,6 @@ class ErnieForPreTraining(ErniePreTrainedModel):
"""Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING
)
class ErnieForCausalLM(ErniePreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie
@ -1243,8 +1241,6 @@ class ErnieForCausalLM(ErniePreTrainedModel):
@add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING)
class ErnieForMaskedLM(ErniePreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie
@ -1665,8 +1661,6 @@ class ErnieForMultipleChoice(ErniePreTrainedModel):
ERNIE_START_DOCSTRING,
)
class ErnieForTokenClassification(ErniePreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie
def __init__(self, config):
super().__init__(config)
@ -1746,8 +1740,6 @@ class ErnieForTokenClassification(ErniePreTrainedModel):
ERNIE_START_DOCSTRING,
)
class ErnieForQuestionAnswering(ErniePreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie
def __init__(self, config):
super().__init__(config)

View File

@ -412,7 +412,6 @@ class ErnieMPreTrainedModel(PreTrainedModel):
config_class = ErnieMConfig
base_model_prefix = "ernie_m"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -96,7 +96,7 @@ class RotaryEmbedding(torch.nn.Module):
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
inv_freq = inv_freq
self.register_buffer("inv_freq", inv_freq)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._seq_len_cached = None
self._cos_cached = None
@ -178,7 +178,9 @@ class EsmEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
@ -783,7 +785,6 @@ class EsmModel(EsmPreTrainedModel):
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
supports_gradient_checkpointing = False
def __init__(self, config, add_pooling_layer=True):
@ -960,8 +961,6 @@ class EsmModel(EsmPreTrainedModel):
@add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING)
class EsmForMaskedLM(EsmPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", "lm_head.decoder.weight"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight"]
def __init__(self, config):
@ -1081,8 +1080,6 @@ class EsmLMHead(nn.Module):
ESM_START_DOCSTRING,
)
class EsmForSequenceClassification(EsmPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1177,9 +1174,6 @@ class EsmForSequenceClassification(EsmPreTrainedModel):
ESM_START_DOCSTRING,
)
class EsmForTokenClassification(EsmPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -378,8 +378,6 @@ class FlaubertPreTrainedModel(PreTrainedModel):
class FlaubertModel(FlaubertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config): # , dico, is_encoder, with_output):
super().__init__(config)
@ -448,7 +446,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.layerdrop = getattr(config, "layerdrop", 0.0)
self.pre_norm = getattr(config, "pre_norm", False)
@ -654,7 +651,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
)
# Copied from transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
class FlaubertWithLMHeadModel(FlaubertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"]
_tied_weights_keys = ["pred_layer.proj.weight"]
def __init__(self, config):

View File

@ -387,7 +387,9 @@ class FlavaTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -1724,12 +1726,6 @@ class FlavaGlobalContrastiveHead(nn.Module):
)
class FlavaForPreTraining(FlavaPreTrainedModel):
# Those are linked to xxx.bias
_keys_to_ignore_on_load_missing = [
"mmm_text_head.decoder.bias",
"mmm_image_head.decoder.bias",
"mlm_head.decoder.bias",
"mim_head.decoder.bias",
]
_tied_weights_keys = [
"mmm_text_head.decoder.bias",
"mmm_image_head.decoder.bias",

View File

@ -114,7 +114,9 @@ class FNetEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
@ -411,7 +413,6 @@ class FNetPreTrainedModel(PreTrainedModel):
config_class = FNetConfig
base_model_prefix = "fnet"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -621,7 +622,6 @@ class FNetModel(FNetPreTrainedModel):
FNET_START_DOCSTRING,
)
class FNetForPreTraining(FNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
@ -716,7 +716,6 @@ class FNetForPreTraining(FNetPreTrainedModel):
@add_start_docstrings("""FNet Model with a `language modeling` head on top.""", FNET_START_DOCSTRING)
class FNetForMaskedLM(FNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):

View File

@ -1034,7 +1034,6 @@ def _get_shape(t):
FSMT_START_DOCSTRING,
)
class FSMTModel(PretrainedFSMTModel):
_keys_to_ignore_on_load_missing = ["decoder.output_projection.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight"]
def __init__(self, config: FSMTConfig):
@ -1172,15 +1171,6 @@ class FSMTModel(PretrainedFSMTModel):
)
class FSMTForConditionalGeneration(PretrainedFSMTModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
"model.encoder.embed_positions.weight",
"model.decoder.embed_positions.weight",
"decoder.output_projection.weight",
]
_keys_to_ignore_on_save = [
"model.encoder.embed_positions.weight",
"model.decoder.embed_positions.weight",
]
_tied_weights_keys = ["model.decoder.embed_tokens.weight"]
def __init__(self, config: FSMTConfig):

View File

@ -1190,7 +1190,6 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
@add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
class FunnelForMaskedLM(FunnelPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: FunnelConfig) -> None:

View File

@ -109,7 +109,9 @@ class GitEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -510,7 +512,6 @@ class GitPreTrainedModel(PreTrainedModel):
config_class = GitConfig
base_model_prefix = "git"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -623,7 +624,7 @@ class GitVisionEmbeddings(nn.Module):
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]

View File

@ -668,9 +668,6 @@ DEPARALLELIZE_DOCSTRING = r"""
GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
def __init__(self, config):
super().__init__(config)
@ -957,8 +954,6 @@ class GPT2Model(GPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -1151,8 +1146,6 @@ input sequence).
GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -1381,9 +1374,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1605,9 +1595,6 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
GPT2_START_DOCSTRING,
)
class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
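
Note: for the GPT-2 head classes above, the removed _keys_to_ignore_on_load_missing entries for lm_head.weight are replaced by _tied_weights_keys. The key can be absent from a checkpoint without harm because the tied tensors share one storage, so loading the embedding also fills the head. A small illustration in plain PyTorch (ToyLM is invented for the example, not GPT-2 itself):

import torch
from torch import nn

class ToyLM(nn.Module):
    def __init__(self, vocab_size=100, hidden_size=16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.lm_head.weight = self.embed_tokens.weight  # weight tying

model = ToyLM()
state = model.state_dict()
# Both names are present, but they alias the same memory ...
print(state["embed_tokens.weight"].data_ptr() == state["lm_head.weight"].data_ptr())  # True
# ... so a checkpoint that only stores embed_tokens.weight still fully initializes lm_head.
result = model.load_state_dict({"embed_tokens.weight": torch.randn(100, 16)}, strict=False)
print(result.missing_keys)  # ['lm_head.weight'] is reported, but the tensor is already filled
print(torch.equal(model.lm_head.weight, model.embed_tokens.weight))  # True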

View File

@ -500,8 +500,6 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeModel(GPTBigCodePreTrainedModel):
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
def __init__(self, config):
super().__init__(config)
self.multi_query = config.multi_query
@ -722,7 +720,6 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -876,8 +873,6 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
GPT_BIGCODE_START_DOCSTRING,
)
class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -145,8 +145,8 @@ class GPTNeoSelfAttention(nn.Module):
if attention_type == "local":
bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
self.register_buffer("bias", bias)
self.register_buffer("masked_bias", torch.tensor(-1e9))
self.register_buffer("bias", bias, persistent=False)
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self.attn_dropout = nn.Dropout(float(config.attention_dropout))
self.resid_dropout = nn.Dropout(float(config.resid_dropout))
@ -663,12 +663,6 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"h\.\d+\.attn\.masked_bias",
r"lm_head.weight",
r"h\.\d+\.attn\.attention\.bias",
]
_keys_to_ignore_on_save = [r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -820,8 +814,6 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1025,8 +1017,6 @@ class GPTNeoForTokenClassification(GPTNeoPreTrainedModel):
GPT_NEO_START_DOCSTRING,
)
class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
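
Note: older checkpoints for these models can still contain the attn.bias / attn.masked_bias tensors that are now non-persistent buffers. The loading change in modeling_utils drops any unexpected key that matches a model buffer, which is what makes the per-class _keys_to_ignore_on_load_* lists above removable. A rough stand-alone sketch of that filtering (ToyAttention and the checkpoint dict are invented; the real logic lives in the pretrained-model loading code):

import torch
from torch import nn

class ToyAttention(nn.Module):
    def __init__(self, max_positions=8):
        super().__init__()
        self.c_attn = nn.Linear(4, 4)
        mask = torch.tril(torch.ones(max_positions, max_positions, dtype=torch.bool))
        self.register_buffer("bias", mask, persistent=False)
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

model = ToyAttention()
old_checkpoint = {
    "c_attn.weight": torch.zeros(4, 4),
    "c_attn.bias": torch.zeros(4),
    "masked_bias": torch.tensor(-1e4),  # present in old checkpoints, absent from new ones
}
result = model.load_state_dict(old_checkpoint, strict=False)
buffer_names = {name for name, _ in model.named_buffers()}
print(result.unexpected_keys)                                        # ['masked_bias']
print([k for k in result.unexpected_keys if k not in buffer_names])  # [] -- nothing to warn about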

View File

@ -100,8 +100,9 @@ class GPTNeoXAttention(nn.Module):
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e9))
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
self.rotary_emb = RotaryEmbedding(
self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base
)
@ -600,7 +601,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
"""GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
)
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):
@ -775,8 +775,6 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -971,8 +969,6 @@ class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):
GPT_NEOX_START_DOCSTRING,
)
class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -591,7 +591,6 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
GPT_NEOX_JAPANESE_START_DOCSTRING,
)
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"]
_tied_weights_keys = ["embed_out.weight"]
def __init__(self, config):

View File

@ -734,7 +734,6 @@ class GPTJModel(GPTJPreTrainedModel):
GPTJ_START_DOCSTRING,
)
class GPTJForCausalLM(GPTJPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -933,8 +932,6 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
GPTJ_START_DOCSTRING,
)
class GPTJForSequenceClassification(GPTJPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1059,8 +1056,6 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
GPTJ_START_DOCSTRING,
)
class GPTJForQuestionAnswering(GPTJPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -1111,7 +1111,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
GPTSAN_JAPANESE_START_DOCSTRING,
)
class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: GPTSanJapaneseConfig):

View File

@ -714,7 +714,6 @@ class GraphormerPreTrainedModel(PreTrainedModel):
config_class = GraphormerConfig
base_model_prefix = "graphormer"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
main_input_name_nodes = "input_nodes"
main_input_name_edges = "input_edges"

View File

@ -450,7 +450,9 @@ class GroupViTTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -767,7 +769,6 @@ class GroupViTPreTrainedModel(PreTrainedModel):
config_class = GroupViTConfig
base_model_prefix = "groupvit"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -869,7 +869,6 @@ class HubertPreTrainedModel(PreTrainedModel):
base_model_prefix = "hubert"
main_input_name = "input_values"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""

View File

@ -80,7 +80,9 @@ class IBertEmbeddings(nn.Module):
)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# End copy
@ -740,8 +742,6 @@ class IBertModel(IBertPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -854,8 +854,6 @@ class IBertModel(IBertPreTrainedModel):
@add_start_docstrings("""I-BERT Model with a `language modeling` head on top.""", IBERT_START_DOCSTRING)
class IBertForMaskedLM(IBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.bias", "lm_head.decoder.weight"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]
def __init__(self, config):
@ -969,8 +967,6 @@ class IBertLMHead(nn.Module):
IBERT_START_DOCSTRING,
)
class IBertForSequenceClassification(IBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1064,8 +1060,6 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
IBERT_START_DOCSTRING,
)
class IBertForMultipleChoice(IBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
@ -1156,9 +1150,6 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
IBERT_START_DOCSTRING,
)
class IBertForTokenClassification(IBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1256,9 +1247,6 @@ class IBertClassificationHead(nn.Module):
IBERT_START_DOCSTRING,
)
class IBertForQuestionAnswering(IBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -183,8 +183,9 @@ class ImageGPTAttention(nn.Module):
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e4))
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
@ -613,8 +614,6 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
IMAGEGPT_START_DOCSTRING,
)
class ImageGPTModel(ImageGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
def __init__(self, config: ImageGPTConfig):
super().__init__(config)
@ -893,7 +892,6 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
IMAGEGPT_START_DOCSTRING,
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: ImageGPTConfig):
@ -1085,8 +1083,6 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
IMAGEGPT_START_DOCSTRING,
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
def __init__(self, config: ImageGPTConfig):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -602,7 +602,6 @@ Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/
class JukeboxVQVAE(PreTrainedModel):
config_class = JukeboxVQVAEConfig
base_model_prefix = "vqvae"
_keys_to_ignore_on_load_unexpected = [r"priors"]
def _init_weights(self, module):
if isinstance(module, nn.Embedding): # embed_tokens
@ -1792,7 +1791,6 @@ class JukeboxPrior(PreTrainedModel):
"""
config_class = JukeboxPriorConfig
_keys_to_ignore_on_load_unexpected = ["vqvae"]
def _init_weights(self, module):
init_scale = self.config.init_scale
@ -1832,7 +1830,6 @@ class JukeboxPrior(PreTrainedModel):
self.level = level if level is not None else config.level
self.base_model_prefix = f"priors.{self.level}"
self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level]
self.n_ctx = config.n_ctx

View File

@ -68,7 +68,9 @@ class LayoutLMEmbeddings(nn.Module):
self.LayerNorm = LayoutLMLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -619,7 +621,6 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
base_model_prefix = "layoutlm"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -857,11 +858,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"cls.predictions.decoder.bias",
"cls.predictions.decoder.weight",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):

View File

@ -77,7 +77,9 @@ class LayoutLMv2Embeddings(nn.Module):
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def _calc_spatial_position_embeddings(self, bbox):
try:
@ -506,7 +508,6 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel):
config_class = LayoutLMv2Config
pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST
base_model_prefix = "layoutlmv2"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -567,8 +568,11 @@ class LayoutLMv2VisualBackbone(nn.Module):
self.register_buffer(
"pixel_mean",
torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1),
persistent=False,
)
self.register_buffer(
"pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1), persistent=False
)
self.register_buffer("pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1))
self.out_feature_key = "p2"
if torch.are_deterministic_algorithms_enabled():
logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`")

View File

@ -245,7 +245,9 @@ class LayoutLMv3TextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
@ -750,8 +752,6 @@ class LayoutLMv3Output(nn.Module):
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.config = config
@ -1038,9 +1038,6 @@ class LayoutLMv3ClassificationHead(nn.Module):
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1153,9 +1150,6 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1286,8 +1280,6 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
LAYOUTLMV3_START_DOCSTRING,
)
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -2209,7 +2209,6 @@ class LEDDecoder(LEDPreTrainedModel):
LED_START_DOCSTRING,
)
class LEDModel(LEDPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: LEDConfig):
@ -2335,14 +2334,7 @@ class LEDModel(LEDPreTrainedModel):
)
class LEDForConditionalGeneration(LEDPreTrainedModel):
base_model_prefix = "led"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: LEDConfig):
@ -2530,7 +2522,6 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
LED_START_DOCSTRING,
)
class LEDForSequenceClassification(LEDPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config: LEDConfig, **kwargs):
@ -2667,7 +2658,6 @@ class LEDForSequenceClassification(LEDPreTrainedModel):
LED_START_DOCSTRING,
)
class LEDForQuestionAnswering(LEDPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
def __init__(self, config):

View File

@ -195,7 +195,9 @@ class LevitAttention(nn.Module):
self.attention_bias_cache = {}
self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points))
self.register_buffer(
"attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points), persistent=False
)
@torch.no_grad()
def train(self, mode=True):
@ -271,7 +273,9 @@ class LevitAttentionSubsample(nn.Module):
indices.append(attention_offsets[offset])
self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points))
self.register_buffer(
"attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points), persistent=False
)
@torch.no_grad()
def train(self, mode=True):

View File

@ -59,7 +59,9 @@ class LiltTextEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# End copy
@ -610,15 +612,6 @@ class LiltPreTrainedModel(PreTrainedModel):
if isinstance(module, LiltEncoder):
module.gradient_checkpointing = value
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
LILT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
@ -697,8 +690,6 @@ LILT_INPUTS_DOCSTRING = r"""
LILT_START_DOCSTRING,
)
class LiltModel(LiltPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -847,8 +838,6 @@ class LiltModel(LiltPreTrainedModel):
LILT_START_DOCSTRING,
)
class LiltForSequenceClassification(LiltPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Lilt, roberta->lilt
def __init__(self, config):
super().__init__(config)
@ -967,9 +956,6 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
LILT_START_DOCSTRING,
)
class LiltForTokenClassification(LiltPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Lilt, roberta->lilt
def __init__(self, config):
super().__init__(config)
@ -1096,9 +1082,6 @@ class LiltClassificationHead(nn.Module):
LILT_START_DOCSTRING,
)
class LiltForQuestionAnswering(LiltPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Lilt, roberta->lilt
def __init__(self, config):
super().__init__(config)
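
Note: update_keys_to_ignore existed only to stop tied LM-head weights from being listed as missing or skipped on save. With the loader now grouping shared tensors itself (modeling_utils builds the groups from tensor storage), the helper can be dropped here and in Mega below. A simplified, approximate version of that grouping, using data_ptr() as a stand-in for id_tensor_storage:

import collections
import torch
from torch import nn

def find_tied_groups(module: nn.Module):
    # Group state-dict entries that alias the same underlying storage.
    ptrs = collections.defaultdict(list)
    for name, tensor in module.state_dict().items():
        ptrs[tensor.data_ptr()].append(name)
    return [names for names in ptrs.values() if len(names) > 1]

embed = nn.Embedding(10, 4)
head = nn.Linear(4, 10, bias=False)
head.weight = embed.weight  # tie the projection to the embedding
model = nn.ModuleDict({"embed": embed, "head": head})
print(find_tied_groups(model))  # [['embed.weight', 'head.weight']]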

View File

@ -344,7 +344,6 @@ class LlamaPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
def _init_weights(self, module):
std = self.config.initializer_range
@ -784,8 +783,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
LLAMA_START_DOCSTRING,
)
class LlamaForSequenceClassification(LlamaPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -1421,7 +1421,6 @@ class LongformerPreTrainedModel(PreTrainedModel):
config_class = LongformerConfig
base_model_prefix = "longformer"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
_no_split_modules = ["LongformerSelfAttention"]
def _init_weights(self, module):
@ -1770,8 +1769,6 @@ class LongformerModel(LongformerPreTrainedModel):
@add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING)
class LongformerForMaskedLM(LongformerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.decoder"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder"]
def __init__(self, config):
@ -1886,8 +1883,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
LONGFORMER_START_DOCSTRING,
)
class LongformerForSequenceClassification(LongformerPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2015,8 +2010,6 @@ class LongformerClassificationHead(nn.Module):
LONGFORMER_START_DOCSTRING,
)
class LongformerForQuestionAnswering(LongformerPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2154,8 +2147,6 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
LONGFORMER_START_DOCSTRING,
)
class LongformerForTokenClassification(LongformerPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -1763,10 +1763,6 @@ num_heads)`.
LONGT5_START_DOCSTRING,
)
class LongT5Model(LongT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
@ -1917,11 +1913,6 @@ class LongT5Model(LongT5PreTrainedModel):
@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
@ -2160,7 +2151,6 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
LONGT5_START_DOCSTRING,
)
class LongT5EncoderModel(LongT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight"]
def __init__(self, config: LongT5Config):

View File

@ -1022,8 +1022,6 @@ LUKE_INPUTS_DOCSTRING = r"""
LUKE_START_DOCSTRING,
)
class LukeModel(LukePreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config: LukeConfig, add_pooling_layer: bool = True):
super().__init__(config)
self.config = config
@ -1278,17 +1276,6 @@ class LukeLMHead(nn.Module):
LUKE_START_DOCSTRING,
)
class LukeForMaskedLM(LukePreTrainedModel):
_keys_to_ignore_on_save = [
r"lm_head.decoder.weight",
r"lm_head.decoder.bias",
r"entity_predictions.decoder.weight",
]
_keys_to_ignore_on_load_missing = [
r"position_ids",
r"lm_head.decoder.weight",
r"lm_head.decoder.bias",
r"entity_predictions.decoder.weight",
]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"]
def __init__(self, config):

View File

@ -1018,7 +1018,6 @@ class LxmertModel(LxmertPreTrainedModel):
LXMERT_START_DOCSTRING,
)
class LxmertForPreTraining(LxmertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight"]
def __init__(self, config):

View File

@ -131,7 +131,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
# in forward put the weights on the correct dtype and device of the param
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
@ -1137,14 +1137,6 @@ class M2M100Decoder(M2M100PreTrainedModel):
M2M_100_START_DOCSTRING,
)
class M2M100Model(M2M100PreTrainedModel):
_keys_to_ignore_on_load_missing = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_positions.weights",
"encoder.embed_positions.bias",
"decoder.embed_positions.weights",
"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: M2M100Config):
@ -1258,17 +1250,6 @@ class M2M100Model(M2M100PreTrainedModel):
)
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"encoder.embed_positions.weights",
r"encoder.embed_positions.bias",
r"decoder.embed_positions.weights",
r"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: M2M100Config):

View File

@ -1103,7 +1103,6 @@ class MarianDecoder(MarianPreTrainedModel):
"The bare Marian Model outputting raw hidden-states without any specific head on top.", MARIAN_START_DOCSTRING
)
class MarianModel(MarianPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MarianConfig):
@ -1292,13 +1291,9 @@ class MarianModel(MarianPreTrainedModel):
class MarianMTModel(MarianPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"final_logits_bias",
"encoder.embed_positions.weight",
"decoder.embed_positions.weight",
]
_keys_to_ignore_on_save = ["model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
@ -1561,7 +1556,6 @@ class MarianDecoderWrapper(MarianPreTrainedModel):
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Marian, facebook/bart-base->Helsinki-NLP/opus-mt-fr-en
class MarianForCausalLM(MarianPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -143,7 +143,9 @@ class MarkupLMEmbeddings(nn.Module):
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
@ -713,7 +715,6 @@ class MarkupLMPreTrainedModel(PreTrainedModel):
config_class = MarkupLMConfig
pretrained_model_archive_map = MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST
base_model_prefix = "markuplm"
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with Bert->MarkupLM
def _init_weights(self, module):
@ -971,8 +972,6 @@ class MarkupLMModel(MarkupLMPreTrainedModel):
MARKUPLM_START_DOCSTRING,
)
class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with bert->markuplm, Bert->MarkupLM
def __init__(self, config):
super().__init__(config)

View File

@ -1156,7 +1156,6 @@ class MBartDecoder(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartModel(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MBartConfig):
@ -1277,14 +1276,7 @@ class MBartModel(MBartPreTrainedModel):
)
class MBartForConditionalGeneration(MBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: MBartConfig):
@ -1452,7 +1444,6 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartForSequenceClassification(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
def __init__(self, config: MBartConfig, **kwargs):
@ -1582,7 +1573,6 @@ class MBartForSequenceClassification(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartForQuestionAnswering(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
def __init__(self, config):
@ -1716,7 +1706,6 @@ class MBartDecoderWrapper(MBartPreTrainedModel):
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
class MBartForCausalLM(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -149,7 +149,9 @@ class MCTCTEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
@ -443,7 +445,6 @@ class MCTCTPreTrainedModel(PreTrainedModel):
config_class = MCTCTConfig
base_model_prefix = "mctct"
main_input_name = "input_features"
_keys_to_ignore_on_load_missing = ["position_ids"]
supports_gradient_checkpointing = True
def _init_weights(self, module):

View File

@ -1387,15 +1387,6 @@ class MegaPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
MEGA_START_DOCSTRING = r"""
@ -1474,8 +1465,6 @@ class MegaModel(MegaPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = []
def __init__(self, config: MegaConfig, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -1656,9 +1645,6 @@ class MegaModel(MegaPreTrainedModel):
"""MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
)
class MegaForCausalLM(MegaPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.weight", r"lm_head.bias"]
_keys_to_ignore_on_load_missing = [r"lm_head.weight", r"lm_head.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: MegaConfig):
@ -1678,9 +1664,6 @@ class MegaForCausalLM(MegaPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.weight"])
# Initialize weights and apply final processing
self.post_init()
@ -1821,9 +1804,6 @@ class MegaForCausalLM(MegaPreTrainedModel):
@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
class MegaForMaskedLM(MegaPreTrainedModel):
_keys_to_ignore_on_save = [r"mlm_head.weight", r"mlm_head.bias"]
_keys_to_ignore_on_load_missing = [r"mlm_head.weight", r"mlm_head.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["mlm_head.weight"]
def __init__(self, config: MegaConfig):
@ -1845,9 +1825,6 @@ class MegaForMaskedLM(MegaPreTrainedModel):
self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.dropout = nn.Dropout(config.dropout_prob)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["mlm_head.weight"])
# Initialize weights and apply final processing
self.post_init()
@ -1931,8 +1908,6 @@ class MegaForMaskedLM(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForSequenceClassification(MegaPreTrainedModel):
_keys_to_ignore_on_load_missing = []
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2024,8 +1999,6 @@ class MegaForSequenceClassification(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForMultipleChoice(MegaPreTrainedModel):
_keys_to_ignore_on_load_missing = []
def __init__(self, config):
super().__init__(config)
@ -2111,9 +2084,6 @@ class MegaForMultipleChoice(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForTokenClassification(MegaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = []
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2214,9 +2184,6 @@ class MegaClassificationHead(nn.Module):
MEGA_START_DOCSTRING,
)
class MegaForQuestionAnswering(MegaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = []
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -149,7 +149,9 @@ class MegatronBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
def forward(
@ -713,7 +715,6 @@ class MegatronBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_megatron_bert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1014,7 +1015,6 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config, add_binary_head=True):
@ -1121,8 +1121,6 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
@ -1267,8 +1265,6 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
@add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING)
class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
@ -1376,8 +1372,6 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"predictions"]
def __init__(self, config):
super().__init__(config)
@ -1672,8 +1666,6 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1752,8 +1744,6 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -191,7 +191,9 @@ class MobileBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -686,7 +688,6 @@ class MobileBertPreTrainedModel(PreTrainedModel):
pretrained_model_archive_map = MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST
load_tf_weights = load_tf_weights_in_mobilebert
base_model_prefix = "mobilebert"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -923,11 +924,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
MOBILEBERT_START_DOCSTRING,
)
class MobileBertForPreTraining(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"cls.predictions.decoder.weight",
"cls.predictions.decoder.bias",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -1036,12 +1032,6 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
@add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING)
class MobileBertForMaskedLM(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [
"cls.predictions.decoder.weight",
"cls.predictions.decoder.bias",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
@ -1350,8 +1340,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
)
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing
class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1553,8 +1541,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
)
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing
class MobileBertForTokenClassification(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
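Registering position_ids with persistent=False keeps the buffer out of state_dict(), which is why the matching _keys_to_ignore_on_load_missing entries disappear across these models. A minimal sketch of that behavior in plain PyTorch; the Toy module below is illustrative only, not part of this diff:

import torch
from torch import nn


class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        # Recreated in __init__, so there is nothing to save or load for it.
        self.register_buffer("position_ids", torch.arange(16).expand((1, -1)), persistent=False)
        # A persistent buffer, by contrast, is exported with the weights.
        self.register_buffer("counter", torch.zeros(1))


toy = Toy()
print("position_ids" in toy.state_dict())  # False: non-persistent buffers never reach the checkpoint
print("counter" in toy.state_dict())       # True

Since the key never appears in a freshly saved checkpoint, there is no missing key left to silence at load time.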

View File

@ -83,7 +83,9 @@ class MPNetEmbeddings(nn.Module):
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
if position_ids is None:
@ -479,8 +481,6 @@ MPNET_INPUTS_DOCSTRING = r"""
MPNET_START_DOCSTRING,
)
class MPNetModel(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -570,8 +570,6 @@ class MPNetModel(MPNetPreTrainedModel):
class MPNetForMaskedLM(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder"]
def __init__(self, config):
@ -679,8 +677,6 @@ class MPNetLMHead(nn.Module):
MPNET_START_DOCSTRING,
)
class MPNetForSequenceClassification(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
@ -773,8 +769,6 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel):
MPNET_START_DOCSTRING,
)
class MPNetForMultipleChoice(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
@ -863,9 +857,6 @@ class MPNetForMultipleChoice(MPNetPreTrainedModel):
MPNET_START_DOCSTRING,
)
class MPNetForTokenClassification(MPNetPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -962,9 +953,6 @@ class MPNetClassificationHead(nn.Module):
MPNET_START_DOCSTRING,
)
class MPNetForQuestionAnswering(MPNetPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)

View File

@ -1316,18 +1316,8 @@ class MT5Model(MT5PreTrainedModel):
```"""
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5
@ -1552,15 +1542,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel):
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
@ -1897,13 +1879,6 @@ class MT5EncoderModel(MT5PreTrainedModel):
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5EncoderModel.__init__ with T5->MT5
@ -2029,14 +2004,7 @@ class MT5EncoderModel(MT5PreTrainedModel):
MT5_START_DOCSTRING,
)
class MT5ForQuestionAnswering(MT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5
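The MT5 lists shrink because the shared embedding weights are tracked through _tied_weights_keys instead: once encoder.embed_tokens.weight and decoder.embed_tokens.weight are tied to the same tensor, a copy that is absent from a checkpoint is restored by tying rather than silenced by an ignore pattern. A rough sketch of why a tied key is never truly missing, using a hypothetical ToyEncoderDecoder rather than the real MT5 classes:

import torch
from torch import nn


class ToyEncoderDecoder(nn.Module):
    def __init__(self, vocab=32, dim=8):
        super().__init__()
        self.shared = nn.Embedding(vocab, dim)
        self.encoder_embed = nn.Embedding(vocab, dim)
        self.decoder_embed = nn.Embedding(vocab, dim)
        self.tie_weights()

    def tie_weights(self):
        # Point both towers at the shared table, mirroring tied encoder/decoder embeddings.
        self.encoder_embed.weight = self.shared.weight
        self.decoder_embed.weight = self.shared.weight


model = ToyEncoderDecoder()
checkpoint = {"shared.weight": torch.randn(32, 8)}  # the tied copies were never saved
missing, unexpected = model.load_state_dict(checkpoint, strict=False)
print(missing)  # ['encoder_embed.weight', 'decoder_embed.weight']: names only, the tensor was already loaded
assert model.decoder_embed.weight.data_ptr() == model.shared.weight.data_ptr()

Loading shared.weight updates every parameter tied to it, so the reported missing names carry no lost information.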

View File

@ -551,7 +551,6 @@ class MvpPreTrainedModel(PreTrainedModel):
config_class = MvpConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]
def _init_weights(self, module):
std = self.config.init_std
@ -1300,8 +1299,7 @@ class MvpDecoder(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpModel(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_keys_to_ignore_on_load_unexpected = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig):
@ -1438,7 +1436,6 @@ class MvpModel(MvpPreTrainedModel):
"The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING
)
class MvpForConditionalGeneration(MvpPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: MvpConfig):
@ -1611,8 +1608,6 @@ class MvpForConditionalGeneration(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpForSequenceClassification(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig, **kwargs):
@ -1740,8 +1735,6 @@ class MvpForSequenceClassification(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpForQuestionAnswering(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
@ -1873,7 +1866,6 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
class MvpForCausalLM(MvpPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -163,7 +163,7 @@ class NezhaRelativePositionsEncoding(nn.Module):
my_shape = list(final_mat.size())
my_shape.append(depth)
positions_encoding = positions_encoding.view(my_shape)
self.register_buffer("positions_encoding", positions_encoding)
self.register_buffer("positions_encoding", positions_encoding, persistent=False)
def forward(self, length):
return self.positions_encoding[:length, :length, :]
@ -735,7 +735,6 @@ class NezhaPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_nezha
base_model_prefix = "nezha"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"positions_encoding"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1037,7 +1036,6 @@ class NezhaModel(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForPreTraining(NezhaPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
@ -1140,8 +1138,6 @@ class NezhaForPreTraining(NezhaPreTrainedModel):
@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
class NezhaForMaskedLM(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
@ -1542,8 +1538,6 @@ class NezhaForMultipleChoice(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForTokenClassification(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1623,8 +1617,6 @@ class NezhaForTokenClassification(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForQuestionAnswering(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -183,7 +183,7 @@ class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
# in forward put the weights on the correct dtype and device of the param
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
@ -1500,14 +1500,6 @@ class NllbMoeDecoder(NllbMoePreTrainedModel):
NLLB_MOE_START_DOCSTRING,
)
class NllbMoeModel(NllbMoePreTrainedModel):
_keys_to_ignore_on_load_missing = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_positions.weights",
"encoder.embed_positions.bias",
"decoder.embed_positions.weights",
"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: NllbMoeConfig):
@ -1641,17 +1633,6 @@ class NllbMoeModel(NllbMoePreTrainedModel):
)
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"encoder.embed_positions.weights",
r"encoder.embed_positions.bias",
r"decoder.embed_positions.weights",
r"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: NllbMoeConfig):

View File

@ -64,7 +64,9 @@ class NystromformerEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and, being non-persistent, not exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
@ -458,7 +460,6 @@ class NystromformerPreTrainedModel(PreTrainedModel):
config_class = NystromformerConfig
base_model_prefix = "nystromformer"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -658,7 +659,6 @@ class NystromformerModel(NystromformerPreTrainedModel):
@add_start_docstrings("""Nyströmformer Model with a `language modeling` head on top.""", NYSTROMFORMER_START_DOCSTRING)
class NystromformerForMaskedLM(NystromformerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):

View File

@ -368,7 +368,6 @@ class OpenLlamaPreTrainedModel(PreTrainedModel):
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OpenLlamaDecoderLayer"]
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
def _init_weights(self, module):
std = self.config.initializer_range
@ -825,8 +824,6 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->OPEN_LLAMA,Llama->OpenLlama
class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -141,7 +141,9 @@ class Attention(nn.Module):
if n_state % config.n_head != 0:
raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}")
self.register_buffer(
"bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
"bias",
torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions),
persistent=False,
)
self.n_head = config.n_head
self.split_size = n_state
@ -274,7 +276,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
config_class = OpenAIGPTConfig
load_tf_weights = load_tf_weights_in_openai_gpt
base_model_prefix = "transformer"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights."""
@ -407,7 +408,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])
self.register_buffer("position_ids", torch.arange(config.n_positions))
self.register_buffer("position_ids", torch.arange(config.n_positions), persistent=False)
# Initialize weights and apply final processing
self.post_init()
@ -529,7 +530,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -621,7 +621,6 @@ input sequence).
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -399,7 +399,6 @@ class OPTPreTrainedModel(PreTrainedModel):
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OPTDecoderLayer"]
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
def _init_weights(self, module):
std = self.config.init_std
@ -817,7 +816,6 @@ class OPTModel(OPTPreTrainedModel):
class OPTForCausalLM(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
@ -1025,8 +1023,6 @@ class OPTForCausalLM(OPTPreTrainedModel):
OPT_START_DOCSTRING,
)
class OPTForSequenceClassification(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
def __init__(self, config: OPTConfig):
super().__init__(config)
self.num_labels = config.num_labels
@ -1147,8 +1143,6 @@ class OPTForSequenceClassification(OPTPreTrainedModel):
OPT_START_DOCSTRING,
)
class OPTForQuestionAnswering(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
def __init__(self, config: OPTConfig):
super().__init__(config)
self.model = OPTModel(config)

View File

@ -304,7 +304,7 @@ class OwlViTVisionEmbeddings(nn.Module):
self.num_patches = (config.image_size // config.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -325,7 +325,9 @@ class OwlViTTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# position_ids (1, len position emb) is contiguous in memory and, being non-persistent, not exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -530,7 +532,6 @@ class OwlViTPreTrainedModel(PreTrainedModel):
config_class = OwlViTConfig
base_model_prefix = "owlvit"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
_no_split_modules = ["OwlViTEncoderLayer"]
def _init_weights(self, module):

View File

@ -1156,7 +1156,6 @@ class PegasusDecoder(PegasusPreTrainedModel):
PEGASUS_START_DOCSTRING,
)
class PegasusModel(PegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: PegasusConfig):
@ -1309,15 +1308,7 @@ class PegasusModel(PegasusPreTrainedModel):
)
class PegasusForConditionalGeneration(PegasusPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: PegasusConfig):
@ -1518,7 +1509,6 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):
class PegasusForCausalLM(PegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -1391,7 +1391,6 @@ class PegasusXDecoder(PegasusXPreTrainedModel):
PEGASUS_X_START_DOCSTRING,
)
class PegasusXModel(PegasusXPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: PegasusXConfig):
@ -1536,14 +1535,6 @@ class PegasusXModel(PegasusXPreTrainedModel):
@add_start_docstrings("The PEGASUS-X for conditional generation (e.g. summarization).", PEGASUS_X_START_DOCSTRING)
class PegasusXForConditionalGeneration(PegasusXPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: PegasusXConfig):

View File

@ -1597,14 +1597,6 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel):
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel):
config_class = Pix2StructConfig
main_input_name = "flattened_patches"
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.layer.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["decoder.lm_head.weight"]
def __init__(self, config: Pix2StructConfig):

View File

@ -1132,7 +1132,6 @@ class PLBartDecoder(PLBartPreTrainedModel):
PLBART_START_DOCSTRING,
)
class PLBartModel(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: PLBartConfig):
@ -1251,14 +1250,7 @@ class PLBartModel(PLBartPreTrainedModel):
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: PLBartConfig):
@ -1423,7 +1415,6 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel):
PLBART_START_DOCSTRING,
)
class PLBartForSequenceClassification(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: PLBartConfig, **kwargs):
@ -1562,7 +1553,6 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel):
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->PLBart, facebook/bart-base->uclanlp/plbart-base
class PLBartForCausalLM(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):

View File

@ -1744,7 +1744,6 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetModel(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"]
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
def __init__(self, config: ProphetNetConfig):
@ -1874,11 +1873,6 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"decoder.word_embeddings.weight",
"encoder.word_embeddings.weight",
"lm_head.weight",
]
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
def __init__(self, config: ProphetNetConfig):
@ -2091,7 +2085,6 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: ProphetNetConfig):

View File

@ -164,7 +164,9 @@ class QDQBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and, being non-persistent, not exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -738,7 +740,6 @@ class QDQBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_qdqbert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1012,8 +1013,6 @@ class QDQBertModel(QDQBertPreTrainedModel):
"""QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING
)
class QDQBertLMHeadModel(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
def __init__(self, config):
@ -1166,8 +1165,6 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):
@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING)
class QDQBertForMaskedLM(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
def __init__(self, config):
@ -1570,8 +1567,6 @@ class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
QDQBERT_START_DOCSTRING,
)
class QDQBertForTokenClassification(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1650,8 +1645,6 @@ class QDQBertForTokenClassification(QDQBertPreTrainedModel):
QDQBERT_START_DOCSTRING,
)
class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -231,7 +231,6 @@ class RagPreTrainedModel(PreTrainedModel):
"""
config_class = RagConfig
base_model_prefix = "rag"
_keys_to_ignore_on_load_missing = [r"position_ids"]
@classmethod
def from_pretrained(cls, *args, **kwargs):

View File

@ -178,7 +178,9 @@ class RealmEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and, being non-persistent, not exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -968,7 +970,6 @@ class RealmPreTrainedModel(PreTrainedModel):
config_class = RealmConfig
load_tf_weights = load_tf_weights_in_realm
base_model_prefix = "realm"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1147,7 +1148,6 @@ class RealmBertModel(RealmPreTrainedModel):
REALM_START_DOCSTRING,
)
class RealmEmbedder(RealmPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.bias"]
def __init__(self, config):
@ -1378,7 +1378,6 @@ class RealmScorer(RealmPreTrainedModel):
REALM_START_DOCSTRING,
)
class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
@ -1529,8 +1528,6 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
@add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING)
class RealmReader(RealmPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler", "cls"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

View File

@ -352,10 +352,10 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False)
# save mask value here. Need fp32 and fp16 mask values
self.register_buffer("self_mask_value_float16", torch.tensor(-1e3))
self.register_buffer("self_mask_value_float32", torch.tensor(-1e5))
self.register_buffer("mask_value_float16", torch.tensor(-1e4))
self.register_buffer("mask_value_float32", torch.tensor(-1e9))
self.register_buffer("self_mask_value_float16", torch.tensor(-1e3), persistent=False)
self.register_buffer("self_mask_value_float32", torch.tensor(-1e5), persistent=False)
self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False)
self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False)
def forward(
self,
@ -1049,8 +1049,8 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
self.dropout = config.local_attention_probs_dropout_prob
# save mask value here
self.register_buffer("mask_value_float16", torch.tensor(-1e4))
self.register_buffer("mask_value_float32", torch.tensor(-1e9))
self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False)
self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False)
def forward(
self,
@ -2185,7 +2185,6 @@ class ReformerModel(ReformerPreTrainedModel):
@add_start_docstrings("""Reformer Model with a `language modeling` head on top.""", REFORMER_START_DOCSTRING)
class ReformerModelWithLMHead(ReformerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.decoder.bias"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):

View File

@ -158,7 +158,9 @@ class RemBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and, being non-persistent, not exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
@ -654,7 +656,6 @@ class RemBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_rembert
base_model_prefix = "rembert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
@ -1016,7 +1017,6 @@ class RemBertForMaskedLM(RemBertPreTrainedModel):
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
)
class RemBertForCausalLM(RemBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.weight"]
def __init__(self, config):

Some files were not shown because too many files have changed in this diff.