Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

Clean load keys (#24505)

* Preliminary work on some models
* Fix test load missing and make sure nonpersistent buffers are tested
* Always ignore nonpersistent buffers if in state_dict
* Treat models
* More models
* Treat remaining models
* Fix quality
* Fix tests
* Remove draft
* This test is not needed anymore
* Fix copies
* Fix last test
* Newly added models
* Fix last tests
* Address review comments

This commit is contained in:
parent 53194991e9
commit 8e5d1619b3
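The change is built around PyTorch's persistent flag on registered buffers: a buffer registered with persistent=False stays on the module but is excluded from state_dict(), so the loading code no longer needs per-model ignore lists for it. A minimal sketch, not taken from the diff (the ToyEmbeddings module and its size are made up), of what the flag does:

import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    # Hypothetical module, only to illustrate the persistent flag.
    def __init__(self, max_position_embeddings: int = 512):
        super().__init__()
        # persistent=True (the default) would serialize this tensor into every checkpoint.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )


module = ToyEmbeddings()
print("position_ids" in dict(module.named_buffers()))  # True: still usable in forward()
print("position_ids" in module.state_dict())            # False: never saved, so never "missing"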
@@ -320,8 +320,9 @@ def shard_checkpoint(

         weight_size = weight.numel() * dtype_byte_size(weight.dtype)

-        # If this weight is going to tip up over the maximal size, we split.
-        if last_block_size + weight_size > max_shard_size:
+        # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
+        # weight in the current shard.
+        if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
             sharded_state_dicts.append({})
             last_block_size = 0

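The added len(sharded_state_dicts[-1]) > 0 condition keeps a weight larger than max_shard_size from opening a new, empty shard before it is stored. A simplified re-implementation of the greedy loop (a standalone sketch, not the real shard_checkpoint signature) shows the effect:

import torch


def naive_shard(state_dict, max_shard_size):
    # Simplified re-implementation of the greedy sharding loop, for illustration only.
    shards, last_block_size = [{}], 0
    for key, weight in state_dict.items():
        weight_size = weight.numel() * weight.element_size()
        # Without "and len(shards[-1]) > 0", a tensor bigger than max_shard_size
        # would first append a new, empty shard and only then be placed into it.
        if last_block_size + weight_size > max_shard_size and len(shards[-1]) > 0:
            shards.append({})
            last_block_size = 0
        shards[-1][key] = weight
        last_block_size += weight_size
    return shards


big = {"w": torch.zeros(1024), "v": torch.zeros(8)}
print([list(s) for s in naive_shard(big, max_shard_size=1024)])  # [['w'], ['v']], no empty shard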
@@ -3044,15 +3045,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):
             expected_keys = [".".join([prefix, s]) for s in expected_keys]

         missing_keys = list(set(expected_keys) - set(loaded_keys))
-        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
+        unexpected_keys = set(loaded_keys) - set(expected_keys)
+        # Remove nonpersistent buffers from unexpected keys: they are not in the state dict but will be in the model
+        # buffers
+        model_buffers = {n for n, _ in model.named_buffers()}
+        if remove_prefix_from_model:
+            model_buffers = {key[len(_prefix) :] if key.startswith(_prefix) else key for key in model_buffers}
+        elif add_prefix_to_model:
+            model_buffers = {".".join([prefix, key]) for key in model_buffers}
+        unexpected_keys = list(unexpected_keys - model_buffers)

-        if is_accelerate_available():
-            model.tie_weights()
-            tied_params = find_tied_parameters(model)
-        else:
-            tied_params = []
+        model.tie_weights()
+        ptrs = collections.defaultdict(list)
+        for name, tensor in model.state_dict().items():
+            id_tensor = id_tensor_storage(tensor) if tensor.device != torch.device("meta") else id(tensor)
+            ptrs[id_tensor].append(name)
+
+        # These are all the pointers of shared tensors.
+        tied_params = [names for _, names in ptrs.items() if len(names) > 1]

         for group in tied_params:
             if remove_prefix_from_model:
                 group = [key[len(_prefix) :] if key.startswith(_prefix) else key for key in group]
             elif add_prefix_to_model:
                 group = [".".join([prefix, key]) for key in group]
             missing_in_group = [k for k in missing_keys if k in group]
             if len(missing_in_group) > 0 and len(missing_in_group) < len(group):
                 missing_keys = [k for k in missing_keys if k not in missing_in_group]
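Instead of depending on accelerate's find_tied_parameters, the loader now groups every state_dict entry by the storage it points to; names that share storage are the tied weights. A self-contained sketch of the same idea, using a toy model and tensor.data_ptr() as a simplified stand-in for transformers' id_tensor_storage helper:

import collections

import torch
from torch import nn


class TiedLM(nn.Module):
    # Toy model with a decoder tied to the input embedding, for illustration only.
    def __init__(self, vocab_size=10, dim=4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.decoder = nn.Linear(dim, vocab_size, bias=False)
        self.decoder.weight = self.embed.weight  # tie the weights


model = TiedLM()
ptrs = collections.defaultdict(list)
for name, tensor in model.state_dict().items():
    # Entries that point at the same memory belong to one tied ("shared") group.
    ptrs[tensor.data_ptr()].append(name)

tied_groups = [names for names in ptrs.values() if len(names) > 1]
print(tied_groups)  # [['embed.weight', 'decoder.weight']]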
@@ -208,7 +208,9 @@ class AlbertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
@@ -507,7 +509,6 @@ class AlbertPreTrainedModel(PreTrainedModel):
     config_class = AlbertConfig
     load_tf_weights = load_tf_weights_in_albert
     base_model_prefix = "albert"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights."""
@@ -760,11 +761,6 @@ class AlbertModel(AlbertPreTrainedModel):
 )
 class AlbertForPreTraining(AlbertPreTrainedModel):
     _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
-    _keys_to_ignore_on_load_missing = [
-        "predictions.decoder.weight",
-        "predictions.decoder.bias",
-        "embeddings.position_ids",
-    ]

     def __init__(self, config: AlbertConfig):
         super().__init__(config)
@@ -912,13 +908,7 @@ class AlbertSOPHead(nn.Module):
     ALBERT_START_DOCSTRING,
 )
 class AlbertForMaskedLM(AlbertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
-    _keys_to_ignore_on_load_missing = [
-        "predictions.decoder.weight",
-        "predictions.decoder.bias",
-        "embeddings.position_ids",
-    ]

     def __init__(self, config):
         super().__init__(config)
@@ -1133,8 +1123,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
     ALBERT_START_DOCSTRING,
 )
 class AlbertForTokenClassification(AlbertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config: AlbertConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1218,8 +1206,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
     ALBERT_START_DOCSTRING,
 )
 class AlbertForQuestionAnswering(AlbertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config: AlbertConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
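The model files below all repeat the same two-step cleanup: position-related buffers become non-persistent and the hard-coded _keys_to_ignore_on_load_missing entries for them disappear. A small sketch with toy modules (not actual transformers classes) of why the buffer no longer has to be listed anywhere:

import torch
from torch import nn


def make_embeddings(persistent):
    # Toy stand-in for an *Embeddings module; only the buffer flag differs.
    m = nn.Module()
    m.word = nn.Embedding(10, 4)
    m.register_buffer("position_ids", torch.arange(8).expand((1, -1)), persistent=persistent)
    return m


old_checkpoint = make_embeddings(persistent=True).state_dict()   # old checkpoints contain position_ids
new_model = make_embeddings(persistent=False)

result = new_model.load_state_dict(old_checkpoint, strict=False)
print(result.missing_keys)     # []: a non-persistent buffer is never expected from the checkpoint
print(result.unexpected_keys)  # ['position_ids']: still present in old checkpoints; the new
                               # model_buffers filtering in _load_pretrained_model drops these names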
@@ -687,7 +687,9 @@ class AlignTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -1176,7 +1178,6 @@ class AlignPreTrainedModel(PreTrainedModel):
     config_class = AlignConfig
     base_model_prefix = "align"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -216,7 +216,9 @@ class AltRobertaEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -1016,7 +1018,7 @@ class AltCLIPVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
@@ -1038,7 +1040,6 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
     config_class = AltCLIPConfig
     base_model_prefix = "altclip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -506,7 +506,7 @@ class BartPretrainedModel(PreTrainedModel):
     config_class = BartConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]
+    _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
     _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"

@@ -1170,7 +1170,6 @@ class BartDecoder(BartPretrainedModel):
     BART_START_DOCSTRING,
 )
 class BartModel(BartPretrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: BartConfig):
@@ -1300,12 +1299,7 @@ class BartModel(BartPretrainedModel):
 class BartForConditionalGeneration(BartPretrainedModel):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
-    _keys_to_ignore_on_load_missing = [
-        "final_logits_bias",
-        "lm_head.weight",
-        "encoder.embed_tokens.weight",
-        "decoder.embed_tokens.weight",
-    ]
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

     def __init__(self, config: BartConfig):
         super().__init__(config)
@@ -1478,7 +1472,6 @@ class BartForConditionalGeneration(BartPretrainedModel):
     BART_START_DOCSTRING,
 )
 class BartForSequenceClassification(BartPretrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: BartConfig, **kwargs):
@@ -1609,7 +1602,6 @@ class BartForSequenceClassification(BartPretrainedModel):
     BART_START_DOCSTRING,
 )
 class BartForQuestionAnswering(BartPretrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config):
@@ -1748,7 +1740,6 @@ class BartDecoderWrapper(BartPretrainedModel):
     BART_START_DOCSTRING,
 )
 class BartForCausalLM(BartPretrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
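For the seq2seq families (Bart above, BigBirdPegasus, Blenderbot and BlenderbotSmall below), _tied_weights_keys now carries the embedding and lm_head tying, and _keys_to_ignore_on_load_missing shrinks to final_logits_bias, the one key that may genuinely be absent from older checkpoints. A hedged sketch with a toy class (not the real BartForConditionalGeneration) of why that missing key is harmless:

import torch
from torch import nn


class ToyConditionalGeneration(nn.Module):
    # Toy analogue of the Bart-style head: the bias is a registered (persistent) buffer
    # that defaults to zeros, so a checkpoint without it loads to an equivalent model.
    def __init__(self, vocab_size=10, dim=4):
        super().__init__()
        self.lm_head = nn.Linear(dim, vocab_size, bias=False)
        self.register_buffer("final_logits_bias", torch.zeros((1, vocab_size)))

    def forward(self, hidden):
        return self.lm_head(hidden) + self.final_logits_bias


old_checkpoint = {"lm_head.weight": torch.randn(10, 4)}  # no final_logits_bias saved
model = ToyConditionalGeneration()
result = model.load_state_dict(old_checkpoint, strict=False)
print(result.missing_keys)  # ['final_logits_bias'] -> safe to ignore, it simply stays all zeros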
@@ -459,7 +459,7 @@ class BeitRelativePositionBias(nn.Module):
         relative_position_index[0:, 0] = self.num_relative_distance - 2
         relative_position_index[0, 0] = self.num_relative_distance - 1

-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

     def forward(self) -> torch.Tensor:
         relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
@@ -192,7 +192,9 @@ class BertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -743,7 +745,6 @@ class BertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_bert
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1053,7 +1054,6 @@ class BertModel(BertPreTrainedModel):
     BERT_START_DOCSTRING,
 )
 class BertForPreTraining(BertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
     _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
@@ -1160,8 +1160,6 @@ class BertForPreTraining(BertPreTrainedModel):
     """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
 )
 class BertLMHeadModel(BertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
     _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
@@ -1301,8 +1299,6 @@ class BertLMHeadModel(BertPreTrainedModel):

 @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"]
     _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
@@ -1715,8 +1711,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
     BERT_START_DOCSTRING,
 )
 class BertForTokenClassification(BertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1800,8 +1794,6 @@ class BertForTokenClassification(BertPreTrainedModel):
     BERT_START_DOCSTRING,
 )
 class BertForQuestionAnswering(BertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -556,7 +556,9 @@ class BertGenerationEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

     def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0):
         if input_ids is not None:
@@ -588,7 +590,6 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
     config_class = BertGenerationConfig
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -860,7 +861,6 @@ class BertGenerationOnlyLMHead(nn.Module):
     BERT_GENERATION_START_DOCSTRING,
 )
 class BertGenerationDecoder(BertGenerationPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.decoder.weight", "lm_head.decoder.bias", "embeddings.position_ids"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -257,7 +257,9 @@ class BigBirdEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -1765,7 +1767,6 @@ class BigBirdPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_big_bird
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -2261,7 +2262,6 @@ class BigBirdModel(BigBirdPreTrainedModel):


 class BigBirdForPreTraining(BigBirdPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
@@ -2368,7 +2368,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):

 @add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING)
 class BigBirdForMaskedLM(BigBirdPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
@@ -2513,12 +2512,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
     """BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING
 )
 class BigBirdForCausalLM(BigBirdPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"position_ids",
-        r"predictions.decoder.bias",
-        "cls.predictions.decoder.weight",
-        "cls.predictions.decoder.bias",
-    ]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
@@ -2358,7 +2358,6 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
     BIGBIRD_PEGASUS_START_DOCSTRING,
 )
 class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: BigBirdPegasusConfig):
@@ -2491,12 +2490,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
 class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
-    _keys_to_ignore_on_load_missing = [
-        "final_logits_bias",
-        "lm_head.weight",
-        "encoder.embed_tokens.weight",
-        "decoder.embed_tokens.weight",
-    ]
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

     def __init__(self, config: BigBirdPegasusConfig):
         super().__init__(config)
@@ -2669,7 +2663,6 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
     BIGBIRD_PEGASUS_START_DOCSTRING,
 )
 class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: BigBirdPegasusConfig, **kwargs):
@@ -2799,7 +2792,6 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
     BIGBIRD_PEGASUS_START_DOCSTRING,
 )
 class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config):
@@ -2932,7 +2924,6 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):


 class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -646,7 +646,6 @@ class BioGptModel(BioGptPreTrainedModel):
     """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
 )
 class BioGptForCausalLM(BioGptPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["output_projection.weight"]
     _tied_weights_keys = ["output_projection.weight"]

     def __init__(self, config):
@@ -1102,7 +1102,6 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
     BLENDERBOT_START_DOCSTRING,
 )
 class BlenderbotModel(BlenderbotPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
     _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

     def __init__(self, config: BlenderbotConfig):
@@ -1244,14 +1243,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
 )
 class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [
-        r"final_logits_bias",
-        r"encoder.version",
-        r"decoder.version",
-        r"lm_head.weight",
-        "decoder.embed_tokens.weight",
-        "encoder.embed_tokens.weight",
-    ]
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]

     def __init__(self, config: BlenderbotConfig):
@@ -1441,7 +1433,6 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):

 # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill
 class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -1096,7 +1096,6 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
     BLENDERBOT_SMALL_START_DOCSTRING,
 )
 class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
     _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

     def __init__(self, config: BlenderbotSmallConfig):
@@ -1226,14 +1225,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
 )
 class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [
-        r"final_logits_bias",
-        r"encoder.version",
-        r"decoder.version",
-        r"lm_head.weight",
-        "encoder.embed_tokens.weight",
-        "decoder.embed_tokens.weight",
-    ]
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]

     def __init__(self, config: BlenderbotSmallConfig):
@@ -1408,7 +1400,6 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):

 # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M
 class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -255,7 +255,9 @@ class BlipTextEmbeddings(nn.Module):
         self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

     def forward(
         self,
@@ -419,7 +421,6 @@ class BlipPreTrainedModel(PreTrainedModel):
     config_class = BlipConfig
     base_model_prefix = "blip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -927,7 +928,6 @@ class BlipModel(BlipPreTrainedModel):
 )
 class BlipForConditionalGeneration(BlipPreTrainedModel):
     config_class = BlipConfig
-    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
     _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
     main_input_name = "pixel_values"

@@ -1100,7 +1100,6 @@ class BlipForConditionalGeneration(BlipPreTrainedModel):
 )
 class BlipForQuestionAnswering(BlipPreTrainedModel):
     config_class = BlipConfig
-    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
     _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]

     def __init__(self, config: BlipConfig):
@@ -56,7 +56,9 @@ class BlipTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

         self.config = config
@@ -552,7 +554,6 @@ class BlipTextPreTrainedModel(PreTrainedModel):

     config_class = BlipTextConfig
     base_model_prefix = "bert"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -808,9 +809,6 @@ class BlipTextModel(BlipTextPreTrainedModel):

 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
 class BlipTextLMHeadModel(BlipTextPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
-
     def __init__(self, config):
         super().__init__(config)

@@ -273,12 +273,6 @@ class Blip2PreTrainedModel(PreTrainedModel):
     config_class = Blip2Config
     base_model_prefix = "blip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [
-        r"position_ids",
-        r"language_model.encoder.embed_tokens.weight",
-        r"language_model.decoder.embed_tokens.weight",
-        r"language_model.lm_head.weight",
-    ]
     _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _keep_in_fp32_modules = ["wo"]
@@ -471,12 +471,6 @@ class BloomBlock(nn.Module):


 class BloomPreTrainedModel(PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
     config_class = BloomConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
@@ -826,7 +820,6 @@ class BloomModel(BloomPreTrainedModel):
     BLOOM_START_DOCSTRING,
 )
 class BloomForCausalLM(BloomPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: BloomConfig):
@@ -995,8 +988,6 @@ class BloomForCausalLM(BloomPreTrainedModel):
     BLOOM_START_DOCSTRING,
 )
 class BloomForSequenceClassification(BloomPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-
     def __init__(self, config: BloomConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1123,8 +1114,6 @@ class BloomForSequenceClassification(BloomPreTrainedModel):
     BLOOM_START_DOCSTRING,
 )
 class BloomForTokenClassification(BloomPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-
     def __init__(self, config: BloomConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1226,8 +1215,6 @@ class BloomForTokenClassification(BloomPreTrainedModel):
     BLOOM_START_DOCSTRING,
 )
 class BloomForQuestionAnswering(BloomPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
-
     def __init__(self, config):
         super().__init__(config)
         self.transformer = BloomModel(config)
@@ -280,7 +280,7 @@ class BridgeTowerVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
@@ -880,7 +880,9 @@ class BridgeTowerTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -1038,8 +1040,6 @@ class BridgeTowerTextModel(BridgeTowerPreTrainedModel):

     config_class = BridgeTowerTextConfig

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
         self.config = config
@@ -94,7 +94,9 @@ class CamembertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -627,15 +629,6 @@ class CamembertPreTrainedModel(PreTrainedModel):
         if isinstance(module, CamembertEncoder):
             module.gradient_checkpointing = value

-    def update_keys_to_ignore(self, config, del_keys_to_ignore):
-        """Remove some keys from ignore list"""
-        if not config.tie_word_embeddings:
-            # must make a new list, or the class variable gets modified!
-            self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
-            self._keys_to_ignore_on_load_missing = [
-                k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
-            ]
-

 CAMEMBERT_INPUTS_DOCSTRING = r"""
     Args:
@@ -762,7 +755,6 @@ class CamembertModel(CamembertPreTrainedModel):

     """

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     _no_split_modules = []

     # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert
@@ -935,9 +927,6 @@ class CamembertModel(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
 class CamembertForMaskedLM(CamembertPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -952,9 +941,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
         self.roberta = CamembertModel(config, add_pooling_layer=False)
         self.lm_head = CamembertLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()

@@ -1042,8 +1028,6 @@ class CamembertForMaskedLM(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
 class CamembertForSequenceClassification(CamembertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1144,8 +1128,6 @@ class CamembertForSequenceClassification(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
 class CamembertForMultipleChoice(CamembertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)

@@ -1241,9 +1223,6 @@ class CamembertForMultipleChoice(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
 class CamembertForTokenClassification(CamembertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1330,9 +1309,6 @@ class CamembertForTokenClassification(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
 class CamembertForQuestionAnswering(CamembertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1431,9 +1407,6 @@ class CamembertForQuestionAnswering(CamembertPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, roberta-base->camembert-base
 class CamembertForCausalLM(CamembertPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -1445,9 +1418,6 @@ class CamembertForCausalLM(CamembertPreTrainedModel):
         self.roberta = CamembertModel(config, add_pooling_layer=False)
         self.lm_head = CamembertLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()

@@ -216,7 +216,9 @@ class CanineEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

     def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
@@ -900,7 +902,6 @@ class CaninePreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_canine
     base_model_prefix = "canine"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -121,7 +121,9 @@ class ChineseCLIPTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -190,7 +192,7 @@ class ChineseCLIPVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
@@ -689,7 +691,6 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel):
     config_class = ChineseCLIPConfig
     base_model_prefix = "chinese_clip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1166,7 +1166,9 @@ class ClapTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
         )
@@ -1677,7 +1679,6 @@ class ClapPreTrainedModel(PreTrainedModel):
     config_class = ClapConfig
     base_model_prefix = "clap"
     supports_gradient_checkpointing = False
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1781,7 +1782,6 @@ class ClapTextModel(ClapPreTrainedModel):
     """

     config_class = ClapTextConfig
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText
     def __init__(self, config, add_pooling_layer=True):
@@ -1936,7 +1936,6 @@ class ClapTextModel(ClapPreTrainedModel):
 @add_start_docstrings(CLAP_START_DOCSTRING)
 class ClapModel(ClapPreTrainedModel):
     config_class = ClapConfig
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def __init__(self, config: ClapConfig):
         super().__init__(config)
@@ -188,7 +188,7 @@ class CLIPVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
@@ -210,7 +210,9 @@ class CLIPTextEmbeddings(nn.Module):
         self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

     def forward(
         self,
@@ -410,7 +412,6 @@ class CLIPPreTrainedModel(PreTrainedModel):
     config_class = CLIPConfig
     base_model_prefix = "clip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -181,7 +181,7 @@ class CLIPSegVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def interpolate_position_embeddings(self, new_size):
         if len(new_size) != 2:
@@ -230,7 +230,9 @@ class CLIPSegTextEmbeddings(nn.Module):
         self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

     def forward(
         self,
@@ -433,7 +435,6 @@ class CLIPSegPreTrainedModel(PreTrainedModel):
     config_class = CLIPSegConfig
     base_model_prefix = "clip"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -83,6 +83,7 @@ class CodeGenAttention(nn.Module):
             torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                 1, 1, max_positions, max_positions
             ),
+            persistent=False,
         )

         self.attn_dropout = nn.Dropout(config.attn_pdrop)
@@ -600,7 +601,6 @@ class CodeGenModel(CodeGenPreTrainedModel):
     CODEGEN_START_DOCSTRING,
 )
 class CodeGenForCausalLM(CodeGenPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -191,7 +191,9 @@ class ConvBertEmbeddings(nn.Module):
         self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -245,8 +247,6 @@ class ConvBertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_convbert
     base_model_prefix = "convbert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-    _keys_to_ignore_on_load_unexpected = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -765,8 +765,6 @@ CONVBERT_INPUTS_DOCSTRING = r"""
     CONVBERT_START_DOCSTRING,
 )
 class ConvBertModel(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.embeddings = ConvBertEmbeddings(config)
@@ -880,7 +878,6 @@ class ConvBertGeneratorPredictions(nn.Module):

 @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
 class ConvBertForMaskedLM(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids", "generator.lm_head.weight"]
     _tied_weights_keys = ["generator.lm_head.weight"]

     def __init__(self, config):
@@ -992,8 +989,6 @@ class ConvBertClassificationHead(nn.Module):
     CONVBERT_START_DOCSTRING,
 )
 class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1089,8 +1084,6 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
     CONVBERT_START_DOCSTRING,
 )
 class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
-
     def __init__(self, config):
         super().__init__(config)

@@ -1184,8 +1177,6 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
     CONVBERT_START_DOCSTRING,
 )
 class ConvBertForTokenClassification(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1267,8 +1258,6 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
     CONVBERT_START_DOCSTRING,
 )
 class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["embeddings.position_ids"]
-
     def __init__(self, config):
         super().__init__(config)

@@ -537,7 +537,6 @@ class CpmAntPreTrainedModel(PreTrainedModel):
     config_class = CpmAntConfig
     base_model_prefix = "cpmant"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -749,7 +748,6 @@ class CpmAntModel(CpmAntPreTrainedModel):
     CPMANT_START_DOCSTRING,
 )
 class CpmAntForCausalLM(CpmAntPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: CpmAntConfig):
@@ -509,7 +509,6 @@ class CTRLModel(CTRLPreTrainedModel):
     CTRL_START_DOCSTRING,
 )
 class CTRLLMHeadModel(CTRLPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -689,7 +689,6 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel):
     config_class = Data2VecAudioConfig
     base_model_prefix = "data2vec_audio"
     main_input_name = "input_values"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     supports_gradient_checkpointing = True

     def _init_weights(self, module):
@@ -80,7 +80,9 @@ class Data2VecTextForTextEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -615,15 +617,6 @@ class Data2VecTextPreTrainedModel(PreTrainedModel):
         if isinstance(module, Data2VecTextEncoder):
             module.gradient_checkpointing = value

-    def update_keys_to_ignore(self, config, del_keys_to_ignore):
-        """Remove some keys from ignore list"""
-        if not config.tie_word_embeddings:
-            # must make a new list, or the class variable gets modified!
-            self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
-            self._keys_to_ignore_on_load_missing = [
-                k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
-            ]
-

 DATA2VECTEXT_START_DOCSTRING = r"""
     Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
@@ -714,8 +707,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):

     """

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
         self.config = config
@@ -883,9 +874,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
     """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
 )
 class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -897,9 +885,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
         self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
         self.lm_head = Data2VecTextLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()

@@ -1038,9 +1023,6 @@ class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):

 @add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING)
 class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -1055,9 +1037,6 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
         self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
         self.lm_head = Data2VecTextLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()

@@ -1174,8 +1153,6 @@ class Data2VecTextLMHead(nn.Module):
     DATA2VECTEXT_START_DOCSTRING,
 )
 class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1273,8 +1250,6 @@ class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
     DATA2VECTEXT_START_DOCSTRING,
 )
 class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)

@@ -1369,9 +1344,6 @@ class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
     DATA2VECTEXT_START_DOCSTRING,
 )
 class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1478,9 +1450,6 @@ class Data2VecTextClassificationHead(nn.Module):
     DATA2VECTEXT_START_DOCSTRING,
 )
 class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -470,7 +470,7 @@ class Data2VecVisionRelativePositionBias(nn.Module):
         relative_position_index[0:, 0] = self.num_relative_distance - 2
         relative_position_index[0, 0] = self.num_relative_distance - 1

-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

     def forward(self) -> torch.Tensor:
         relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
@ -764,7 +764,9 @@ class DebertaEmbeddings(nn.Module):
|
||||
self.config = config
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
|
||||
if input_ids is not None:
|
||||
@ -821,7 +823,6 @@ class DebertaPreTrainedModel(PreTrainedModel):
|
||||
|
||||
config_class = DebertaConfig
|
||||
base_model_prefix = "deberta"
|
||||
_keys_to_ignore_on_load_missing = ["position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
|
||||
supports_gradient_checkpointing = True
|
||||
|
||||
@ -1020,8 +1021,6 @@ class DebertaModel(DebertaPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
|
||||
class DebertaForMaskedLM(DebertaPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1277,8 +1276,6 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaForTokenClassification(DebertaPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1352,8 +1349,6 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -862,7 +862,9 @@ class DebertaV2Embeddings(nn.Module):
|
||||
self.config = config
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
|
||||
if input_ids is not None:
|
||||
@ -920,7 +922,6 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
|
||||
|
||||
config_class = DebertaV2Config
|
||||
base_model_prefix = "deberta"
|
||||
_keys_to_ignore_on_load_missing = ["position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
|
||||
supports_gradient_checkpointing = True
|
||||
|
||||
@ -1120,8 +1121,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
|
||||
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1380,8 +1379,6 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
|
||||
)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
|
||||
class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1455,8 +1452,6 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -476,8 +476,6 @@ class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
@ -747,8 +745,6 @@ class DecisionTransformerPreTrainedModel(PreTrainedModel):
|
||||
base_model_prefix = "decision_transformer"
|
||||
main_input_name = "states"
|
||||
supports_gradient_checkpointing = False
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
|
@ -1823,7 +1823,6 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):
|
||||
)
|
||||
class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
|
||||
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
|
||||
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
|
||||
_tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
|
||||
|
||||
def __init__(self, config: DeformableDetrConfig):
|
||||
|
@ -1775,7 +1775,6 @@ class DetaModel(DetaPreTrainedModel):
|
||||
)
|
||||
class DetaForObjectDetection(DetaPreTrainedModel):
|
||||
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
|
||||
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]
|
||||
_tied_weights_keys = [r"bbox_embed\.\d+"]
|
||||
|
||||
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
|
||||
|
@ -595,7 +595,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
||||
DISTILBERT_START_DOCSTRING,
|
||||
)
|
||||
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["vocab_projector.weight"]
|
||||
_tied_weights_keys = ["vocab_projector.weight"]
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
|
@ -296,8 +296,6 @@ class DPRPretrainedContextEncoder(DPRPreTrainedModel):
|
||||
config_class = DPRConfig
|
||||
load_tf_weights = None
|
||||
base_model_prefix = "ctx_encoder"
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
|
||||
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
|
||||
@ -309,8 +307,6 @@ class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
|
||||
config_class = DPRConfig
|
||||
load_tf_weights = None
|
||||
base_model_prefix = "question_encoder"
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
|
||||
class DPRPretrainedReader(DPRPreTrainedModel):
|
||||
@ -322,7 +318,6 @@ class DPRPretrainedReader(DPRPreTrainedModel):
|
||||
config_class = DPRConfig
|
||||
load_tf_weights = None
|
||||
base_model_prefix = "span_predictor"
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
|
||||
###############
|
||||
|
@ -161,7 +161,9 @@ class ElectraEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
self.register_buffer(
|
||||
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
|
||||
@ -672,8 +674,6 @@ class ElectraPreTrainedModel(PreTrainedModel):
|
||||
load_tf_weights = load_tf_weights_in_electra
|
||||
base_model_prefix = "electra"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
|
||||
def _init_weights(self, module):
|
||||
@ -1166,7 +1166,6 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
||||
ELECTRA_START_DOCSTRING,
|
||||
)
|
||||
class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["generator_lm_head.weight"]
|
||||
_tied_weights_keys = ["generator_lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1534,7 +1533,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
|
||||
"""ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING
|
||||
)
|
||||
class ElectraForCausalLM(ElectraPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["generator_lm_head.weight"]
|
||||
_tied_weights_keys = ["generator_lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -89,7 +89,9 @@ class ErnieEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
self.register_buffer(
|
||||
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
|
||||
)
|
||||
@ -661,7 +663,6 @@ class ErniePreTrainedModel(PreTrainedModel):
|
||||
config_class = ErnieConfig
|
||||
base_model_prefix = "ernie"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -983,7 +984,6 @@ class ErnieModel(ErniePreTrainedModel):
|
||||
ERNIE_START_DOCSTRING,
|
||||
)
|
||||
class ErnieForPreTraining(ErniePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie
|
||||
@ -1095,8 +1095,6 @@ class ErnieForPreTraining(ErniePreTrainedModel):
|
||||
"""Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING
|
||||
)
|
||||
class ErnieForCausalLM(ErniePreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie
|
||||
@ -1243,8 +1241,6 @@ class ErnieForCausalLM(ErniePreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING)
|
||||
class ErnieForMaskedLM(ErniePreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie
|
||||
@ -1665,8 +1661,6 @@ class ErnieForMultipleChoice(ErniePreTrainedModel):
|
||||
ERNIE_START_DOCSTRING,
|
||||
)
|
||||
class ErnieForTokenClassification(ErniePreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1746,8 +1740,6 @@ class ErnieForTokenClassification(ErniePreTrainedModel):
|
||||
ERNIE_START_DOCSTRING,
|
||||
)
|
||||
class ErnieForQuestionAnswering(ErniePreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
@ -412,7 +412,6 @@ class ErnieMPreTrainedModel(PreTrainedModel):
|
||||
config_class = ErnieMConfig
|
||||
base_model_prefix = "ernie_m"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
|
@ -96,7 +96,7 @@ class RotaryEmbedding(torch.nn.Module):
|
||||
# Generate and save the inverse frequency buffer (non trainable)
|
||||
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
|
||||
inv_freq = inv_freq
|
||||
self.register_buffer("inv_freq", inv_freq)
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
|
||||
self._seq_len_cached = None
|
||||
self._cos_cached = None
|
||||
@ -178,7 +178,9 @@ class EsmEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.position_embeddings = nn.Embedding(
|
||||
@ -783,7 +785,6 @@ class EsmModel(EsmPreTrainedModel):
|
||||
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
|
||||
"""
|
||||
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
supports_gradient_checkpointing = False
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
@ -960,8 +961,6 @@ class EsmModel(EsmPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING)
|
||||
class EsmForMaskedLM(EsmPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", "lm_head.decoder.weight"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_tied_weights_keys = ["lm_head.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1081,8 +1080,6 @@ class EsmLMHead(nn.Module):
|
||||
ESM_START_DOCSTRING,
|
||||
)
|
||||
class EsmForSequenceClassification(EsmPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1177,9 +1174,6 @@ class EsmForSequenceClassification(EsmPreTrainedModel):
|
||||
ESM_START_DOCSTRING,
|
||||
)
|
||||
class EsmForTokenClassification(EsmPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -378,8 +378,6 @@ class FlaubertPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
class FlaubertModel(FlaubertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config): # , dico, is_encoder, with_output):
|
||||
super().__init__(config)
|
||||
|
||||
@ -448,7 +446,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
|
||||
self.layerdrop = getattr(config, "layerdrop", 0.0)
|
||||
self.pre_norm = getattr(config, "pre_norm", False)
|
||||
@ -654,7 +651,6 @@ class FlaubertModel(FlaubertPreTrainedModel):
|
||||
)
|
||||
# Copied transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
|
||||
class FlaubertWithLMHeadModel(FlaubertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"]
|
||||
_tied_weights_keys = ["pred_layer.proj.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -387,7 +387,9 @@ class FlavaTextEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
self.register_buffer(
|
||||
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
|
||||
)
|
||||
@ -1724,12 +1726,6 @@ class FlavaGlobalContrastiveHead(nn.Module):
|
||||
)
|
||||
class FlavaForPreTraining(FlavaPreTrainedModel):
|
||||
# Those are linked to xxx.bias
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
"mmm_text_head.decoder.bias",
|
||||
"mmm_image_head.decoder.bias",
|
||||
"mlm_head.decoder.bias",
|
||||
"mim_head.decoder.bias",
|
||||
]
|
||||
_tied_weights_keys = [
|
||||
"mmm_text_head.decoder.bias",
|
||||
"mmm_image_head.decoder.bias",
|
||||
|
@ -114,7 +114,9 @@ class FNetEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
self.register_buffer(
|
||||
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
|
||||
@ -411,7 +413,6 @@ class FNetPreTrainedModel(PreTrainedModel):
|
||||
config_class = FNetConfig
|
||||
base_model_prefix = "fnet"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -621,7 +622,6 @@ class FNetModel(FNetPreTrainedModel):
|
||||
FNET_START_DOCSTRING,
|
||||
)
|
||||
class FNetForPreTraining(FNetPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -716,7 +716,6 @@ class FNetForPreTraining(FNetPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""FNet Model with a `language modeling` head on top.""", FNET_START_DOCSTRING)
|
||||
class FNetForMaskedLM(FNetPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -1034,7 +1034,6 @@ def _get_shape(t):
|
||||
FSMT_START_DOCSTRING,
|
||||
)
|
||||
class FSMTModel(PretrainedFSMTModel):
|
||||
_keys_to_ignore_on_load_missing = ["decoder.output_projection.weight"]
|
||||
_tied_weights_keys = ["decoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config: FSMTConfig):
|
||||
@ -1172,15 +1171,6 @@ class FSMTModel(PretrainedFSMTModel):
|
||||
)
|
||||
class FSMTForConditionalGeneration(PretrainedFSMTModel):
|
||||
base_model_prefix = "model"
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
"model.encoder.embed_positions.weight",
|
||||
"model.decoder.embed_positions.weight",
|
||||
"decoder.output_projection.weight",
|
||||
]
|
||||
_keys_to_ignore_on_save = [
|
||||
"model.encoder.embed_positions.weight",
|
||||
"model.decoder.embed_positions.weight",
|
||||
]
|
||||
_tied_weights_keys = ["model.decoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config: FSMTConfig):
|
||||
|
@ -1190,7 +1190,6 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
|
||||
class FunnelForMaskedLM(FunnelPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config: FunnelConfig) -> None:
|
||||
|
@ -109,7 +109,9 @@ class GitEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -510,7 +512,6 @@ class GitPreTrainedModel(PreTrainedModel):
|
||||
config_class = GitConfig
|
||||
base_model_prefix = "git"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -623,7 +624,7 @@ class GitVisionEmbeddings(nn.Module):
|
||||
self.num_patches = (self.image_size // self.patch_size) ** 2
|
||||
self.num_positions = self.num_patches + 1
|
||||
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
|
||||
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
|
||||
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
|
||||
|
||||
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
|
||||
batch_size = pixel_values.shape[0]
|
||||
|
@ -668,9 +668,6 @@ DEPARALLELIZE_DOCSTRING = r"""
|
||||
GPT2_START_DOCSTRING,
|
||||
)
|
||||
class GPT2Model(GPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
|
||||
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
@ -957,8 +954,6 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
GPT2_START_DOCSTRING,
|
||||
)
|
||||
class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1151,8 +1146,6 @@ input sequence).
|
||||
GPT2_START_DOCSTRING,
|
||||
)
|
||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
|
||||
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1381,9 +1374,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
GPT2_START_DOCSTRING,
|
||||
)
|
||||
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1605,9 +1595,6 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
|
||||
GPT2_START_DOCSTRING,
|
||||
)
|
||||
class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"]
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -500,8 +500,6 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
|
||||
GPT_BIGCODE_START_DOCSTRING,
|
||||
)
|
||||
class GPTBigCodeModel(GPTBigCodePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.multi_query = config.multi_query
|
||||
@ -722,7 +720,6 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
|
||||
GPT_BIGCODE_START_DOCSTRING,
|
||||
)
|
||||
class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -876,8 +873,6 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
|
||||
GPT_BIGCODE_START_DOCSTRING,
|
||||
)
|
||||
class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -145,8 +145,8 @@ class GPTNeoSelfAttention(nn.Module):
|
||||
if attention_type == "local":
|
||||
bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size))
|
||||
|
||||
self.register_buffer("bias", bias)
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e9))
|
||||
self.register_buffer("bias", bias, persistent=False)
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
|
||||
|
||||
self.attn_dropout = nn.Dropout(float(config.attention_dropout))
|
||||
self.resid_dropout = nn.Dropout(float(config.resid_dropout))
|
||||
@ -663,12 +663,6 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
|
||||
GPT_NEO_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"h\.\d+\.attn\.masked_bias",
|
||||
r"lm_head.weight",
|
||||
r"h\.\d+\.attn\.attention\.bias",
|
||||
]
|
||||
_keys_to_ignore_on_save = [r"lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -820,8 +814,6 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
|
||||
GPT_NEO_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1025,8 +1017,6 @@ class GPTNeoForTokenClassification(GPTNeoPreTrainedModel):
|
||||
GPT_NEO_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -100,8 +100,9 @@ class GPTNeoXAttention(nn.Module):
|
||||
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
|
||||
1, 1, max_positions, max_positions
|
||||
),
|
||||
persistent=False,
|
||||
)
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e9))
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
|
||||
self.rotary_emb = RotaryEmbedding(
|
||||
self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base
|
||||
)
|
||||
@ -600,7 +601,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
|
||||
"""GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
|
||||
)
|
||||
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
||||
_tied_weights_keys = ["embed_out.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -775,8 +775,6 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
|
||||
GPT_NEOX_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -971,8 +969,6 @@ class GPTNeoXForTokenClassification(GPTNeoXPreTrainedModel):
|
||||
GPT_NEOX_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -591,7 +591,6 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
|
||||
GPT_NEOX_JAPANESE_START_DOCSTRING,
|
||||
)
|
||||
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"]
|
||||
_tied_weights_keys = ["embed_out.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -734,7 +734,6 @@ class GPTJModel(GPTJPreTrainedModel):
|
||||
GPTJ_START_DOCSTRING,
|
||||
)
|
||||
class GPTJForCausalLM(GPTJPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -933,8 +932,6 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
|
||||
GPTJ_START_DOCSTRING,
|
||||
)
|
||||
class GPTJForSequenceClassification(GPTJPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1059,8 +1056,6 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
|
||||
GPTJ_START_DOCSTRING,
|
||||
)
|
||||
class GPTJForQuestionAnswering(GPTJPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -1111,7 +1111,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
|
||||
GPTSAN_JAPANESE_START_DOCSTRING,
|
||||
)
|
||||
class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config: GPTSanJapaneseConfig):
|
||||
|
@ -714,7 +714,6 @@ class GraphormerPreTrainedModel(PreTrainedModel):
|
||||
config_class = GraphormerConfig
|
||||
base_model_prefix = "graphormer"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
main_input_name_nodes = "input_nodes"
|
||||
main_input_name_edges = "input_edges"
|
||||
|
||||
|
@ -450,7 +450,9 @@ class GroupViTTextEmbeddings(nn.Module):
|
||||
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -767,7 +769,6 @@ class GroupViTPreTrainedModel(PreTrainedModel):
|
||||
config_class = GroupViTConfig
|
||||
base_model_prefix = "groupvit"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
|
@ -869,7 +869,6 @@ class HubertPreTrainedModel(PreTrainedModel):
|
||||
base_model_prefix = "hubert"
|
||||
main_input_name = "input_values"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
|
@ -80,7 +80,9 @@ class IBertEmbeddings(nn.Module):
|
||||
)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
|
||||
# End copy
|
||||
@ -740,8 +742,6 @@ class IBertModel(IBertPreTrainedModel):
|
||||
|
||||
"""
|
||||
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
@ -854,8 +854,6 @@ class IBertModel(IBertPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""I-BERT Model with a `language modeling` head on top.""", IBERT_START_DOCSTRING)
|
||||
class IBertForMaskedLM(IBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.bias", "lm_head.decoder.weight"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -969,8 +967,6 @@ class IBertLMHead(nn.Module):
|
||||
IBERT_START_DOCSTRING,
|
||||
)
|
||||
class IBertForSequenceClassification(IBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1064,8 +1060,6 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
|
||||
IBERT_START_DOCSTRING,
|
||||
)
|
||||
class IBertForMultipleChoice(IBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
@ -1156,9 +1150,6 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
|
||||
IBERT_START_DOCSTRING,
|
||||
)
|
||||
class IBertForTokenClassification(IBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1256,9 +1247,6 @@ class IBertClassificationHead(nn.Module):
|
||||
IBERT_START_DOCSTRING,
|
||||
)
|
||||
class IBertForQuestionAnswering(IBertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -183,8 +183,9 @@ class ImageGPTAttention(nn.Module):
|
||||
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
|
||||
1, 1, max_positions, max_positions
|
||||
),
|
||||
persistent=False,
|
||||
)
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e4))
|
||||
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
|
||||
|
||||
self.embed_dim = config.hidden_size
|
||||
self.num_heads = config.num_attention_heads
|
||||
@ -613,8 +614,6 @@ IMAGEGPT_INPUTS_DOCSTRING = r"""
|
||||
IMAGEGPT_START_DOCSTRING,
|
||||
)
|
||||
class ImageGPTModel(ImageGPTPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["attn.masked_bias"]
|
||||
|
||||
def __init__(self, config: ImageGPTConfig):
|
||||
super().__init__(config)
|
||||
|
||||
@ -893,7 +892,6 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
|
||||
IMAGEGPT_START_DOCSTRING,
|
||||
)
|
||||
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
|
||||
def __init__(self, config: ImageGPTConfig):
|
||||
@ -1085,8 +1083,6 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
|
||||
IMAGEGPT_START_DOCSTRING,
|
||||
)
|
||||
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config: ImageGPTConfig):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -602,7 +602,6 @@ Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/
|
||||
class JukeboxVQVAE(PreTrainedModel):
|
||||
config_class = JukeboxVQVAEConfig
|
||||
base_model_prefix = "vqvae"
|
||||
_keys_to_ignore_on_load_unexpected = [r"priors"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
if isinstance(module, nn.Embedding): # embed_tokens
|
||||
@ -1792,7 +1791,6 @@ class JukeboxPrior(PreTrainedModel):
|
||||
"""
|
||||
|
||||
config_class = JukeboxPriorConfig
|
||||
_keys_to_ignore_on_load_unexpected = ["vqvae"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
init_scale = self.config.init_scale
|
||||
@ -1832,7 +1830,6 @@ class JukeboxPrior(PreTrainedModel):
|
||||
self.level = level if level is not None else config.level
|
||||
|
||||
self.base_model_prefix = f"priors.{self.level}"
|
||||
self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level]
|
||||
|
||||
self.n_ctx = config.n_ctx
|
||||
|
||||
|
@ -68,7 +68,9 @@ class LayoutLMEmbeddings(nn.Module):
|
||||
self.LayerNorm = LayoutLMLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -619,7 +621,6 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
|
||||
pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
base_model_prefix = "layoutlm"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -857,11 +858,6 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING)
|
||||
class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
"cls.predictions.decoder.bias",
|
||||
"cls.predictions.decoder.weight",
|
||||
"embeddings.position_ids",
|
||||
]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -77,7 +77,9 @@ class LayoutLMv2Embeddings(nn.Module):
|
||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
def _calc_spatial_position_embeddings(self, bbox):
|
||||
try:
|
||||
@ -506,7 +508,6 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel):
|
||||
config_class = LayoutLMv2Config
|
||||
pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
base_model_prefix = "layoutlmv2"
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -567,8 +568,11 @@ class LayoutLMv2VisualBackbone(nn.Module):
|
||||
self.register_buffer(
|
||||
"pixel_mean",
|
||||
torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1),
|
||||
persistent=False,
|
||||
)
|
||||
self.register_buffer(
|
||||
"pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1), persistent=False
|
||||
)
|
||||
self.register_buffer("pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1))
|
||||
self.out_feature_key = "p2"
|
||||
if torch.are_deterministic_algorithms_enabled():
|
||||
logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`")
|
||||
|
@ -245,7 +245,9 @@ class LayoutLMv3TextEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.position_embeddings = nn.Embedding(
|
||||
@ -750,8 +752,6 @@ class LayoutLMv3Output(nn.Module):
|
||||
LAYOUTLMV3_START_DOCSTRING,
|
||||
)
|
||||
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
@ -1038,9 +1038,6 @@ class LayoutLMv3ClassificationHead(nn.Module):
|
||||
LAYOUTLMV3_START_DOCSTRING,
|
||||
)
|
||||
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1153,9 +1150,6 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
|
||||
LAYOUTLMV3_START_DOCSTRING,
|
||||
)
|
||||
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -1286,8 +1280,6 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
|
||||
LAYOUTLMV3_START_DOCSTRING,
|
||||
)
|
||||
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -2209,7 +2209,6 @@ class LEDDecoder(LEDPreTrainedModel):
|
||||
LED_START_DOCSTRING,
|
||||
)
|
||||
class LEDModel(LEDPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config: LEDConfig):
|
||||
@ -2335,14 +2334,7 @@ class LEDModel(LEDPreTrainedModel):
|
||||
)
|
||||
class LEDForConditionalGeneration(LEDPreTrainedModel):
|
||||
base_model_prefix = "led"
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"final_logits_bias",
|
||||
r"encoder.version",
|
||||
r"decoder.version",
|
||||
r"lm_head.weight",
|
||||
"decoder.embed_tokens.weight",
|
||||
"encoder.embed_tokens.weight",
|
||||
]
|
||||
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
|
||||
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
|
||||
|
||||
def __init__(self, config: LEDConfig):
|
||||
@ -2530,7 +2522,6 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
|
||||
LED_START_DOCSTRING,
|
||||
)
|
||||
class LEDForSequenceClassification(LEDPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config: LEDConfig, **kwargs):
|
||||
@ -2667,7 +2658,6 @@ class LEDForSequenceClassification(LEDPreTrainedModel):
|
||||
LED_START_DOCSTRING,
|
||||
)
|
||||
class LEDForQuestionAnswering(LEDPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -195,7 +195,9 @@ class LevitAttention(nn.Module):
|
||||
|
||||
self.attention_bias_cache = {}
|
||||
self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
|
||||
self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points))
|
||||
self.register_buffer(
|
||||
"attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points), persistent=False
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def train(self, mode=True):
|
||||
@ -271,7 +273,9 @@ class LevitAttentionSubsample(nn.Module):
|
||||
indices.append(attention_offsets[offset])
|
||||
|
||||
self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
|
||||
self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points))
|
||||
self.register_buffer(
|
||||
"attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points), persistent=False
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def train(self, mode=True):
|
||||
|
@ -59,7 +59,9 @@ class LiltTextEmbeddings(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.register_buffer(
|
||||
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
||||
)
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
|
||||
# End copy
|
||||
@ -610,15 +612,6 @@ class LiltPreTrainedModel(PreTrainedModel):
|
||||
if isinstance(module, LiltEncoder):
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
def update_keys_to_ignore(self, config, del_keys_to_ignore):
|
||||
"""Remove some keys from ignore list"""
|
||||
if not config.tie_word_embeddings:
|
||||
# must make a new list, or the class variable gets modified!
|
||||
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
|
||||
self._keys_to_ignore_on_load_missing = [
|
||||
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
|
||||
]
|
||||
|
||||
|
||||
LILT_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
@ -697,8 +690,6 @@ LILT_INPUTS_DOCSTRING = r"""
|
||||
LILT_START_DOCSTRING,
|
||||
)
|
||||
class LiltModel(LiltPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
@ -847,8 +838,6 @@ class LiltModel(LiltPreTrainedModel):
|
||||
LILT_START_DOCSTRING,
|
||||
)
|
||||
class LiltForSequenceClassification(LiltPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Lilt, roberta->lilt
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -967,9 +956,6 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
|
||||
LILT_START_DOCSTRING,
|
||||
)
|
||||
class LiltForTokenClassification(LiltPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Lilt, roberta->lilt
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1096,9 +1082,6 @@ class LiltClassificationHead(nn.Module):
|
||||
LILT_START_DOCSTRING,
|
||||
)
|
||||
class LiltForQuestionAnswering(LiltPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Lilt, roberta->lilt
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
@ -344,7 +344,6 @@ class LlamaPreTrainedModel(PreTrainedModel):
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["LlamaDecoderLayer"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
@ -784,8 +783,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
|
||||
LLAMA_START_DOCSTRING,
|
||||
)
|
||||
class LlamaForSequenceClassification(LlamaPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -1421,7 +1421,6 @@ class LongformerPreTrainedModel(PreTrainedModel):
|
||||
config_class = LongformerConfig
|
||||
base_model_prefix = "longformer"
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_unexpected = [r"position_ids"]
|
||||
_no_split_modules = ["LongformerSelfAttention"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
@ -1770,8 +1769,6 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING)
|
||||
class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["lm_head.decoder"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_tied_weights_keys = ["lm_head.decoder"]
|
||||
|
||||
def __init__(self, config):
|
||||
@ -1886,8 +1883,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
LONGFORMER_START_DOCSTRING,
|
||||
)
|
||||
class LongformerForSequenceClassification(LongformerPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -2015,8 +2010,6 @@ class LongformerClassificationHead(nn.Module):
|
||||
LONGFORMER_START_DOCSTRING,
|
||||
)
|
||||
class LongformerForQuestionAnswering(LongformerPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
@ -2154,8 +2147,6 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
|
||||
LONGFORMER_START_DOCSTRING,
|
||||
)
|
||||
class LongformerForTokenClassification(LongformerPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -1763,10 +1763,6 @@ num_heads)`.
|
||||
LONGT5_START_DOCSTRING,
|
||||
)
|
||||
class LongT5Model(LongT5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"encoder.embed_tokens.weight",
|
||||
r"decoder.embed_tokens.weight",
|
||||
]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
|
||||
]
|
||||
@ -1917,11 +1913,6 @@ class LongT5Model(LongT5PreTrainedModel):
|
||||
|
||||
@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
|
||||
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"encoder.embed_tokens.weight",
|
||||
r"decoder.embed_tokens.weight",
|
||||
r"lm_head.weight",
|
||||
]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
|
||||
]
|
||||
@ -2160,7 +2151,6 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
|
||||
LONGT5_START_DOCSTRING,
|
||||
)
|
||||
class LongT5EncoderModel(LongT5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
|
||||
_tied_weights_keys = ["encoder.embed_tokens.weight"]
|
||||
|
||||
def __init__(self, config: LongT5Config):
|
||||
|
@ -1022,8 +1022,6 @@ LUKE_INPUTS_DOCSTRING = r"""
|
||||
LUKE_START_DOCSTRING,
|
||||
)
|
||||
class LukeModel(LukePreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def __init__(self, config: LukeConfig, add_pooling_layer: bool = True):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
@ -1278,17 +1276,6 @@ class LukeLMHead(nn.Module):
|
||||
LUKE_START_DOCSTRING,
|
||||
)
|
||||
class LukeForMaskedLM(LukePreTrainedModel):
|
||||
_keys_to_ignore_on_save = [
|
||||
r"lm_head.decoder.weight",
|
||||
r"lm_head.decoder.bias",
|
||||
r"entity_predictions.decoder.weight",
|
||||
]
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"position_ids",
|
||||
r"lm_head.decoder.weight",
|
||||
r"lm_head.decoder.bias",
|
||||
r"entity_predictions.decoder.weight",
|
||||
]
|
||||
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|
@ -1018,7 +1018,6 @@ class LxmertModel(LxmertPreTrainedModel):
|
||||
LXMERT_START_DOCSTRING,
|
||||
)
|
||||
class LxmertForPreTraining(LxmertPreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight"]
|
||||
_tied_weights_keys = ["cls.predictions.decoder.weight"]
|
||||
|
||||
def __init__(self, config):
|
||||
|

@ -131,7 +131,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
# in forward put the weights on the correct dtype and device of the param
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

self.register_buffer("weights", emb_weights)
self.register_buffer("weights", emb_weights, persistent=False)

@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
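
Throughout this diff, buffers such as position_ids and the sinusoidal positional weights are re-registered with persistent=False. A non-persistent buffer is excluded from state_dict(), so it never shows up as a missing key when a checkpoint is loaded, which appears to be why entries like r"position_ids" are also being dropped from _keys_to_ignore_on_load_missing. A minimal standalone sketch of the difference (plain PyTorch with made-up names, not transformers code):

import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        # A persistent buffer is serialized into state_dict and expected back on load.
        self.register_buffer("persistent_ids", torch.arange(4))
        # A non-persistent buffer still lives on the module (it follows device moves
        # and dtype casts) but is left out of state_dict entirely.
        self.register_buffer("volatile_ids", torch.arange(4), persistent=False)

toy = Toy()
print(sorted(toy.state_dict()))                   # ['persistent_ids']
print(sorted(n for n, _ in toy.named_buffers()))  # ['persistent_ids', 'volatile_ids']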
@ -1137,14 +1137,6 @@ class M2M100Decoder(M2M100PreTrainedModel):
M2M_100_START_DOCSTRING,
)
class M2M100Model(M2M100PreTrainedModel):
_keys_to_ignore_on_load_missing = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_positions.weights",
"encoder.embed_positions.bias",
"decoder.embed_positions.weights",
"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: M2M100Config):
@ -1258,17 +1250,6 @@ class M2M100Model(M2M100PreTrainedModel):
)
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"encoder.embed_positions.weights",
r"encoder.embed_positions.bias",
r"decoder.embed_positions.weights",
r"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: M2M100Config):

@ -1103,7 +1103,6 @@ class MarianDecoder(MarianPreTrainedModel):
"The bare Marian Model outputting raw hidden-states without any specific head on top.", MARIAN_START_DOCSTRING
)
class MarianModel(MarianPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: MarianConfig):
@ -1292,13 +1291,9 @@ class MarianModel(MarianPreTrainedModel):
class MarianMTModel(MarianPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"final_logits_bias",
"encoder.embed_positions.weight",
"decoder.embed_positions.weight",
]
_keys_to_ignore_on_save = ["model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
@ -1561,7 +1556,6 @@ class MarianDecoderWrapper(MarianPreTrainedModel):

# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Marian, facebook/bart-base->Helsinki-NLP/opus-mt-fr-en
class MarianForCausalLM(MarianPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -143,7 +143,9 @@ class MarkupLMEmbeddings(nn.Module):
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)

self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
@ -713,7 +715,6 @@ class MarkupLMPreTrainedModel(PreTrainedModel):
config_class = MarkupLMConfig
pretrained_model_archive_map = MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST
base_model_prefix = "markuplm"
_keys_to_ignore_on_load_missing = [r"position_ids"]

# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with Bert->MarkupLM
def _init_weights(self, module):
@ -971,8 +972,6 @@ class MarkupLMModel(MarkupLMPreTrainedModel):
MARKUPLM_START_DOCSTRING,
)
class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with bert->markuplm, Bert->MarkupLM
def __init__(self, config):
super().__init__(config)

@ -1156,7 +1156,6 @@ class MBartDecoder(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartModel(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: MBartConfig):
@ -1277,14 +1276,7 @@ class MBartModel(MBartPreTrainedModel):
)
class MBartForConditionalGeneration(MBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: MBartConfig):
@ -1452,7 +1444,6 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartForSequenceClassification(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

def __init__(self, config: MBartConfig, **kwargs):
@ -1582,7 +1573,6 @@ class MBartForSequenceClassification(MBartPreTrainedModel):
MBART_START_DOCSTRING,
)
class MBartForQuestionAnswering(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

def __init__(self, config):
@ -1716,7 +1706,6 @@ class MBartDecoderWrapper(MBartPreTrainedModel):

# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
class MBartForCausalLM(MBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -149,7 +149,9 @@ class MCTCTEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
@ -443,7 +445,6 @@ class MCTCTPreTrainedModel(PreTrainedModel):
config_class = MCTCTConfig
base_model_prefix = "mctct"
main_input_name = "input_features"
_keys_to_ignore_on_load_missing = ["position_ids"]
supports_gradient_checkpointing = True

def _init_weights(self, module):

@ -1387,15 +1387,6 @@ class MegaPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
module.weight.data.fill_(1.0)

def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]


MEGA_START_DOCSTRING = r"""

@ -1474,8 +1465,6 @@ class MegaModel(MegaPreTrainedModel):

"""

_keys_to_ignore_on_load_missing = []

def __init__(self, config: MegaConfig, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -1656,9 +1645,6 @@ class MegaModel(MegaPreTrainedModel):
"""MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
)
class MegaForCausalLM(MegaPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.weight", r"lm_head.bias"]
_keys_to_ignore_on_load_missing = [r"lm_head.weight", r"lm_head.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config: MegaConfig):
@ -1678,9 +1664,6 @@ class MegaForCausalLM(MegaPreTrainedModel):

self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.weight"])

# Initialize weights and apply final processing
self.post_init()

@ -1821,9 +1804,6 @@ class MegaForCausalLM(MegaPreTrainedModel):

@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
class MegaForMaskedLM(MegaPreTrainedModel):
_keys_to_ignore_on_save = [r"mlm_head.weight", r"mlm_head.bias"]
_keys_to_ignore_on_load_missing = [r"mlm_head.weight", r"mlm_head.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["mlm_head.weight"]

def __init__(self, config: MegaConfig):
@ -1845,9 +1825,6 @@ class MegaForMaskedLM(MegaPreTrainedModel):
self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.dropout = nn.Dropout(config.dropout_prob)

# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["mlm_head.weight"])

# Initialize weights and apply final processing
self.post_init()

@ -1931,8 +1908,6 @@ class MegaForMaskedLM(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForSequenceClassification(MegaPreTrainedModel):
_keys_to_ignore_on_load_missing = []

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2024,8 +1999,6 @@ class MegaForSequenceClassification(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForMultipleChoice(MegaPreTrainedModel):
_keys_to_ignore_on_load_missing = []

def __init__(self, config):
super().__init__(config)

@ -2111,9 +2084,6 @@ class MegaForMultipleChoice(MegaPreTrainedModel):
MEGA_START_DOCSTRING,
)
class MegaForTokenClassification(MegaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = []

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -2214,9 +2184,6 @@ class MegaClassificationHead(nn.Module):
MEGA_START_DOCSTRING,
)
class MegaForQuestionAnswering(MegaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = []

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -149,7 +149,9 @@ class MegatronBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

def forward(
@ -713,7 +715,6 @@ class MegatronBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_megatron_bert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -1014,7 +1015,6 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config, add_binary_head=True):
@ -1121,8 +1121,6 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):
@ -1267,8 +1265,6 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):

@add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING)
class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):
@ -1376,8 +1372,6 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"predictions"]

def __init__(self, config):
super().__init__(config)

@ -1672,8 +1666,6 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1752,8 +1744,6 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -191,7 +191,9 @@ class MobileBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)

def forward(
self,
@ -686,7 +688,6 @@ class MobileBertPreTrainedModel(PreTrainedModel):
pretrained_model_archive_map = MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST
load_tf_weights = load_tf_weights_in_mobilebert
base_model_prefix = "mobilebert"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -923,11 +924,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
MOBILEBERT_START_DOCSTRING,
)
class MobileBertForPreTraining(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"cls.predictions.decoder.weight",
"cls.predictions.decoder.bias",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

def __init__(self, config):
@ -1036,12 +1032,6 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):

@add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING)
class MobileBertForMaskedLM(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [
"cls.predictions.decoder.weight",
"cls.predictions.decoder.bias",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

def __init__(self, config):
@ -1350,8 +1340,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
)
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing
class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1553,8 +1541,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
)
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing
class MobileBertForTokenClassification(MobileBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -83,7 +83,9 @@ class MPNetEmbeddings(nn.Module):

self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)

def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
if position_ids is None:
@ -479,8 +481,6 @@ MPNET_INPUTS_DOCSTRING = r"""
MPNET_START_DOCSTRING,
)
class MPNetModel(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
@ -570,8 +570,6 @@ class MPNetModel(MPNetPreTrainedModel):


class MPNetForMaskedLM(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder"]

def __init__(self, config):
@ -679,8 +677,6 @@ class MPNetLMHead(nn.Module):
MPNET_START_DOCSTRING,
)
class MPNetForSequenceClassification(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)

@ -773,8 +769,6 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel):
MPNET_START_DOCSTRING,
)
class MPNetForMultipleChoice(MPNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)

@ -863,9 +857,6 @@ class MPNetForMultipleChoice(MPNetPreTrainedModel):
MPNET_START_DOCSTRING,
)
class MPNetForTokenClassification(MPNetPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -962,9 +953,6 @@ class MPNetClassificationHead(nn.Module):
MPNET_START_DOCSTRING,
)
class MPNetForQuestionAnswering(MPNetPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)

@ -1316,18 +1316,8 @@ class MT5Model(MT5PreTrainedModel):
```"""
model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

# Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5
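
The MT5 hunks here are typical of the whole diff: the long per-class lists in _keys_to_ignore_on_load_missing / _keys_to_ignore_on_save are collapsed, and the keys that exist only because of weight tying now live in _tied_weights_keys. Tied parameters share one underlying storage, so they can be recognized from the tensors themselves; a small illustrative sketch of that idea (plain PyTorch with a made-up module, not the transformers implementation):

import torch
from torch import nn

class TinyLM(nn.Module):
    def __init__(self, vocab=100, dim=16):
        super().__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.lm_head = nn.Linear(dim, vocab, bias=False)
        # Tie the output projection to the input embedding: both parameter
        # names now refer to the same tensor.
        self.lm_head.weight = self.embed.weight

model = TinyLM()
groups = {}
for name, tensor in model.state_dict().items():
    groups.setdefault(tensor.data_ptr(), []).append(name)
print([names for names in groups.values() if len(names) > 1])
# [['embed.weight', 'lm_head.weight']] -- a checkpoint format that deduplicates
# shared tensors stores only one of these, so the other is legitimately
# "missing" when the file is read back.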
@ -1552,15 +1542,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel):

model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

# Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
@ -1897,13 +1879,6 @@ class MT5EncoderModel(MT5PreTrainedModel):

model_type = "mt5"
config_class = MT5Config
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_save = [
r"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight"]

# Copied from transformers.models.t5.modeling_t5.T5EncoderModel.__init__ with T5->MT5
@ -2029,14 +2004,7 @@ class MT5EncoderModel(MT5PreTrainedModel):
MT5_START_DOCSTRING,
)
class MT5ForQuestionAnswering(MT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

# Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5

@ -551,7 +551,6 @@ class MvpPreTrainedModel(PreTrainedModel):
config_class = MvpConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]

def _init_weights(self, module):
std = self.config.init_std
@ -1300,8 +1299,7 @@ class MvpDecoder(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpModel(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_keys_to_ignore_on_load_unexpected = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: MvpConfig):
@ -1438,7 +1436,6 @@ class MvpModel(MvpPreTrainedModel):
"The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING
)
class MvpForConditionalGeneration(MvpPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: MvpConfig):
@ -1611,8 +1608,6 @@ class MvpForConditionalGeneration(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpForSequenceClassification(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: MvpConfig, **kwargs):
@ -1740,8 +1735,6 @@ class MvpForSequenceClassification(MvpPreTrainedModel):
MVP_START_DOCSTRING,
)
class MvpForQuestionAnswering(MvpPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"]
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config):
@ -1873,7 +1866,6 @@ class MvpDecoderWrapper(MvpPreTrainedModel):


class MvpForCausalLM(MvpPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -163,7 +163,7 @@ class NezhaRelativePositionsEncoding(nn.Module):
my_shape = list(final_mat.size())
my_shape.append(depth)
positions_encoding = positions_encoding.view(my_shape)
self.register_buffer("positions_encoding", positions_encoding)
self.register_buffer("positions_encoding", positions_encoding, persistent=False)

def forward(self, length):
return self.positions_encoding[:length, :length, :]
@ -735,7 +735,6 @@ class NezhaPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_nezha
base_model_prefix = "nezha"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"positions_encoding"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -1037,7 +1036,6 @@ class NezhaModel(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForPreTraining(NezhaPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):
@ -1140,8 +1138,6 @@ class NezhaForPreTraining(NezhaPreTrainedModel):

@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
class NezhaForMaskedLM(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):
@ -1542,8 +1538,6 @@ class NezhaForMultipleChoice(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForTokenClassification(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1623,8 +1617,6 @@ class NezhaForTokenClassification(NezhaPreTrainedModel):
NEZHA_START_DOCSTRING,
)
class NezhaForQuestionAnswering(NezhaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -183,7 +183,7 @@ class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
# in forward put the weights on the correct dtype and device of the param
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

self.register_buffer("weights", emb_weights)
self.register_buffer("weights", emb_weights, persistent=False)

@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
@ -1500,14 +1500,6 @@ class NllbMoeDecoder(NllbMoePreTrainedModel):
NLLB_MOE_START_DOCSTRING,
)
class NllbMoeModel(NllbMoePreTrainedModel):
_keys_to_ignore_on_load_missing = [
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
"encoder.embed_positions.weights",
"encoder.embed_positions.bias",
"decoder.embed_positions.weights",
"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: NllbMoeConfig):
@ -1641,17 +1633,6 @@ class NllbMoeModel(NllbMoePreTrainedModel):
)
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"encoder.embed_positions.weights",
r"encoder.embed_positions.bias",
r"decoder.embed_positions.weights",
r"decoder.embed_positions.bias",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: NllbMoeConfig):

@ -64,7 +64,9 @@ class NystromformerEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
@ -458,7 +460,6 @@ class NystromformerPreTrainedModel(PreTrainedModel):
config_class = NystromformerConfig
base_model_prefix = "nystromformer"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -658,7 +659,6 @@ class NystromformerModel(NystromformerPreTrainedModel):

@add_start_docstrings("""Nyströmformer Model with a `language modeling` head on top.""", NYSTROMFORMER_START_DOCSTRING)
class NystromformerForMaskedLM(NystromformerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):

@ -368,7 +368,6 @@ class OpenLlamaPreTrainedModel(PreTrainedModel):
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OpenLlamaDecoderLayer"]
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

def _init_weights(self, module):
std = self.config.initializer_range
@ -825,8 +824,6 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->OPEN_LLAMA,Llama->OpenLlama
class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -141,7 +141,9 @@ class Attention(nn.Module):
if n_state % config.n_head != 0:
raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}")
self.register_buffer(
"bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
"bias",
torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions),
persistent=False,
)
self.n_head = config.n_head
self.split_size = n_state
@ -274,7 +276,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
config_class = OpenAIGPTConfig
load_tf_weights = load_tf_weights_in_openai_gpt
base_model_prefix = "transformer"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights."""
@ -407,7 +408,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])

self.register_buffer("position_ids", torch.arange(config.n_positions))
self.register_buffer("position_ids", torch.arange(config.n_positions), persistent=False)
# Initialize weights and apply final processing
self.post_init()

@ -529,7 +530,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):
@ -621,7 +621,6 @@ input sequence).
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -399,7 +399,6 @@ class OPTPreTrainedModel(PreTrainedModel):
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OPTDecoderLayer"]
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

def _init_weights(self, module):
std = self.config.init_std
@ -817,7 +816,6 @@ class OPTModel(OPTPreTrainedModel):


class OPTForCausalLM(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):
@ -1025,8 +1023,6 @@ class OPTForCausalLM(OPTPreTrainedModel):
OPT_START_DOCSTRING,
)
class OPTForSequenceClassification(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]

def __init__(self, config: OPTConfig):
super().__init__(config)
self.num_labels = config.num_labels
@ -1147,8 +1143,6 @@ class OPTForSequenceClassification(OPTPreTrainedModel):
OPT_START_DOCSTRING,
)
class OPTForQuestionAnswering(OPTPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]

def __init__(self, config: OPTConfig):
super().__init__(config)
self.model = OPTModel(config)

@ -304,7 +304,7 @@ class OwlViTVisionEmbeddings(nn.Module):
self.num_patches = (config.image_size // config.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)))
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
@ -325,7 +325,9 @@ class OwlViTTextEmbeddings(nn.Module):
self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)

def forward(
self,
@ -530,7 +532,6 @@ class OwlViTPreTrainedModel(PreTrainedModel):
config_class = OwlViTConfig
base_model_prefix = "owlvit"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
_no_split_modules = ["OwlViTEncoderLayer"]

def _init_weights(self, module):

@ -1156,7 +1156,6 @@ class PegasusDecoder(PegasusPreTrainedModel):
PEGASUS_START_DOCSTRING,
)
class PegasusModel(PegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: PegasusConfig):
@ -1309,15 +1308,7 @@ class PegasusModel(PegasusPreTrainedModel):
)
class PegasusForConditionalGeneration(PegasusPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions.weight",
"encoder.embed_tokens.weight",
"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: PegasusConfig):
@ -1518,7 +1509,6 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):


class PegasusForCausalLM(PegasusPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -1391,7 +1391,6 @@ class PegasusXDecoder(PegasusXPreTrainedModel):
PEGASUS_X_START_DOCSTRING,
)
class PegasusXModel(PegasusXPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: PegasusXConfig):
@ -1536,14 +1535,6 @@ class PegasusXModel(PegasusXPreTrainedModel):
@add_start_docstrings("The PEGASUS-X for conditional generation (e.g. summarization).", PEGASUS_X_START_DOCSTRING)
class PegasusXForConditionalGeneration(PegasusXPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
r"embed_positions.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: PegasusXConfig):

@ -1597,14 +1597,6 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel):
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel):
config_class = Pix2StructConfig
main_input_name = "flattened_patches"

_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.layer.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["decoder.lm_head.weight"]

def __init__(self, config: Pix2StructConfig):

@ -1132,7 +1132,6 @@ class PLBartDecoder(PLBartPreTrainedModel):
PLBART_START_DOCSTRING,
)
class PLBartModel(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: PLBartConfig):
@ -1251,14 +1250,7 @@ class PLBartModel(PLBartPreTrainedModel):
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"final_logits_bias",
r"encoder.version",
r"decoder.version",
r"lm_head.weight",
"decoder.embed_tokens.weight",
"encoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: PLBartConfig):
@ -1423,7 +1415,6 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel):
PLBART_START_DOCSTRING,
)
class PLBartForSequenceClassification(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

def __init__(self, config: PLBartConfig, **kwargs):
@ -1562,7 +1553,6 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel):

# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->PLBart, facebook/bart-base->uclanlp/plbart-base
class PLBartForCausalLM(PLBartPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -1744,7 +1744,6 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetModel(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"]
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]

def __init__(self, config: ProphetNetConfig):
@ -1874,11 +1873,6 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"decoder.word_embeddings.weight",
"encoder.word_embeddings.weight",
"lm_head.weight",
]
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]

def __init__(self, config: ProphetNetConfig):
@ -2091,7 +2085,6 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config: ProphetNetConfig):

@ -164,7 +164,9 @@ class QDQBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -738,7 +740,6 @@ class QDQBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_qdqbert
base_model_prefix = "bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -1012,8 +1013,6 @@ class QDQBertModel(QDQBertPreTrainedModel):
"""QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING
)
class QDQBertLMHeadModel(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]

def __init__(self, config):
@ -1166,8 +1165,6 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):

@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING)
class QDQBertForMaskedLM(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]

def __init__(self, config):
@ -1570,8 +1567,6 @@ class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
QDQBERT_START_DOCSTRING,
)
class QDQBertForTokenClassification(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@ -1650,8 +1645,6 @@ class QDQBertForTokenClassification(QDQBertPreTrainedModel):
QDQBERT_START_DOCSTRING,
)
class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -231,7 +231,6 @@ class RagPreTrainedModel(PreTrainedModel):
"""
config_class = RagConfig
base_model_prefix = "rag"
_keys_to_ignore_on_load_missing = [r"position_ids"]

@classmethod
def from_pretrained(cls, *args, **kwargs):

@ -178,7 +178,9 @@ class RealmEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
@ -968,7 +970,6 @@ class RealmPreTrainedModel(PreTrainedModel):
config_class = RealmConfig
load_tf_weights = load_tf_weights_in_realm
base_model_prefix = "realm"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -1147,7 +1148,6 @@ class RealmBertModel(RealmPreTrainedModel):
REALM_START_DOCSTRING,
)
class RealmEmbedder(RealmPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.bias"]

def __init__(self, config):
@ -1378,7 +1378,6 @@ class RealmScorer(RealmPreTrainedModel):
REALM_START_DOCSTRING,
)
class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder"]
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):
@ -1529,8 +1528,6 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):

@add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING)
class RealmReader(RealmPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler", "cls"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

@ -352,10 +352,10 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False)

# save mask value here. Need fp32 and fp16 mask values
self.register_buffer("self_mask_value_float16", torch.tensor(-1e3))
self.register_buffer("self_mask_value_float32", torch.tensor(-1e5))
self.register_buffer("mask_value_float16", torch.tensor(-1e4))
self.register_buffer("mask_value_float32", torch.tensor(-1e9))
self.register_buffer("self_mask_value_float16", torch.tensor(-1e3), persistent=False)
self.register_buffer("self_mask_value_float32", torch.tensor(-1e5), persistent=False)
self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False)
self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False)

def forward(
self,
@ -1049,8 +1049,8 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
self.dropout = config.local_attention_probs_dropout_prob

# save mask value here
self.register_buffer("mask_value_float16", torch.tensor(-1e4))
self.register_buffer("mask_value_float32", torch.tensor(-1e9))
self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False)
self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False)

def forward(
self,
@ -2185,7 +2185,6 @@ class ReformerModel(ReformerPreTrainedModel):

@add_start_docstrings("""Reformer Model with a `language modeling` head on top.""", REFORMER_START_DOCSTRING)
class ReformerModelWithLMHead(ReformerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.decoder.bias"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

def __init__(self, config):

@ -158,7 +158,9 @@ class RemBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)

def forward(
self,
@ -654,7 +656,6 @@ class RemBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_rembert
base_model_prefix = "rembert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]

def _init_weights(self, module):
"""Initialize the weights"""
@ -1016,7 +1017,6 @@ class RemBertForMaskedLM(RemBertPreTrainedModel):
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
)
class RemBertForCausalLM(RemBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.weight"]

def __init__(self, config):