Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00
Generation: deprecate PreTrainedModel inheriting from GenerationMixin (#33203)

commit e15687fffe (parent 1456120929)
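The change is applied mechanically across the library: every model class that is meant to call .generate() now lists GenerationMixin explicitly in its bases instead of receiving it implicitly through PreTrainedModel. A minimal sketch of the pattern the diff repeats for each model; the MyPreTrainedModel / MyModelForCausalLM names are placeholders, not classes added by this commit:

from transformers import GenerationMixin, PreTrainedModel


class MyPreTrainedModel(PreTrainedModel):
    ...


# GenerationMixin goes after the PreTrainedModel-derived base, as the deprecation
# warning added in modeling_utils.py below instructs.
class MyModelForCausalLM(MyPreTrainedModel, GenerationMixin):
    ...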
@@ -34,13 +34,6 @@ from ..cache_utils import (
 )
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
-from ..models.auto import (
-    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
-    MODEL_FOR_CAUSAL_LM_MAPPING,
-    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
-    MODEL_FOR_VISION_2_SEQ_MAPPING,
-)
 from ..pytorch_utils import isin_mps_friendly
 from ..tokenization_utils import ExtensionsTrie
 from ..utils import (
@@ -1117,26 +1110,21 @@ class GenerationMixin:
         Confirms that the model class is compatible with generation. If not, raises an exception that points to the
         right class to use.
         """
+        # TODO(joao): remove this function in v4.50, i.e. when we remove the inheritance of `GenerationMixin` from
+        # `PreTrainedModel`. With that inheritance removed, all model classes inheriting from `GenerationMixin` can
+        # safely call `GenerationMixin.generate`
         if not is_torchdynamo_compiling() and not self.can_generate():
-            generate_compatible_mappings = [
-                MODEL_FOR_CAUSAL_LM_MAPPING,
-                MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
-                MODEL_FOR_VISION_2_SEQ_MAPPING,
-                MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-                MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
+            terminations_with_generation_support = [
+                "ForCausalLM",
+                "ForConditionalGeneration",
+                "ForSpeechSeq2Seq",
+                "ForVision2Seq",
             ]
-            generate_compatible_classes = set()
-            for model_mapping in generate_compatible_mappings:
-                supported_models = model_mapping.get(type(self.config), default=None)
-                if supported_models is not None:
-                    generate_compatible_classes.add(supported_models.__name__)
-            exception_message = (
+            raise TypeError(
                 f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
-                "it doesn't have a language model head."
+                "it doesn't have a language model head. Classes that support generation often end in one of these "
+                f"names: {terminations_with_generation_support}."
             )
-            if generate_compatible_classes:
-                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
-            raise TypeError(exception_message)

     def _validate_assistant(self, assistant_model):
         if assistant_model is None:
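With the rewrite above, the failure path for a head-less model no longer consults the MODEL_FOR_*_MAPPING tables (which is why those imports could be dropped); it simply raises a TypeError hinting at the class-name suffixes that usually support generation. A rough sketch of the user-visible behaviour, assuming a bare backbone class such as GPT2Model still exposes generate() during the deprecation window; the checkpoint name is only illustrative:

from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")  # backbone only, no language modeling head
try:
    model.generate(max_new_tokens=5)
except TypeError as err:
    # Expected to read roughly: "The current model class (GPT2Model) is not compatible
    # with `.generate()`, as it doesn't have a language model head. Classes that support
    # generation often end in one of these names: [...]"
    print(err)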
@@ -212,7 +212,7 @@ def no_init_weights(_enable=True):
         setattr(torch.nn.init, name, init_func)


-def get_parameter_device(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
+def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
     try:
         return next(parameter.parameters()).device
     except StopIteration:
@@ -227,7 +227,7 @@ def get_parameter_device(parameter: Union[nn.Module, GenerationMixin, "ModuleUti
         return first_tuple[1].device


-def get_first_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
+def get_first_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
     """
     Returns the first parameter dtype (can be non-floating) or asserts if none were found.
     """
@@ -245,7 +245,7 @@ def get_first_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "Modu
         return first_tuple[1].dtype


-def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
+def get_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
     """
     Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
     """
@@ -1309,6 +1309,7 @@ class ModuleUtilsMixin:
         return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)


+# TODO (joao): remove `GenerationMixin` inheritance in v4.50
 class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin, PeftAdapterMixin):
     r"""
     Base class for all models.
@@ -1638,11 +1639,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         Returns:
             `bool`: Whether this model can generate sequences with `.generate()`.
         """
-        # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
-        # Alternativelly, the model can also have a custom `generate` function.
-        if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
-            return False
-        return True
+        # Directly inherits `GenerationMixin` -> can generate
+        if "GenerationMixin" in str(cls.__bases__):
+            return True
+        # Model class overwrites `generate` (e.g. time series models) -> can generate
+        if str(cls.__name__) in str(cls.generate):
+            return True
+        # BC: Detects whether `prepare_inputs_for_generation` has been overwritten in the model. Prior to v4.45, this
+        # was how we detected whether a model could generate.
+        if "GenerationMixin" not in str(cls.prepare_inputs_for_generation):
+            logger.warning_once(
+                f"{cls.__name__} has generative capabilities, as `prepare_inputs_for_generation` is explicitly "
+                "overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, "
+                "`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability "
+                "to call `generate` and other related functions."
+                "\n - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the "
+                "model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes"
+                "\n - If you are the owner of the model architecture code, please modify your model class such that "
+                "it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception)."
+                "\n - If you are not the owner of the model architecture class, please contact the model code owner "
+                "to update it."
+            )
+            return True
+        # Otherwise, can't generate
+        return False

     @classmethod
     def _check_and_enable_flash_attn_2(
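The new can_generate() walks a short decision ladder: direct GenerationMixin inheritance, then a model-defined generate(), then the pre-v4.45 prepare_inputs_for_generation heuristic (which now also triggers the deprecation warning), and finally False. A standalone, simplified re-implementation of that ladder on stub classes, for illustration only; the stub names are not library classes:

class GenerationMixinStub:
    def generate(self):
        ...

    def prepare_inputs_for_generation(self):
        ...


def can_generate(cls) -> bool:
    # 1. The class lists the mixin directly among its bases -> can generate
    if "GenerationMixinStub" in str(cls.__bases__):
        return True
    # 2. The class defines its own `generate` (e.g. time series models) -> can generate
    if cls.__name__ in str(cls.generate):
        return True
    # 3. BC path: it overrides `prepare_inputs_for_generation` without inheriting the mixin
    #    directly -> still treated as generate-capable (the real method also warns here)
    if "GenerationMixinStub" not in str(cls.prepare_inputs_for_generation):
        return True
    # 4. Otherwise -> cannot generate
    return False


class PreTrainedStub(GenerationMixinStub):  # stands in for today's PreTrainedModel
    pass


class BackboneModel(PreTrainedStub):  # no head, nothing overridden
    pass


class CausalLMModel(PreTrainedStub, GenerationMixinStub):  # the pattern this commit rolls out
    pass


print(can_generate(BackboneModel))  # False
print(can_generate(CausalLMModel))  # True (rule 1)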
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -983,7 +984,7 @@ class AlbertSOPHead(nn.Module):
     "Albert Model with a `language modeling` head on top.",
     ALBERT_START_DOCSTRING,
 )
-class AlbertForMaskedLM(AlbertPreTrainedModel):
+class AlbertForMaskedLM(AlbertPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

     def __init__(self, config):
@@ -29,12 +29,17 @@ from ...utils import (
     extract_commit_hash,
     find_adapter_config_file,
     is_peft_available,
+    is_torch_available,
     logging,
     requires_backends,
 )
 from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings


+if is_torch_available():
+    from ...generation import GenerationMixin
+
+
 logger = logging.get_logger(__name__)


@@ -428,6 +433,7 @@ class _BaseAutoModelClass:
             model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
             cls.register(config.__class__, model_class, exist_ok=True)
             _ = kwargs.pop("code_revision", None)
+            model_class = add_generation_mixin_to_remote_model(model_class)
             return model_class._from_config(config, **kwargs)
         elif type(config) in cls._model_mapping.keys():
             model_class = _get_model_class(config, cls._model_mapping)
@@ -549,6 +555,7 @@ class _BaseAutoModelClass:
             )
             _ = hub_kwargs.pop("code_revision", None)
             cls.register(config.__class__, model_class, exist_ok=True)
+            model_class = add_generation_mixin_to_remote_model(model_class)
             return model_class.from_pretrained(
                 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
             )
@@ -698,6 +705,34 @@ def getattribute_from_module(module, attr):
     raise ValueError(f"Could not find {attr} in {transformers_module}!")


+def add_generation_mixin_to_remote_model(model_class):
+    """
+    Adds `GenerationMixin` to the inheritance of `model_class`, if `model_class` is a PyTorch model.
+
+    This function is used for backwards compatibility purposes: in v4.45, we've started a deprecation cycle to make
+    `PreTrainedModel` stop inheriting from `GenerationMixin`. Without this function, older models dynamically loaded
+    from the Hub may not have the `generate` method after we remove the inheritance.
+    """
+    # 1. If it is not a PT model (i.e. doesn't inherit Module), do nothing
+    if "torch.nn.modules.module.Module" not in str(model_class.__mro__):
+        return model_class
+
+    # 2. If it already **directly** inherits from GenerationMixin, do nothing
+    if "GenerationMixin" in str(model_class.__bases__):
+        return model_class
+
+    # 3. Prior to v4.45, we could detect whether a model was `generate`-compatible if it had its own `generate` and/or
+    # `prepare_inputs_for_generation` method.
+    has_custom_generate = "GenerationMixin" not in str(getattr(model_class, "generate"))
+    has_custom_prepare_inputs = "GenerationMixin" not in str(getattr(model_class, "prepare_inputs_for_generation"))
+    if has_custom_generate or has_custom_prepare_inputs:
+        model_class_with_generation_mixin = type(
+            model_class.__name__, (model_class, GenerationMixin), {**model_class.__dict__}
+        )
+        return model_class_with_generation_mixin
+    return model_class
+
+
 class _LazyAutoMapping(OrderedDict):
     """
     " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed.
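The remote-code shim above relies on building a new class at runtime with type(), appending GenerationMixin to the bases while copying the original class dictionary, so the patched class keeps its name and attributes. A self-contained toy version of that construction; the Demo/Legacy names are illustrative, not part of the auto factory:

class GenerationMixinDemo:
    def generate(self):
        return "generated"


class LegacyRemoteModel:
    # Stands in for a trust_remote_code=True model written before v4.45: it defined
    # its own prepare_inputs_for_generation but never inherited the mixin.
    def prepare_inputs_for_generation(self, **kwargs):
        return kwargs


# Same construction as add_generation_mixin_to_remote_model: a subclass that keeps the
# original name and class dict but also inherits the mixin.
PatchedRemoteModel = type(
    LegacyRemoteModel.__name__, (LegacyRemoteModel, GenerationMixinDemo), {**LegacyRemoteModel.__dict__}
)

model = PatchedRemoteModel()
print(type(model).__name__)  # LegacyRemoteModel
print(model.generate())      # generated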
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

+from ...generation import GenerationMixin
 from ...generation.logits_process import (
     AlternatingCodebooksLogitsProcessor,
     BarkEosPrioritizerLogitsProcessor,
@@ -546,7 +547,7 @@ BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""


 # GPT2-like autoregressive model
-class BarkCausalModel(BarkPreTrainedModel):
+class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
     config_class = BarkSubModelConfig

     def __init__(self, config):
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     _prepare_4d_attention_mask_for_sdpa,
@@ -1557,7 +1558,7 @@ class BartModel(BartPreTrainedModel):
 @add_start_docstrings(
     "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
 )
-class BartForConditionalGeneration(BartPreTrainedModel):
+class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
@@ -2010,7 +2011,7 @@ class BartDecoderWrapper(BartPreTrainedModel):
     """,
     BART_START_DOCSTRING,
 )
-class BartForCausalLM(BartPreTrainedModel):
+class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -28,6 +28,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     _prepare_4d_attention_mask_for_sdpa,
     _prepare_4d_causal_attention_mask_for_sdpa,
@@ -1280,7 +1281,7 @@ class BertForPreTraining(BertPreTrainedModel):
 @add_start_docstrings(
     """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
 )
-class BertLMHeadModel(BertPreTrainedModel):
+class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
@@ -23,6 +23,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
@@ -863,7 +864,7 @@ class BertGenerationOnlyLMHead(nn.Module):
     """BertGeneration Model with a `language modeling` head on top for CLM fine-tuning.""",
     BERT_GENERATION_START_DOCSTRING,
 )
-class BertGenerationDecoder(BertGenerationPreTrainedModel):
+class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -26,6 +26,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -2495,7 +2496,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
 @add_start_docstrings(
     """BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING
 )
-class BigBirdForCausalLM(BigBirdPreTrainedModel):
+class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -2436,7 +2437,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
     BIGBIRD_PEGASUS_START_DOCSTRING,
 )
 # Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS
-class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
+class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
@@ -2882,7 +2883,7 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
         return self.decoder(*args, **kwargs)


-class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
+class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -23,6 +23,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -719,7 +720,7 @@ class BioGptModel(BioGptPreTrainedModel):
 @add_start_docstrings(
     """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
 )
-class BioGptForCausalLM(BioGptPreTrainedModel):
+class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["output_projection.weight"]

     def __init__(self, config):
@@ -26,6 +26,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1196,7 +1197,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
 @add_start_docstrings(
     "The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING
 )
-class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
+class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
     _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
@@ -1397,7 +1398,7 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):


 # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill
-class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
+class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1163,7 +1164,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
     "The BlenderbotSmall Model with a language modeling head. Can be used for summarization.",
     BLENDERBOT_SMALL_START_DOCSTRING,
 )
-class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
+class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
     _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
@@ -1349,7 +1350,7 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):


 # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M
-class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
+class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn.functional import normalize

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import PreTrainedModel
 from ...utils import (
@@ -1035,7 +1036,7 @@ class BlipModel(BlipPreTrainedModel):
     """,
     BLIP_START_DOCSTRING,
 )
-class BlipForConditionalGeneration(BlipPreTrainedModel):
+class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin):
     config_class = BlipConfig
     _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
     main_input_name = "pixel_values"
@@ -23,6 +23,7 @@ from torch import Tensor, device, nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -808,7 +809,7 @@ class BlipTextModel(BlipTextPreTrainedModel):


 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
-class BlipTextLMHeadModel(BlipTextPreTrainedModel):
+class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
     def __init__(self, config):
         super().__init__(config)

@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -2006,7 +2007,7 @@ class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
     """,
     BLIP_2_START_DOCSTRING,
 )
-class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
+class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
     config_class = Blip2Config
     main_input_name = "pixel_values"

@@ -26,6 +26,7 @@ from torch.nn import functional as F

 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -860,7 +861,7 @@ class BloomModel(BloomPreTrainedModel):
     """,
     BLOOM_START_DOCSTRING,
 )
-class BloomForCausalLM(BloomPreTrainedModel):
+class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: BloomConfig):
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN, gelu
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     _prepare_4d_attention_mask_for_sdpa,
     _prepare_4d_causal_attention_mask_for_sdpa,
@@ -1544,7 +1545,7 @@ class CamembertForQuestionAnswering(CamembertPreTrainedModel):
     """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base
-class CamembertForCausalLM(CamembertPreTrainedModel):
+class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -26,6 +26,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
@@ -1496,7 +1497,7 @@ class ChameleonModel(ChameleonPreTrainedModel):
     "Chameleon Model with a head on top used for outputting logits for next token prediction.",
     CHAMELEON_START_DOCSTRING,
 )
-class ChameleonForConditionalGeneration(ChameleonPreTrainedModel):
+class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -26,7 +26,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
-from ...generation import GenerationConfig
+from ...generation import GenerationConfig, GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1278,7 +1278,7 @@ class ClvpModel(ClvpPreTrainedModel):
     "The CLVP decoder model with a language modelling head on top.",
     CLVP_START_DOCSTRING,
 )
-class ClvpForCausalLM(ClvpPreTrainedModel):
+class ClvpForCausalLM(ClvpPreTrainedModel, GenerationMixin):
     def __init__(self, config):
         super().__init__(config)

@@ -1509,7 +1509,7 @@ class ClvpForCausalLM(ClvpPreTrainedModel):
     "together to filter out the best speech_ids.",
     CLVP_START_DOCSTRING,
 )
-class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
+class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin):
     config_class = ClvpConfig

     def __init__(self, config: ClvpConfig):
@@ -23,6 +23,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
@@ -702,7 +703,7 @@ class CodeGenModel(CodeGenPreTrainedModel):
     """,
     CODEGEN_START_DOCSTRING,
 )
-class CodeGenForCausalLM(CodeGenPreTrainedModel):
+class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -32,6 +32,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1068,7 +1069,7 @@ class CohereModel(CoherePreTrainedModel):


 # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
-class CohereForCausalLM(CoherePreTrainedModel):
+class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     # Ignore copy
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
@@ -736,7 +737,7 @@ class CpmAntModel(CpmAntPreTrainedModel):
     """,
     CPMANT_START_DOCSTRING,
 )
-class CpmAntForCausalLM(CpmAntPreTrainedModel):
+class CpmAntForCausalLM(CpmAntPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: CpmAntConfig):
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
@@ -503,7 +504,7 @@ class CTRLModel(CTRLPreTrainedModel):
     """,
     CTRL_START_DOCSTRING,
 )
-class CTRLLMHeadModel(CTRLPreTrainedModel):
+class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -23,6 +23,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN, gelu
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -866,7 +867,7 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
 @add_start_docstrings(
     """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
 )
-class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
+class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -23,6 +23,7 @@ from torch import nn

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_utils import PreTrainedModel
@@ -1227,7 +1228,7 @@ class DbrxModel(DbrxPreTrainedModel):


 @add_start_docstrings("The DBRX Model transformer for causal language modeling.", DBRX_START_DOCSTRING)
-class DbrxForCausalLM(DbrxPreTrainedModel):
+class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
     def __init__(self, config: DbrxConfig):
         super().__init__(config)
         self.transformer = DbrxModel(config)
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN, get_activation
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithCrossAttentions,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1524,7 +1525,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
 @add_start_docstrings(
     """ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING
 )
-class ElectraForCausalLM(ElectraPreTrainedModel):
+class ElectraForCausalLM(ElectraPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["generator_lm_head.weight"]

     def __init__(self, config):
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -1081,7 +1082,7 @@ class ErnieForPreTraining(ErniePreTrainedModel):
 @add_start_docstrings(
     """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING
 )
-class ErnieForCausalLM(ErniePreTrainedModel):
+class ErnieForCausalLM(ErniePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie
@@ -25,6 +25,7 @@ from torch.nn import functional as F

 from ...activations import get_activation
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     AttentionMaskConverter,
 )
@@ -1239,7 +1240,7 @@ class FalconModel(FalconPreTrainedModel):
     "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
     FALCON_START_DOCSTRING,
 )
-class FalconForCausalLM(FalconPreTrainedModel):
+class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: FalconConfig):
@@ -25,6 +25,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import MambaCache
+from ...generation import GenerationMixin
 from ...modeling_utils import PreTrainedModel
 from ...utils import (
     ModelOutput,
@@ -717,7 +718,7 @@ class FalconMambaModel(FalconMambaPreTrainedModel):
     FALCONMAMBA_START_DOCSTRING,
 )
 # Copied from transformers.models.mamba.modeling_mamba.MambaForCausalLM with MAMBA->FALCONMAMBA,Mamba->FalconMamba,mamba->falcon_mamba,FalconMambaCache->MambaCache
-class FalconMambaForCausalLM(FalconMambaPreTrainedModel):
+class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import gelu
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -644,7 +645,7 @@ class FlaubertModel(FlaubertPreTrainedModel):
     FLAUBERT_START_DOCSTRING,
 )
 # Copied transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
-class FlaubertWithLMHeadModel(FlaubertPreTrainedModel):
+class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["pred_layer.proj.weight"]

     def __init__(self, config):
@@ -35,6 +35,7 @@ from torch import Tensor, nn
 from torch.nn import CrossEntropyLoss, LayerNorm

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...integrations.deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1173,7 +1174,7 @@ class FSMTModel(PretrainedFSMTModel):
 @add_start_docstrings(
     "The FSMT Model with a language modeling head. Can be used for summarization.", FSMT_START_DOCSTRING
 )
-class FSMTForConditionalGeneration(PretrainedFSMTModel):
+class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
     base_model_prefix = "model"
     _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

@@ -20,6 +20,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn

+from ...generation import GenerationMixin
 from ...modeling_outputs import CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
 from ...models.auto.modeling_auto import AutoModelForCausalLM
@@ -145,7 +146,7 @@ FUYU_INPUTS_DOCSTRING = r"""
     "Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.",
     FUYU_START_DOCSTRING,
 )
-class FuyuForCausalLM(FuyuPreTrainedModel):
+class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
     def __init__(self, config: FuyuConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -34,6 +34,7 @@ from transformers.models.llama.modeling_llama import (

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import CausalLMOutputWithPast
 from ...pytorch_utils import ALL_LAYERNORM_LAYERS
@@ -527,7 +528,7 @@ class GemmaModel(LlamaModel):


 # Example where we ony modify the docstring and call super
-class GemmaForCausalLM(LlamaForCausalLM):
+class GemmaForCausalLM(LlamaForCausalLM, GenerationMixin):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -29,6 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
@@ -988,7 +989,7 @@ class GemmaModel(GemmaPreTrainedModel):
         return causal_mask


-class GemmaForCausalLM(GemmaPreTrainedModel):
+class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -33,6 +33,7 @@ from transformers.models.gemma.modeling_gemma import (
 )

 from ...cache_utils import Cache
+from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging

@@ -473,7 +474,7 @@ class Gemma2Model(GemmaModel):
         return causal_mask


-class Gemma2ForCausalLM(GemmaForCausalLM):
+class Gemma2ForCausalLM(GemmaForCausalLM, GenerationMixin):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -28,6 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, HybridCache
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -931,7 +932,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
         return causal_mask


-class Gemma2ForCausalLM(Gemma2PreTrainedModel):
+class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -27,6 +27,7 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...file_utils import ModelOutput
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1324,7 +1325,7 @@ class GitModel(GitPreTrainedModel):
 @add_start_docstrings(
     """GIT Model with a `language modeling` head on top for autoregressive language modeling.""", GIT_START_DOCSTRING
 )
-class GitForCausalLM(GitPreTrainedModel):
+class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["output.weight"]

     def __init__(self, config):
@@ -28,6 +28,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1182,7 +1183,7 @@ class GPT2Model(GPT2PreTrainedModel):
     """,
     GPT2_START_DOCSTRING,
 )
-class GPT2LMHeadModel(GPT2PreTrainedModel):
+class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -1384,7 +1385,7 @@ input sequence).
     """,
     GPT2_START_DOCSTRING,
 )
-class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
+class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -22,6 +22,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1040,7 +1041,7 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
     """,
     GPT_BIGCODE_START_DOCSTRING,
 )
-class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
+class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -24,6 +24,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -917,7 +918,7 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
     """,
     GPT_NEO_START_DOCSTRING,
 )
-class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
+class GPTNeoForCausalLM(GPTNeoPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -30,6 +30,7 @@ from ...file_utils import (
     add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1110,7 +1111,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
 @add_start_docstrings(
     """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
 )
-class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
+class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["embed_out.weight"]

     def __init__(self, config):
@@ -25,6 +25,7 @@ from torch.nn import CrossEntropyLoss
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
@@ -815,7 +816,7 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
     """GPTNeoXJapanese Model with a `language modeling` head on top for Classifier Model fine-tuning.""",
     GPT_NEOX_JAPANESE_START_DOCSTRING,
 )
-class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
+class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["embed_out.weight"]

     def __init__(self, config):
@@ -25,6 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1011,7 +1012,7 @@ class GPTJModel(GPTJPreTrainedModel):
     """,
     GPTJ_START_DOCSTRING,
 )
-class GPTJForCausalLM(GPTJPreTrainedModel):
+class GPTJForCausalLM(GPTJPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -22,6 +22,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
@@ -1004,7 +1005,7 @@ class GraniteModel(GranitePreTrainedModel):
         return causal_mask


-class GraniteForCausalLM(GranitePreTrainedModel):
+class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Granite
@@ -23,6 +23,7 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
@@ -1234,7 +1235,7 @@ class GraniteMoeModel(GraniteMoePreTrainedModel):
         return causal_mask


-class GraniteMoeForCausalLM(GraniteMoePreTrainedModel):
+class GraniteMoeForCausalLM(GraniteMoePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: GraniteMoeConfig):
@@ -23,11 +23,12 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss

-from ... import PreTrainedModel
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_outputs import BaseModelOutput, ModelOutput
+from ...modeling_utils import PreTrainedModel
 from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -1450,7 +1451,7 @@ class Idefics2Model(Idefics2PreTrainedModel):
     """The Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. """,
     IDEFICS2_START_DOCSTRING,
 )
-class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel):
+class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -26,6 +26,7 @@ from torch.cuda.amp import autocast
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -880,7 +881,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
     """,
     IMAGEGPT_START_DOCSTRING,
 )
-class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
+class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: ImageGPTConfig):
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1283,7 +1284,7 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
     """,
     INSTRUCTBLIP_START_DOCSTRING,
 )
-class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
+class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
     config_class = InstructBlipConfig
     main_input_name = "pixel_values"

@@ -45,6 +45,7 @@ from transformers.models.instructblip.modeling_instructblip import (
     InstructBlipVisionModel,
 )

+from ...generation import GenerationMixin
 from ...utils import logging


@@ -128,7 +129,7 @@ class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
     pass


-class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
+class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration, GenerationMixin):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
@@ -30,6 +30,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1292,7 +1293,7 @@ class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
     """,
     INSTRUCTBLIPVIDEO_START_DOCSTRING,
 )
-class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel):
+class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
     config_class = InstructBlipVideoConfig
     main_input_name = "pixel_values"

@@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache  # we need __iter__ and __len__ of pkv
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     AttentionMaskConverter,
 )
@@ -1424,7 +1425,7 @@ class JambaModel(JambaPreTrainedModel):


 # Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba
-class JambaForCausalLM(JambaPreTrainedModel):
+class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: JambaConfig):
@@ -25,6 +25,7 @@ from torch.nn import functional as F

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     MoeCausalLMOutputWithPast,
@@ -1202,7 +1203,7 @@ class JetMoeModel(JetMoePreTrainedModel):
         return causal_mask


-class JetMoeForCausalLM(JetMoePreTrainedModel):
+class JetMoeForCausalLM(JetMoePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@@ -24,6 +24,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -1521,7 +1522,7 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
     """,
     KOSMOS2_START_DOCSTRING,
 )
-class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel):
+class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
     config_class = Kosmos2TextConfig
     _tied_weights_keys = ["lm_head.weight"]

@@ -1864,7 +1865,7 @@ class Kosmos2Model(Kosmos2PreTrainedModel):
     """,
     KOSMOS2_START_DOCSTRING,
 )
-class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel):
+class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
     config_class = Kosmos2Config
     main_input_name = "pixel_values"
     _tied_weights_keys = ["text_model.lm_head.weight"]
@@ -25,6 +25,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -2298,7 +2299,7 @@ class LEDModel(LEDPreTrainedModel):
 @add_start_docstrings(
     "The LED Model with a language modeling head. Can be used for summarization.", LED_START_DOCSTRING
 )
-class LEDForConditionalGeneration(LEDPreTrainedModel):
+class LEDForConditionalGeneration(LEDPreTrainedModel, GenerationMixin):
     base_model_prefix = "led"
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
     _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
@@ -28,6 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import _flash_attention_forward
 from ...modeling_outputs import (
@@ -1101,7 +1102,7 @@ class LlamaModel(LlamaPreTrainedModel):
         return causal_mask


-class LlamaForCausalLM(LlamaPreTrainedModel):
+class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
@ -21,9 +21,10 @@ import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@ -237,7 +238,7 @@ LLAVA_INPUTS_DOCSTRING = r"""
"""The LLAVA model which consists of a vision backbone and a language model.""",
LLAVA_START_DOCSTRING,
)
class LlavaForConditionalGeneration(LlavaPreTrainedModel):
class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
def __init__(self, config: LlavaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
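The `*ForConditionalGeneration` classes touched in this diff keep their public API, so downstream multimodal generation calls do not change when the extra base class is added. A sketch against the Llava class above; the checkpoint id, prompt format, and image URL are illustrative placeholders, not taken from the diff.

# Sketch of an unchanged multimodal generate() call (illustrative checkpoint and URL).
import requests
from PIL import Image

from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint; any Llava checkpoint works
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id)

image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)
prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
inputs = processor(images=image, text=prompt, return_tensors="pt")

output_ids = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output_ids[0], skip_special_tokens=True))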
@ -23,10 +23,11 @@ import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@ -349,7 +350,7 @@ LLAVA_NEXT_INPUTS_DOCSTRING = r"""
"""The LLAVA-NeXT model which consists of a vision backbone and a language model.""",
LLAVA_NEXT_START_DOCSTRING,
)
class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
def __init__(self, config: LlavaNextConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)

@ -29,6 +29,7 @@ from transformers.models.llava_next.modeling_llava_next import (
image_size_to_num_patches,
)

from ...generation import GenerationMixin
from ...utils import (
logging,
replace_return_docstrings,
@ -218,7 +219,7 @@ class LlavaNextVideoMultiModalProjector(LlavaNextMultiModalProjector):
pass


class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration, GenerationMixin):
def __init__(self, config: LlavaNextVideoConfig, **super_kwargs):
super().__init__(config, **super_kwargs)
self.vision_resampler = LlavaNextVideoPooler(config)

@ -29,10 +29,11 @@ import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@ -387,7 +388,7 @@ LLAVA_NEXT_VIDEO_INPUTS_DOCSTRING = r"""
"""The LLAVA-NeXT model which consists of a vision backbone and a language model.""",
LLAVA_NEXT_VIDEO_START_DOCSTRING,
)
class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
def __init__(
self,
config: LlavaNextVideoConfig,

@ -23,10 +23,11 @@ import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
logging,
@ -358,7 +359,7 @@ LLAVA_ONEVISION_INPUTS_DOCSTRING = r"""
"""The LLaVA-Onevision model which consists of a vision backbone and a language model.""",
LLAVA_ONEVISION_START_DOCSTRING,
)
class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel):
class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
def __init__(self, config: LlavaOnevisionConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
@ -1900,7 +1901,7 @@ class LongT5Model(LongT5PreTrainedModel):


@add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel):
class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]

@ -22,6 +22,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
@ -1342,7 +1343,7 @@ class M2M100Model(M2M100PreTrainedModel):
@add_start_docstrings(
"The M2M100 Model with a language modeling head. Can be used for summarization.", M2M_100_START_DOCSTRING
)
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
class M2M100ForConditionalGeneration(M2M100PreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

@ -25,6 +25,7 @@ from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import MambaCache
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
@ -657,7 +658,7 @@ class MambaModel(MambaPreTrainedModel):
""",
MAMBA_START_DOCSTRING,
)
class MambaForCausalLM(MambaPreTrainedModel):
class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
@ -932,7 +933,7 @@ class Mamba2Model(Mamba2PreTrainedModel):
""",
MAMBA2_START_DOCSTRING,
)
class Mamba2ForCausalLM(Mamba2PreTrainedModel):
class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
_tied_weights_keys = []

def __init__(self, config):

@ -25,6 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
@ -1224,7 +1225,7 @@ class MarianModel(MarianPreTrainedModel):
@add_start_docstrings(
"The Marian Model with a language modeling head. Can be used for summarization.", MARIAN_START_DOCSTRING
)
class MarianMTModel(MarianPreTrainedModel):
class MarianMTModel(MarianPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
"final_logits_bias",
@ -1504,7 +1505,7 @@ class MarianDecoderWrapper(MarianPreTrainedModel):


# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Marian, facebook/bart-base->Helsinki-NLP/opus-mt-fr-en
class MarianForCausalLM(MarianPreTrainedModel):
class MarianForCausalLM(MarianPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
@ -1526,7 +1527,7 @@ class MBartModel(MBartPreTrainedModel):
"The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
MBART_START_DOCSTRING,
)
class MBartForConditionalGeneration(MBartPreTrainedModel):
class MBartForConditionalGeneration(MBartPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
@ -1967,7 +1968,7 @@ class MBartDecoderWrapper(MBartPreTrainedModel):


# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
class MBartForCausalLM(MBartPreTrainedModel):
class MBartForCausalLM(MBartPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -27,6 +27,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
@ -1110,7 +1111,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
"""MegatronBert Model with a `language modeling` head on top for CLM fine-tuning.""",
MEGATRON_BERT_START_DOCSTRING,
)
class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
class MegatronBertForCausalLM(MegatronBertPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["cls.predictions.decoder"]

def __init__(self, config):

@ -29,6 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -950,7 +951,7 @@ class MistralModel(MistralPreTrainedModel):
return causal_mask


class MistralForCausalLM(MistralPreTrainedModel):
class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
MoeCausalLMOutputWithPast,
@ -1186,7 +1187,7 @@ class MixtralModel(MixtralPreTrainedModel):
return causal_mask


class MixtralForCausalLM(MixtralPreTrainedModel):
class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
@ -500,7 +501,7 @@ class MptModel(MptPreTrainedModel):
""",
MPT_START_DOCSTRING,
)
class MptForCausalLM(MptPreTrainedModel):
class MptForCausalLM(MptPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config: MptConfig):

@ -25,6 +25,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
@ -1550,7 +1551,7 @@ class MT5Model(MT5PreTrainedModel):


@add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
class MT5ForConditionalGeneration(MT5PreTrainedModel):
class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
r"""
Examples:

@ -26,9 +26,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation.configuration_utils import GenerationConfig, GenerationMode
from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList
from ...generation.stopping_criteria import StoppingCriteriaList
from ...generation import (
ClassifierFreeGuidanceLogitsProcessor,
GenerationConfig,
GenerationMixin,
GenerationMode,
LogitsProcessorList,
StoppingCriteriaList,
)
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
@ -1206,7 +1211,7 @@ class MusicgenModel(MusicgenPreTrainedModel):
"The MusicGen decoder model with a language modelling head on top.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForCausalLM(MusicgenPreTrainedModel):
class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)

@ -1658,7 +1663,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel):
"for music generation tasks with one or both of text and audio prompts.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForConditionalGeneration(PreTrainedModel):
class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin):
config_class = MusicgenConfig
base_model_prefix = "encoder_decoder"
main_input_name = "input_ids"
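Besides the class changes, the Musicgen hunks above fold the per-submodule generation imports into a single `from ...generation import (...)` statement. User code can mirror that consolidated style through the public `transformers.generation` namespace; a small sketch follows, with arbitrary example settings.

# Sketch of the consolidated import style from user code; the values are arbitrary examples.
from transformers.generation import (
    GenerationConfig,
    GenerationMode,
    LogitsProcessorList,
    StoppingCriteriaList,
)

generation_config = GenerationConfig(max_new_tokens=32, guidance_scale=3.0)
mode = generation_config.get_generation_mode()  # a GenerationMode enum member
print(mode)

# Processors and stopping criteria can still be assembled manually and passed to generate().
logits_processor = LogitsProcessorList()
stopping_criteria = StoppingCriteriaList()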
@ -26,9 +26,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation.configuration_utils import GenerationConfig, GenerationMode
from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList
from ...generation.stopping_criteria import StoppingCriteriaList
from ...generation import (
ClassifierFreeGuidanceLogitsProcessor,
GenerationConfig,
GenerationMixin,
GenerationMode,
LogitsProcessorList,
StoppingCriteriaList,
)
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -1117,7 +1122,7 @@ class MusicgenMelodyModel(MusicgenMelodyPreTrainedModel):
MUSICGEN_MELODY_START_DOCSTRING,
)
# Copied from transformers.models.musicgen.modeling_musicgen.MusicgenForCausalLM with MUSICGEN->MUSICGEN_MELODY,Musicgen->MusicgenMelody,MusicGen->Musicgen Melody
class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)

@ -1585,7 +1590,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
decoder (`Optional[MusicgenMelodyForCausalLM]`, *optional*): MusicGen Melody decoder used to generate audio codes.
""",
)
class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
config_class = MusicgenMelodyConfig
main_input_name = "input_ids"
supports_gradient_checkpointing = True

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
@ -1351,7 +1352,7 @@ class MvpModel(MvpPreTrainedModel):
@add_start_docstrings(
"The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING
)
class MvpForConditionalGeneration(MvpPreTrainedModel):
class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: MvpConfig):
@ -1791,7 +1792,7 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
return self.decoder(*args, **kwargs)


class MvpForCausalLM(MvpPreTrainedModel):
class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -26,6 +26,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import _flash_attention_forward
from ...modeling_outputs import (
@ -980,7 +981,7 @@ class NemotronModel(NemotronPreTrainedModel):


# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
class NemotronForCausalLM(NemotronPreTrainedModel):
class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -22,6 +22,7 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
@ -1604,7 +1605,7 @@ class NllbMoeModel(NllbMoePreTrainedModel):
@add_start_docstrings(
"The NllbMoe Model with a language modeling head. Can be used for summarization.", NLLB_MOE_START_DOCSTRING
)
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel):
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

@ -30,6 +30,7 @@ from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -1022,7 +1023,7 @@ class OlmoModel(OlmoPreTrainedModel):


# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO,Llama->Olmo
class OlmoForCausalLM(OlmoPreTrainedModel):
class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -22,6 +22,7 @@ from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
MoeCausalLMOutputWithPast,
@ -1173,7 +1174,7 @@ class OlmoeModel(OlmoePreTrainedModel):
return causal_mask


class OlmoeForCausalLM(OlmoePreTrainedModel):
class OlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -26,6 +26,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu_new, silu
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
@ -524,7 +525,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
""",
OPENAI_GPT_START_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -22,6 +22,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -882,7 +883,7 @@ class OPTModel(OPTPreTrainedModel):
)


class OPTForCausalLM(OPTPreTrainedModel):
class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -22,6 +22,7 @@ import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, StaticCache
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
@ -302,7 +303,7 @@ PALIGEMMA_INPUTS_DOCSTRING = r"""
"""The PALIGEMMA model which consists of a vision backbone and a language model.""",
PALIGEMMA_START_DOCSTRING,
)
class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
def __init__(self, config: PaliGemmaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config=config.vision_config)

@ -25,6 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
@ -1244,7 +1245,7 @@ class PegasusModel(PegasusPreTrainedModel):
@add_start_docstrings(
"The PEGASUS Model with a language modeling head. Can be used for summarization.", PEGASUS_START_DOCSTRING
)
class PegasusForConditionalGeneration(PegasusPreTrainedModel):
class PegasusForConditionalGeneration(PegasusPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
@ -1456,7 +1457,7 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):
return self.decoder(*args, **kwargs)


class PegasusForCausalLM(PegasusPreTrainedModel):
class PegasusForCausalLM(PegasusPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -25,6 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
@ -1464,7 +1465,7 @@ class PegasusXModel(PegasusXPreTrainedModel):


@add_start_docstrings("The PEGASUS-X for conditional generation (e.g. summarization).", PEGASUS_X_START_DOCSTRING)
class PegasusXForConditionalGeneration(PegasusXPreTrainedModel):
class PegasusXForConditionalGeneration(PegasusXPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

@ -29,6 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -847,7 +848,7 @@ class PersimmonModel(PersimmonPreTrainedModel):
return causal_mask


class PersimmonForCausalLM(PersimmonPreTrainedModel):
class PersimmonForCausalLM(PersimmonPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with LLAMA->PERSIMMON,Llama->Persimmon

@ -26,6 +26,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -1139,7 +1140,7 @@ class PhiModel(PhiPreTrainedModel):
return causal_mask


class PhiForCausalLM(PhiPreTrainedModel):
class PhiForCausalLM(PhiPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi,bias=False->bias=True

@ -26,6 +26,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -1160,7 +1161,7 @@ class Phi3Model(Phi3PreTrainedModel):
return causal_mask


class Phi3ForCausalLM(Phi3PreTrainedModel):
class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3

@ -22,6 +22,7 @@ import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
@ -1553,7 +1554,7 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel):
"A conditional generation model with a language modeling head. Can be used for sequence generation tasks.",
PIX2STRUCT_START_DOCSTRING,
)
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel):
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
config_class = Pix2StructConfig
main_input_name = "flattened_patches"
_tied_weights_keys = ["decoder.lm_head.weight"]

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
@ -1254,7 +1255,7 @@ class PLBartModel(PLBartPreTrainedModel):
"The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.",
PLBART_START_DOCSTRING,
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel):
class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
@ -1568,7 +1569,7 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel):


# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->PLBart, facebook/bart-base->uclanlp/plbart-base
class PLBartForCausalLM(PLBartPreTrainedModel):
class PLBartForCausalLM(PLBartPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -25,6 +25,7 @@ from torch.nn import CrossEntropyLoss
from transformers.generation import GenerationConfig

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
@ -1001,7 +1002,7 @@ Pop2Piano_START_DOCSTRING = r"""


@add_start_docstrings("""Pop2Piano Model with a `language modeling` head on top.""", Pop2Piano_START_DOCSTRING)
class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel):
class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config: Pop2PianoConfig):

@ -26,6 +26,7 @@ from torch import Tensor, nn
from torch.nn import LayerNorm

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
@ -1856,7 +1857,7 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
"The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]

def __init__(self, config: ProphetNetConfig):
@ -2073,7 +2074,7 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
" language modeling.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
"prophetnet.decoder.word_embeddings.weight",

@ -29,6 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
@ -1078,7 +1079,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
return causal_mask


class Qwen2ForCausalLM(Qwen2PreTrainedModel):
class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -22,10 +22,11 @@ import torch
import torch.utils.checkpoint
from torch import nn

from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
@ -855,7 +856,7 @@ QWEN2AUDIO_INPUTS_DOCSTRING = r"""
"""The QWEN2AUDIO model which consists of a audio backbone and a language model.""",
QWEN2AUDIO_START_DOCSTRING,
)
class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel):
class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMixin):
def __init__(self, config: Qwen2AudioConfig):
super().__init__(config)
self.audio_tower = AutoModel.from_config(config.audio_config, attn_implementation=config._attn_implementation)

@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
MoeCausalLMOutputWithPast,
@ -1253,7 +1254,7 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel):
return causal_mask


class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -31,6 +31,7 @@ from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...cache_utils import Cache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
AttentionMaskConverter,
)
@ -1416,7 +1417,7 @@ QWEN2_VL_INPUTS_DOCSTRING = r"""
"""


class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput
from ...modeling_utils import PreTrainedModel
@ -777,7 +778,7 @@ class RecurrentGemmaModel(RecurrentGemmaPreTrainedModel):


# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->RECURRENTGEMMA,Llama->RecurrentGemma,llama->gemma
class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel):
class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):

@ -29,6 +29,7 @@ from torch.autograd.function import Function
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
@ -2183,7 +2184,7 @@ class ReformerModel(ReformerPreTrainedModel):


@add_start_docstrings("""Reformer Model with a `language modeling` head on top.""", REFORMER_START_DOCSTRING)
class ReformerModelWithLMHead(ReformerPreTrainedModel):
class ReformerModelWithLMHead(ReformerPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
@ -1002,7 +1003,7 @@ class RemBertForMaskedLM(RemBertPreTrainedModel):
@add_start_docstrings(
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
)
class RemBertForCausalLM(RemBertPreTrainedModel):
class RemBertForCausalLM(RemBertPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["cls.predictions.decoder.weight"]

def __init__(self, config):

@ -25,6 +25,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask_for_sdpa,
_prepare_4d_causal_attention_mask_for_sdpa,
@ -1003,7 +1004,7 @@ class RobertaModel(RobertaPreTrainedModel):
@add_start_docstrings(
"""RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
)
class RobertaForCausalLM(RobertaPreTrainedModel):
class RobertaForCausalLM(RobertaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

def __init__(self, config):

@ -24,6 +24,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...generation import GenerationMixin
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
@ -855,7 +856,7 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

def __init__(self, config):
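Because the mixin only makes an already-existing inheritance explicit, downstream generation code for the models in this diff is unaffected. A typical call still looks like the sketch below; the checkpoint name is an illustrative placeholder.

# Unchanged downstream usage; the checkpoint name is an illustrative placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mistral-7B-v0.1"  # any causal LM touched in this diff
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("GenerationMixin is now an explicit base class", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))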
Some files were not shown because too many files have changed in this diff.