Arthur 2025-06-30 12:43:56 +02:00
parent 0dc082627c
commit fca73ad7ce
49 changed files with 13 additions and 146 deletions
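The hunks below repeatedly remove the local re-declaration `class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...` from individual modeling files. For context, a minimal sketch of what such a kwargs TypedDict looks like (illustrative names and keys only, not the actual transformers code):

from typing import TypedDict


class FlashAttentionKwargs(TypedDict, total=False):
    # illustrative stand-in keys; the real class lives in
    # transformers.modeling_flash_attention_utils
    max_length_q: int
    max_length_k: int


class TransformersKwargs(FlashAttentionKwargs, total=False):
    # a single shared set of forward() keyword arguments, inheriting the
    # flash-attention keys instead of re-declaring the merge per modeling file
    output_attentions: bool
    output_hidden_states: bool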

View File

@@ -805,9 +805,6 @@ class AriaTextModel(AriaTextPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -1329,9 +1329,6 @@ class AriaTextModel(LlamaModel):
self.post_init()
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -339,9 +339,6 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The AYA_VISION model which consists of a vision backbone and a language model.

View File

@@ -691,9 +691,6 @@ class BioGptModel(BioGptPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
BioGPT Model with a `language modeling` head on top for CLM fine-tuning.

View File

@@ -517,9 +517,6 @@ class BioGptModel(BioGptPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
BioGPT Model with a `language modeling` head on top for CLM fine-tuning.

View File

@@ -25,7 +25,6 @@ from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
BaseModelOutput,
@@ -1249,9 +1248,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
BLIP-2 Model for generating text and image features. The model consists of a vision encoder, Querying Transformer

View File

@@ -1177,9 +1177,6 @@ class ChameleonModel(ChameleonPreTrainedModel):
return causal_mask
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
Chameleon Model with a head on top used for outputting logits for next token prediction.

View File

@@ -463,9 +463,6 @@ class CohereModel(CoherePreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -292,9 +292,6 @@ class CohereModel(LlamaModel):
self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class CohereForCausalLM(LlamaForCausalLM):
def __init__(self, config):
super().__init__(config)

View File

@@ -488,9 +488,6 @@ class Cohere2Model(Cohere2PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -21,11 +21,10 @@ import torch.utils.checkpoint
from torch import nn
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig
@@ -56,9 +55,6 @@ class FuyuPreTrainedModel(PreTrainedModel):
module.weight.data[module.padding_idx].zero_()
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.

View File

@@ -279,9 +279,6 @@ class Glm4Attention(nn.Module):
return attn_output, attn_weights
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@use_kernel_forward_from_hub("RMSNorm")
class Glm4RMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):

View File

@@ -100,9 +100,6 @@ class Glm4Attention(GlmAttention):
pass
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class Glm4ForCausalLM(GlmForCausalLM):
def forward(
self,

View File

@@ -756,9 +756,6 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
return outputs
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@dataclass
@auto_docstring(
custom_intro="""

View File

@@ -911,9 +911,6 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
return outputs
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
pass

View File

@@ -691,9 +691,6 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The GOT_OCR2 model which consists of a vision backbone and a language model.

View File

@@ -519,9 +519,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.

View File

@@ -393,9 +393,6 @@ class GPTNeoXModel(LlamaModel, nn.Module):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.

View File

@@ -490,9 +490,6 @@ class GraniteModel(GranitePreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -227,9 +227,6 @@ class GraniteModel(LlamaModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class GraniteForCausalLM(LlamaForCausalLM):
def forward(
self,

View File

@@ -923,9 +923,6 @@ class IdeficsPreTrainedModel(PreTrainedModel):
module.latents.data.normal_()
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class IdeficsModel(IdeficsPreTrainedModel):
"""

View File

@@ -1106,9 +1106,6 @@ class Idefics2Model(Idefics2PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.

View File

@@ -821,9 +821,6 @@ class Idefics3Model(Idefics3PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The Idefics3 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.

View File

@@ -1182,9 +1182,6 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
InstructBLIP base Model consisting of language model, qformer and vision encoder.

View File

@@ -840,9 +840,6 @@ class InstructBlipVideoQFormerEmbeddings(nn.Module):
return embeddings
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class InstructBlipVideoPreTrainedModel(PreTrainedModel):
config_class = InstructBlipVideoConfig

View File

@@ -813,9 +813,6 @@ class InternVLCausalLMOutputWithPast(ModelOutput):
image_hidden_states: Optional[torch.FloatTensor] = None
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The INTERNVL model which consists of a vision backbone and a language model.

View File

@@ -1510,7 +1510,7 @@ class JambaForSequenceClassification(JambaPreTrainedModel):
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
**kwargs,
**kwargs: Unpack[TransformersKwargs],
) -> SequenceClassifierOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
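For context on the `**kwargs: Unpack[TransformersKwargs]` annotation above: PEP 692 lets `**kwargs` be typed against a TypedDict via `Unpack`, so a static checker can validate the keyword arguments that a head's forward passes through to the base model. A minimal, self-contained sketch with hypothetical names (not the transformers implementation):

from typing import TypedDict

from typing_extensions import Unpack  # typing.Unpack on Python 3.11+


class TransformersKwargs(TypedDict, total=False):
    # illustrative stand-in for the library-wide kwargs TypedDict
    output_attentions: bool
    output_hidden_states: bool


def base_model_forward(**kwargs: Unpack[TransformersKwargs]) -> dict:
    # the base model receives only known, typed keyword arguments
    return dict(kwargs)


def sequence_classification_forward(labels=None, **kwargs: Unpack[TransformersKwargs]) -> dict:
    # the head forwards kwargs untouched; mypy/pyright flag unknown keys at call sites
    return base_model_forward(**kwargs)


sequence_classification_forward(output_attentions=True)   # accepted
# sequence_classification_forward(bogus_flag=True)        # rejected by a type checker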

View File

@@ -1346,9 +1346,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input

View File

@@ -603,9 +603,6 @@ class Llama4TextModel(Llama4PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
_no_split_modules = ["Llama4TextDecoderLayer"]
base_model_prefix = "language_model"

View File

@@ -321,9 +321,6 @@ class LlavaModel(LlavaPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The LLAVA model which consists of a vision backbone and a language model.

View File

@@ -518,9 +518,6 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The LLAVA-NeXT model which consists of a vision backbone and a language model.

View File

@@ -648,9 +648,6 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
return video_features
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The LLAVA-NeXT model which consists of a vision backbone and a language model.

View File

@@ -689,9 +689,6 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
return image_features
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The LLAVA-NeXT model which consists of a vision backbone and a language model.

View File

@@ -792,9 +792,6 @@ class MiniMaxModel(MiniMaxPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,

View File

@@ -357,9 +357,6 @@ class Mistral3Model(Mistral3PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The MISTRAL3 model which consists of a vision backbone and a language model.

View File

@@ -575,9 +575,6 @@ class MixtralModel(MixtralPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,

View File

@@ -422,9 +422,6 @@ class MixtralModel(MistralModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class MixtralForCausalLM(MistralForCausalLM):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -1459,9 +1459,6 @@ class MllamaTextModel(MllamaPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The Mllama Text Model with a language modeling head on top.

View File

@@ -776,9 +776,6 @@ class OPTModel(OPTPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -377,9 +377,6 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,

View File

@@ -933,9 +933,6 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
base_model_prefix = ""

View File

@@ -909,9 +909,6 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class Qwen2VLModel(Qwen2VLPreTrainedModel):
base_model_prefix = ""

View File

@@ -481,9 +481,6 @@ class Qwen3Model(Qwen3PreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring
class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@@ -116,9 +116,6 @@ class Qwen3Model(Qwen2Model):
pass
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class Qwen3ForCausalLM(Qwen2ForCausalLM):
def forward(
self,

View File

@@ -582,9 +582,6 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
num_experts: Optional[int] = None,

View File

@@ -225,9 +225,6 @@ class Qwen3MoeModel(MixtralModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
class Qwen3MoeForCausalLM(MixtralForCausalLM):
def __init__(self, config):
super().__init__(config)

View File

@@ -799,9 +799,6 @@ class SmolVLMCausalLMOutputWithPast(ModelOutput):
image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The SmolVLM Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.

View File

@@ -393,9 +393,6 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
)
class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@auto_docstring(
custom_intro="""
The VideoLlava model which consists of a vision backbone and a language model.

View File

@@ -995,7 +995,17 @@ def check_model_inputs(func):
# TODO @Lysandre add the head we have today about GC and training
# and all of the rest that is general transformers checking
# THIS PART :
# if self.gradient_checkpointing and self.training and use_cache:
# logger.warning_once(
# "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
# )
# use_cache = False
#
# # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
# if not isinstance(past_key_values, (type(None), Cache)):
# raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
#
hooks = []
collected_outputs = defaultdict(list)
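Judging from the variables visible in this hunk (`hooks`, `collected_outputs`), `check_model_inputs` wraps a model's forward, records intermediate outputs through hooks, and applies the input checks noted in the TODO. A rough sketch of that shape using standard PyTorch forward hooks; the guard mirrors the commented-out block above, while the names and the module filter are illustrative, not the actual implementation:

import functools
from collections import defaultdict


def check_model_inputs(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # Guard from the commented-out TODO above: gradient checkpointing and
        # use_cache are incompatible during training, so caching is disabled.
        if getattr(self, "gradient_checkpointing", False) and self.training and kwargs.get("use_cache"):
            kwargs["use_cache"] = False

        hooks = []
        collected_outputs = defaultdict(list)

        def make_hook(name):
            def hook(module, inputs, output):
                collected_outputs[name].append(output)
            return hook

        # Record the outputs of selected submodules via standard forward hooks.
        for name, module in self.named_modules():
            if type(module).__name__.endswith("Attention"):  # illustrative filter
                hooks.append(module.register_forward_hook(make_hook(name)))
        try:
            output = func(self, *args, **kwargs)
        finally:
            for handle in hooks:
                handle.remove()
        # A fuller version would fold collected_outputs into the returned object;
        # this sketch simply returns the wrapped forward's result.
        return output

    return wrapper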