Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
commit fca73ad7ce (parent 0dc082627c)

    update
@@ -805,9 +805,6 @@ class AriaTextModel(AriaTextPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
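Note on the pattern removed throughout this diff: `class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...` re-declares `TransformersKwargs` on top of the name the modeling files already import (see the `from ...utils import TransformersKwargs, ...` line in the Fuyu hunk below), so the stray declaration shadows the imported class for the rest of the module. A minimal, hypothetical Python sketch of that shadowing, with invented stand-in names:

class Base: ...     # stands in for FlashAttentionKwargs
class Kwargs: ...   # stands in for the imported TransformersKwargs

# Same shape as the removed line: the base-class reference still resolves to the
# old Kwargs binding, but the module-level name now points at this new class.
class Kwargs(Base, Kwargs): ...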
@@ -1329,9 +1329,6 @@ class AriaTextModel(LlamaModel):
         self.post_init()


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
     _tied_weights_keys = ["lm_head.weight"]

@@ -339,9 +339,6 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The AYA_VISION model which consists of a vision backbone and a language model.
@@ -691,9 +691,6 @@ class BioGptModel(BioGptPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
@@ -517,9 +517,6 @@ class BioGptModel(BioGptPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
@@ -25,7 +25,6 @@ from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...generation import GenerationMixin
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -1249,9 +1248,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     BLIP-2 Model for generating text and image features. The model consists of a vision encoder, Querying Transformer
@@ -1177,9 +1177,6 @@ class ChameleonModel(ChameleonPreTrainedModel):
         return causal_mask


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     Chameleon Model with a head on top used for outputting logits for next token prediction.
@@ -463,9 +463,6 @@ class CohereModel(CoherePreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
@@ -292,9 +292,6 @@ class CohereModel(LlamaModel):
         self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class CohereForCausalLM(LlamaForCausalLM):
     def __init__(self, config):
         super().__init__(config)
@@ -488,9 +488,6 @@ class Cohere2Model(Cohere2PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
@@ -21,11 +21,10 @@ import torch.utils.checkpoint
 from torch import nn

 from ...generation import GenerationMixin
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
 from ...models.auto.modeling_auto import AutoModel
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils import auto_docstring, can_return_tuple, logging
 from .configuration_fuyu import FuyuConfig


@@ -56,9 +55,6 @@ class FuyuPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
@@ -279,9 +279,6 @@ class Glm4Attention(nn.Module):
         return attn_output, attn_weights


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @use_kernel_forward_from_hub("RMSNorm")
 class Glm4RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
@@ -100,9 +100,6 @@ class Glm4Attention(GlmAttention):
     pass


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class Glm4ForCausalLM(GlmForCausalLM):
     def forward(
         self,
@@ -756,9 +756,6 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
         return outputs


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @dataclass
 @auto_docstring(
     custom_intro="""
@@ -911,9 +911,6 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
         return outputs


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
     pass

@@ -691,9 +691,6 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The GOT_OCR2 model which consists of a vision backbone and a language model.
@@ -519,9 +519,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.
@@ -393,9 +393,6 @@ class GPTNeoXModel(LlamaModel, nn.Module):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.
@@ -490,9 +490,6 @@ class GraniteModel(GranitePreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
@@ -227,9 +227,6 @@ class GraniteModel(LlamaModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class GraniteForCausalLM(LlamaForCausalLM):
     def forward(
         self,
@@ -923,9 +923,6 @@ class IdeficsPreTrainedModel(PreTrainedModel):
             module.latents.data.normal_()


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class IdeficsModel(IdeficsPreTrainedModel):
     """
@@ -1106,9 +1106,6 @@ class Idefics2Model(Idefics2PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.
@@ -821,9 +821,6 @@ class Idefics3Model(Idefics3PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The Idefics3 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.
@@ -1182,9 +1182,6 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     InstructBLIP base Model consisting of language model, qformer and vision encoder.
@@ -840,9 +840,6 @@ class InstructBlipVideoQFormerEmbeddings(nn.Module):
         return embeddings


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class InstructBlipVideoPreTrainedModel(PreTrainedModel):
     config_class = InstructBlipVideoConfig
@@ -813,9 +813,6 @@ class InternVLCausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[torch.FloatTensor] = None


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The INTERNVL model which consists of a vision backbone and a language model.
@@ -1510,7 +1510,7 @@ class JambaForSequenceClassification(JambaPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> SequenceClassifierOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
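The Jamba hunk above replaces the untyped `**kwargs,` with `**kwargs: Unpack[TransformersKwargs],`. A hedged sketch of that typing pattern, using an invented TypedDict in place of the real `TransformersKwargs` and `typing_extensions` for `Unpack` (which only ships in `typing` on recent Python versions):

from typing import Optional

from typing_extensions import TypedDict, Unpack


class ExampleKwargs(TypedDict, total=False):
    # invented fields, for illustration only
    output_attentions: bool
    output_hidden_states: bool


def forward(use_cache: Optional[bool] = None, **kwargs: Unpack[ExampleKwargs]) -> dict:
    # at runtime kwargs is an ordinary dict; Unpack only tells static checkers
    # which keyword arguments are accepted and what their types are
    return {"use_cache": use_cache, **kwargs}


print(forward(output_attentions=True))  # {'use_cache': None, 'output_attentions': True}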
@@ -1346,9 +1346,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
@@ -603,9 +603,6 @@ class Llama4TextModel(Llama4PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
     _no_split_modules = ["Llama4TextDecoderLayer"]
     base_model_prefix = "language_model"
@@ -321,9 +321,6 @@ class LlavaModel(LlavaPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The LLAVA model which consists of a vision backbone and a language model.
@@ -518,9 +518,6 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The LLAVA-NeXT model which consists of a vision backbone and a language model.
@@ -648,9 +648,6 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
         return video_features


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The LLAVA-NeXT model which consists of a vision backbone and a language model.
@@ -689,9 +689,6 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
         return image_features


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The LLAVA-NeXT model which consists of a vision backbone and a language model.
@@ -792,9 +792,6 @@ class MiniMaxModel(MiniMaxPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
@@ -357,9 +357,6 @@ class Mistral3Model(Mistral3PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The MISTRAL3 model which consists of a vision backbone and a language model.
@@ -575,9 +575,6 @@ class MixtralModel(MixtralPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
@@ -422,9 +422,6 @@ class MixtralModel(MistralModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class MixtralForCausalLM(MistralForCausalLM):
     _tied_weights_keys = ["lm_head.weight"]

@@ -1459,9 +1459,6 @@ class MllamaTextModel(MllamaPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The Mllama Text Model with a language modeling head on top.
@@ -776,9 +776,6 @@ class OPTModel(OPTPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

@@ -377,9 +377,6 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
@@ -933,9 +933,6 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
     base_model_prefix = ""
@@ -909,9 +909,6 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class Qwen2VLModel(Qwen2VLPreTrainedModel):
     base_model_prefix = ""
@@ -481,9 +481,6 @@ class Qwen3Model(Qwen3PreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring
 class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
@@ -116,9 +116,6 @@ class Qwen3Model(Qwen2Model):
     pass


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class Qwen3ForCausalLM(Qwen2ForCausalLM):
     def forward(
         self,
@@ -582,9 +582,6 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
@@ -225,9 +225,6 @@ class Qwen3MoeModel(MixtralModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 class Qwen3MoeForCausalLM(MixtralForCausalLM):
     def __init__(self, config):
         super().__init__(config)
@@ -799,9 +799,6 @@ class SmolVLMCausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[tuple[torch.FloatTensor]] = None


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The SmolVLM Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top.
@@ -393,9 +393,6 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
         )


-class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-
-
 @auto_docstring(
     custom_intro="""
     The VideoLlava model which consists of a vision backbone and a language model.
@@ -995,7 +995,17 @@ def check_model_inputs(func):

         # TODO @Lysandre add the head we have today about GC and training
         # and all of the rest that is general transformers checking

+        # THIS PART :
+        # if self.gradient_checkpointing and self.training and use_cache:
+        #     logger.warning_once(
+        #         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+        #     )
+        #     use_cache = False
+        #
+        # # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        # if not isinstance(past_key_values, (type(None), Cache)):
+        #     raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
         #
         hooks = []
         collected_outputs = defaultdict(list)
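The block added above is still commented out; the TODO asks for the gradient-checkpointing and legacy-cache checks to be folded into `check_model_inputs` later. A minimal, self-contained sketch of the first check (helper name invented, standard `logging` in place of the transformers logger, and assuming the module exposes the usual `gradient_checkpointing` and `training` attributes):

import logging

logger = logging.getLogger(__name__)


def resolve_use_cache(module, use_cache: bool) -> bool:
    # gradient checkpointing recomputes activations during backward, which conflicts
    # with caching past key/values while training, so the flag is disabled with a warning
    if getattr(module, "gradient_checkpointing", False) and module.training and use_cache:
        logger.warning("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.")
        return False
    return use_cache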