commit c9bb39ef87
parent a7e0ce238e
Author: Arthur
Date:   2025-06-30 12:32:03 +02:00
59 changed files with 125 additions and 125 deletions
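Every hunk below makes the same one-line substitution: the per-file KwargsForCausalLM alias becomes the shared TransformersKwargs name, in class definitions, in imports, and in Unpack-typed **kwargs annotations. For reference, here is a minimal, self-contained sketch of how an Unpack[TypedDict] annotation types keyword arguments (PEP 692); the class names match the diff, but the fields shown are illustrative assumptions, not the real transformers definitions.

# Sketch only: the field names are placeholders, not transformers' actual kwargs.
from typing_extensions import TypedDict, Unpack


class FlashAttentionKwargs(TypedDict, total=False):
    max_length_q: int  # illustrative field
    max_length_k: int  # illustrative field


class TransformersKwargs(FlashAttentionKwargs, total=False):
    output_attentions: bool  # illustrative field
    output_hidden_states: bool  # illustrative field


def forward(**kwargs: Unpack[TransformersKwargs]) -> None:
    # A type checker validates every keyword against the TypedDict fields;
    # at runtime, kwargs is an ordinary dict.
    print(kwargs.get("output_attentions", False))


forward(output_attentions=True)   # accepted
# forward(output_attention=True)  # a type checker would flag the typo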

View File

@@ -805,7 +805,7 @@ class AriaTextModel(AriaTextPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -1198,7 +1198,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, AriaCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1329,7 +1329,7 @@ class AriaTextModel(LlamaModel):
         self.post_init()
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
@@ -1527,7 +1527,7 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, AriaCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -339,7 +339,7 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -427,7 +427,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, AyaVisionCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -20,7 +20,7 @@ import torch
 from torch import nn
 from transformers.models.llava.modeling_llava import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlavaCausalLMOutputWithPast,
     LlavaForConditionalGeneration,
     LlavaModel,
@@ -279,7 +279,7 @@ class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, AyaVisionCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -691,7 +691,7 @@ class BioGptModel(BioGptPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -732,7 +732,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -517,7 +517,7 @@ class BioGptModel(BioGptPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -558,7 +558,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1249,7 +1249,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1320,7 +1320,7 @@ class Blip2Model(Blip2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ):
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1508,7 +1508,7 @@ class Blip2Model(Blip2PreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Blip2ForConditionalGenerationModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1981,7 +1981,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         use_cache: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Blip2ForConditionalGenerationModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1177,7 +1177,7 @@ class ChameleonModel(ChameleonPreTrainedModel):
         return causal_mask
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1236,7 +1236,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -463,7 +463,7 @@ class CohereModel(CoherePreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -516,7 +516,7 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -292,7 +292,7 @@ class CohereModel(LlamaModel):
         self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class CohereForCausalLM(LlamaForCausalLM):
@@ -315,7 +315,7 @@ class CohereForCausalLM(LlamaForCausalLM):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -488,7 +488,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -541,7 +541,7 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -612,7 +612,7 @@ class CsmDepthDecoderForCausalLM(CsmPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
@@ -1015,7 +1015,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CsmOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):

View File

@@ -29,7 +29,7 @@ from ...processing_utils import Unpack
 from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
 from ..auto import AutoModel
 from ..llama.modeling_llama import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlamaAttention,
     LlamaDecoderLayer,
     LlamaForCausalLM,
@@ -370,7 +370,7 @@ class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
@@ -680,7 +680,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CsmOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):

View File

@@ -23,7 +23,7 @@ from ..deepseek_v3.modeling_deepseek_v3 import (
     DeepseekV3TopkRouter,
 )
 from ..qwen3.modeling_qwen3 import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     Qwen3Attention,
     Qwen3ForCausalLM,
     Qwen3Model,
@@ -77,7 +77,7 @@ class Dots1Model(Qwen3Model):
 class Dots1ForCausalLM(Qwen3ForCausalLM):
     def forward(
         self,
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1579,7 +1579,7 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):

View File

@@ -31,7 +31,7 @@ from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, logging
 from ..chameleon.modeling_chameleon import ChameleonPreTrainedModel, ChameleonVQVAEEncoderConvDownsample
-from ..llama.modeling_llama import KwargsForCausalLM, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel
+from ..llama.modeling_llama import TransformersKwargs, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel
 from ..siglip.modeling_siglip import SiglipAttention
 from .configuration_emu3 import Emu3Config, Emu3TextConfig, Emu3VQVAEConfig
@@ -1131,7 +1131,7 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):

View File

@@ -56,7 +56,7 @@ class FuyuPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(

View File

@@ -279,7 +279,7 @@ class Glm4Attention(nn.Module):
         return attn_output, attn_weights
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @use_kernel_forward_from_hub("RMSNorm")

View File

@@ -100,13 +100,13 @@ class Glm4Attention(GlmAttention):
     pass
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class Glm4ForCausalLM(GlmForCausalLM):
     def forward(
         self,
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
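Note how the modular overrides in this commit (glm4 above, qwen3 and dots1 below) declare forward with only **super_kwargs: Unpack[TransformersKwargs]: the override exists mainly to carry a model-specific docstring and type annotation while, in the usual modular-file pattern, deferring to the parent implementation. A hedged sketch of that delegation, using simplified stand-in classes rather than the real GlmForCausalLM/Glm4ForCausalLM:

# Simplified stand-ins for the modular-file pattern; not the real classes.
from typing_extensions import TypedDict, Unpack


class TransformersKwargs(TypedDict, total=False):
    output_attentions: bool  # illustrative field


class GlmForCausalLM:  # stand-in parent holding the actual forward logic
    def forward(self, **kwargs: Unpack[TransformersKwargs]) -> dict:
        return {"handled": dict(kwargs)}


class Glm4ForCausalLM(GlmForCausalLM):
    def forward(self, **super_kwargs: Unpack[TransformersKwargs]) -> dict:
        # The override re-types/documents the kwargs and delegates unchanged.
        return super().forward(**super_kwargs)


print(Glm4ForCausalLM().forward(output_attentions=True))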

View File

@@ -756,7 +756,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
         return outputs
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @dataclass
@@ -1181,7 +1181,7 @@ class Glm4vModel(Glm4vPreTrainedModel):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Glm4vModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
@@ -1407,7 +1407,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Glm4vCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -911,7 +911,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
         return outputs
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast):
@@ -1246,7 +1246,7 @@ class Glm4vModel(Qwen2_5_VLModel):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Glm4vModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
@@ -1401,7 +1401,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Glm4vCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -691,7 +691,7 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -776,7 +776,7 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -21,7 +21,7 @@ import torch.nn as nn
 import torch.utils.checkpoint
 from transformers.models.llava.modeling_llava import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlavaCausalLMOutputWithPast,
     LlavaForConditionalGeneration,
     LlavaModel,
@@ -400,7 +400,7 @@ class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -519,7 +519,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -563,7 +563,7 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -393,7 +393,7 @@ class GPTNeoXModel(LlamaModel, nn.Module):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -437,7 +437,7 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -490,7 +490,7 @@ class GraniteModel(GranitePreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -541,7 +541,7 @@ class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -227,7 +227,7 @@ class GraniteModel(LlamaModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class GraniteForCausalLM(LlamaForCausalLM):
@@ -244,7 +244,7 @@ class GraniteForCausalLM(LlamaForCausalLM):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (

View File

@@ -923,7 +923,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
             module.latents.data.normal_()
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -1424,7 +1424,7 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
         interpolate_pos_encoding: Optional[bool] = False,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, IdeficsCausalLMOutputWithPast]:
         r"""
         image_encoder_embeddings (`torch.FloatTensor`, *optional*):

View File

@@ -1106,7 +1106,7 @@ class Idefics2Model(Idefics2PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1180,7 +1180,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Idefics2CausalLMOutputWithPast]:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):

View File

@@ -821,7 +821,7 @@ class Idefics3Model(Idefics3PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -902,7 +902,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Idefics3CausalLMOutputWithPast]:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):

View File

@@ -1182,7 +1182,7 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1519,7 +1519,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         use_cache: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
         r"""
         qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -840,7 +840,7 @@ class InstructBlipVideoQFormerEmbeddings(nn.Module):
         return embeddings
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -1491,7 +1491,7 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         use_cache: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
         r"""
         qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -29,7 +29,7 @@ from transformers.models.instructblip.modeling_instructblip import (
     InstructBlipPreTrainedModel,
     InstructBlipQFormerModel,
     InstructBlipVisionModel,
-    KwargsForCausalLM,
+    TransformersKwargs,
 )
 from ...configuration_utils import PretrainedConfig
@@ -378,7 +378,7 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         use_cache: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
         r"""
         ```python

View File

@@ -813,7 +813,7 @@ class InternVLCausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[torch.FloatTensor] = None
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -901,7 +901,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, InternVLCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1346,7 +1346,7 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1400,7 +1400,7 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
@@ -1765,7 +1765,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Kosmos2ForConditionalGenerationModelOutput]:
         r"""
         image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -603,7 +603,7 @@ class Llama4TextModel(Llama4PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
@@ -656,7 +656,7 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1294,7 +1294,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: torch.Tensor = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Llama4CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -321,7 +321,7 @@ class LlavaModel(LlavaPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -409,7 +409,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -518,7 +518,7 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -614,7 +614,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaNextCausalLMOutputWithPast]:
         r"""
         vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):

View File

@@ -648,7 +648,7 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
         return video_features
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -746,7 +746,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaNextVideoCausalLMOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)`):

View File

@@ -20,7 +20,7 @@ import torch
 from torch import nn
 from transformers.models.llava_next.modeling_llava_next import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlavaNextCausalLMOutputWithPast,
     LlavaNextForConditionalGeneration,
     LlavaNextModel,
@@ -546,7 +546,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaNextVideoCausalLMOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)`):

View File

@@ -689,7 +689,7 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
         return image_features
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -790,7 +790,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaOnevisionCausalLMOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)`):

View File

@@ -21,7 +21,7 @@ from torch import nn
 from transformers.models.llava_next.image_processing_llava_next_fast import LlavaNextImageProcessorFast
 from transformers.models.llava_next_video.modeling_llava_next_video import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlavaNextVideoCausalLMOutputWithPast,
     LlavaNextVideoForConditionalGeneration,
     LlavaNextVideoModel,
@@ -638,7 +638,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, LlavaOnevisionCausalLMOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)`):

View File

@@ -792,7 +792,7 @@ class MiniMaxModel(MiniMaxPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 def load_balancing_loss_func(
@@ -929,7 +929,7 @@ class MiniMaxForCausalLM(MiniMaxPreTrainedModel, GenerationMixin):
         output_router_logits: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -357,7 +357,7 @@ class Mistral3Model(Mistral3PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -443,7 +443,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Mistral3CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -23,7 +23,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...processing_utils import Unpack
 from ...utils import is_torchdynamo_compiling, logging
 from ..llava.modeling_llava import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     LlavaCausalLMOutputWithPast,
     LlavaForConditionalGeneration,
     LlavaModel,
@@ -284,7 +284,7 @@ class Mistral3ForConditionalGeneration(LlavaForConditionalGeneration):
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         image_sizes: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Mistral3CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -575,7 +575,7 @@ class MixtralModel(MixtralPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 def load_balancing_loss_func(
@@ -712,7 +712,7 @@ class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin):
         output_router_logits: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -422,7 +422,7 @@ class MixtralModel(MistralModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class MixtralForCausalLM(MistralForCausalLM):
@@ -449,7 +449,7 @@ class MixtralForCausalLM(MistralForCausalLM):
         output_router_logits: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -1459,7 +1459,7 @@ class MllamaTextModel(MllamaPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -1518,7 +1518,7 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         cross_attention_states (`torch.FloatTensor`, *optional*):
@@ -1833,7 +1833,7 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         aspect_ratio_mask (`torch.Tensor` of shape `(batch_size, max_num_images, max_num_tiles)`, *optional*):

View File

@@ -776,7 +776,7 @@ class OPTModel(OPTPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
@@ -826,7 +826,7 @@ class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         position_ids: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -377,7 +377,7 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -452,7 +452,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, PaliGemmaCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -933,7 +933,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -1199,7 +1199,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         second_per_grid_ts: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2_5_VLModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
@@ -1428,7 +1428,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         second_per_grid_ts: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2_5_VLCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -29,7 +29,7 @@ import torch.utils.checkpoint
 from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
 from transformers.models.qwen2_vl.modeling_qwen2_vl import (
-    KwargsForCausalLM,
+    TransformersKwargs,
     PatchEmbed,
     PatchMerger,
     Qwen2RMSNorm,
@@ -557,7 +557,7 @@ class Qwen2_5_VLModel(Qwen2VLModel):
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         second_per_grid_ts: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2_5_VLModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
@@ -710,7 +710,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
         second_per_grid_ts: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2_5_VLCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -909,7 +909,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring
@@ -1139,7 +1139,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2VLModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
@@ -1328,7 +1328,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen2VLCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -481,7 +481,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring

View File

@@ -116,13 +116,13 @@ class Qwen3Model(Qwen2Model):
     pass
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class Qwen3ForCausalLM(Qwen2ForCausalLM):
     def forward(
         self,
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -582,7 +582,7 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 def load_balancing_loss_func(
@@ -719,7 +719,7 @@ class Qwen3MoeForCausalLM(Qwen3MoePreTrainedModel, GenerationMixin):
         output_router_logits: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -225,7 +225,7 @@ class Qwen3MoeModel(MixtralModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 class Qwen3MoeForCausalLM(MixtralForCausalLM):
@@ -248,7 +248,7 @@ class Qwen3MoeForCausalLM(MixtralForCausalLM):
         output_router_logits: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> MoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -799,7 +799,7 @@ class SmolVLMCausalLMOutputWithPast(ModelOutput):
     image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -872,7 +872,7 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
         cache_position: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):

View File

@@ -393,7 +393,7 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
         )
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
 @auto_docstring(
@@ -484,7 +484,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, VideoLlavaCausalLMOutputWithPast]:
         r"""
         pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):