Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-02 19:21:31 +06:00)

commit c9bb39ef87 ("update")
parent a7e0ce238e
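
Every hunk in the diff below makes the same mechanical substitution: the name `KwargsForCausalLM` is replaced by `TransformersKwargs`, both where the per-model kwargs class is declared and wherever it is consumed, in `forward(..., **kwargs: Unpack[...])` signatures and in the import lists of the modular files. The sketch below illustrates the typed-kwargs pattern these annotations rely on (PEP 692, `Unpack` over a `TypedDict`). It is a minimal, self-contained toy: the bodies given here for `FlashAttentionKwargs` and `TransformersKwargs` are invented stand-ins, not the real definitions shipped in `transformers`.

```python
from typing import Optional, TypedDict

from typing_extensions import Unpack  # or `from typing import Unpack` on Python 3.11+


class FlashAttentionKwargs(TypedDict, total=False):
    # invented stand-in; the real class lives in transformers' modeling utilities
    cu_seq_lens_q: Optional[object]


class TransformersKwargs(TypedDict, total=False):
    # invented stand-in for the shared kwargs type the diff switches to
    output_attentions: bool
    output_hidden_states: bool


# Old pattern (left-hand side of the hunks): each modeling file declared a
# per-model alias combining the two TypedDicts and annotated **kwargs with it.
class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


def forward_old(input_ids=None, **kwargs: Unpack[KwargsForCausalLM]):
    return kwargs


# New pattern (right-hand side of the hunks): **kwargs is annotated with the
# shared TransformersKwargs name directly.
def forward_new(input_ids=None, **kwargs: Unpack[TransformersKwargs]):
    return kwargs


print(forward_new(output_attentions=True))  # {'output_attentions': True}
```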

@@ -805,7 +805,7 @@ class AriaTextModel(AriaTextPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1198,7 +1198,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1329,7 +1329,7 @@ class AriaTextModel(LlamaModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1527,7 +1527,7 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -339,7 +339,7 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -427,7 +427,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -20,7 +20,7 @@ import torch
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -279,7 +279,7 @@ class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -691,7 +691,7 @@ class BioGptModel(BioGptPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -732,7 +732,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -517,7 +517,7 @@ class BioGptModel(BioGptPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -558,7 +558,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1249,7 +1249,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1320,7 +1320,7 @@ class Blip2Model(Blip2PreTrainedModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1508,7 +1508,7 @@ class Blip2Model(Blip2PreTrainedModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1981,7 +1981,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1177,7 +1177,7 @@ class ChameleonModel(ChameleonPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1236,7 +1236,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -463,7 +463,7 @@ class CohereModel(CoherePreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -516,7 +516,7 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -292,7 +292,7 @@ class CohereModel(LlamaModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -315,7 +315,7 @@ class CohereForCausalLM(LlamaForCausalLM):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -488,7 +488,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -541,7 +541,7 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -612,7 +612,7 @@ class CsmDepthDecoderForCausalLM(CsmPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1015,7 +1015,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -29,7 +29,7 @@ from ...processing_utils import Unpack
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -370,7 +370,7 @@ class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -680,7 +680,7 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -23,7 +23,7 @@ from ..deepseek_v3.modeling_deepseek_v3 import (
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -77,7 +77,7 @@ class Dots1Model(Qwen3Model):
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],

@@ -1579,7 +1579,7 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -31,7 +31,7 @@ from ...modeling_utils import PreTrainedModel
-from ..llama.modeling_llama import KwargsForCausalLM, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel
+from ..llama.modeling_llama import TransformersKwargs, LlamaDecoderLayer, LlamaForCausalLM, LlamaModel
@@ -1131,7 +1131,7 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -56,7 +56,7 @@ class FuyuPreTrainedModel(PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...

@@ -279,7 +279,7 @@ class Glm4Attention(nn.Module):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...

@@ -100,13 +100,13 @@ class Glm4Attention(GlmAttention):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],

@@ -756,7 +756,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1181,7 +1181,7 @@ class Glm4vModel(Glm4vPreTrainedModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1407,7 +1407,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -911,7 +911,7 @@ class Glm4vTextDecoderLayer(GradientCheckpointingLayer):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1246,7 +1246,7 @@ class Glm4vModel(Qwen2_5_VLModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1401,7 +1401,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -691,7 +691,7 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -776,7 +776,7 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -21,7 +21,7 @@ import torch.nn as nn
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -400,7 +400,7 @@ class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -519,7 +519,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -563,7 +563,7 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -393,7 +393,7 @@ class GPTNeoXModel(LlamaModel, nn.Module):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -437,7 +437,7 @@ class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -490,7 +490,7 @@ class GraniteModel(GranitePreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -541,7 +541,7 @@ class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -227,7 +227,7 @@ class GraniteModel(LlamaModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -244,7 +244,7 @@ class GraniteForCausalLM(LlamaForCausalLM):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -923,7 +923,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1424,7 +1424,7 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1106,7 +1106,7 @@ class Idefics2Model(Idefics2PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1180,7 +1180,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -821,7 +821,7 @@ class Idefics3Model(Idefics3PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -902,7 +902,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1182,7 +1182,7 @@ class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1519,7 +1519,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -840,7 +840,7 @@ class InstructBlipVideoQFormerEmbeddings(nn.Module):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1491,7 +1491,7 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -29,7 +29,7 @@ from transformers.models.instructblip.modeling_instructblip import (
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -378,7 +378,7 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -813,7 +813,7 @@ class InternVLCausalLMOutputWithPast(ModelOutput):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -901,7 +901,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1346,7 +1346,7 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1400,7 +1400,7 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1765,7 +1765,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -603,7 +603,7 @@ class Llama4TextModel(Llama4PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -656,7 +656,7 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1294,7 +1294,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -321,7 +321,7 @@ class LlavaModel(LlavaPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -409,7 +409,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -518,7 +518,7 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -614,7 +614,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -648,7 +648,7 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -746,7 +746,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -20,7 +20,7 @@ import torch
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -546,7 +546,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -689,7 +689,7 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -790,7 +790,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -21,7 +21,7 @@ from torch import nn
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -638,7 +638,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -792,7 +792,7 @@ class MiniMaxModel(MiniMaxPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -929,7 +929,7 @@ class MiniMaxForCausalLM(MiniMaxPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -357,7 +357,7 @@ class Mistral3Model(Mistral3PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -443,7 +443,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -23,7 +23,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -284,7 +284,7 @@ class Mistral3ForConditionalGeneration(LlavaForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -575,7 +575,7 @@ class MixtralModel(MixtralPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -712,7 +712,7 @@ class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -422,7 +422,7 @@ class MixtralModel(MistralModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -449,7 +449,7 @@ class MixtralForCausalLM(MistralForCausalLM):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -1459,7 +1459,7 @@ class MllamaTextModel(MllamaPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1518,7 +1518,7 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1833,7 +1833,7 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -776,7 +776,7 @@ class OPTModel(OPTPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -826,7 +826,7 @@ class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -377,7 +377,7 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -452,7 +452,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -933,7 +933,7 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1199,7 +1199,7 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1428,7 +1428,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -29,7 +29,7 @@ import torch.utils.checkpoint
-    KwargsForCausalLM,
+    TransformersKwargs,
@@ -557,7 +557,7 @@ class Qwen2_5_VLModel(Qwen2VLModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -710,7 +710,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -909,7 +909,7 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -1139,7 +1139,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
@@ -1328,7 +1328,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -481,7 +481,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...

@@ -116,13 +116,13 @@ class Qwen3Model(Qwen2Model):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
-        **super_kwargs: Unpack[KwargsForCausalLM],
+        **super_kwargs: Unpack[TransformersKwargs],

@@ -582,7 +582,7 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -719,7 +719,7 @@ class Qwen3MoeForCausalLM(Qwen3MoePreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -225,7 +225,7 @@ class Qwen3MoeModel(MixtralModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -248,7 +248,7 @@ class Qwen3MoeForCausalLM(MixtralForCausalLM):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -799,7 +799,7 @@ class SmolVLMCausalLMOutputWithPast(ModelOutput):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -872,7 +872,7 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],

@@ -393,7 +393,7 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
-class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...
+class TransformersKwargs(FlashAttentionKwargs, TransformersKwargs): ...
@@ -484,7 +484,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMixin):
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: Unpack[TransformersKwargs],
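
One practical reading of the rename, offered as an inference from the hunks above rather than a statement from the commit itself: the name inside `Unpack[...]` is only consulted by static type checkers, so swapping `KwargsForCausalLM` for `TransformersKwargs` changes which keys a checker will accept but not what a caller can pass at runtime. A self-contained toy check (names invented for illustration, mirroring the stand-ins in the sketch at the top of this page):

```python
from typing import TypedDict

from typing_extensions import Unpack


class TransformersKwargs(TypedDict, total=False):
    # invented stand-in, not the real transformers definition
    output_attentions: bool


def forward(input_ids=None, **kwargs: Unpack[TransformersKwargs]):
    # at runtime **kwargs is an ordinary dict; the annotation's name never matters here
    return kwargs


assert forward(input_ids=[1, 2], output_attentions=True) == {"output_attentions": True}
```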