Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

resolve conflicts for seamless_m4tv2

parent e0f56e4716
commit ab93442a52

src/transformers/models/seamless_m4t/modeling_seamless_m4t.py

@@ -1,9 +1,3 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/seamless_m4t/modular_seamless_m4t.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_seamless_m4t.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
@@ -18,13 +12,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch SeamlessM4T model."""

import copy
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss

@@ -32,7 +28,10 @@ from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask,
    _prepare_4d_causal_attention_mask,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
@@ -48,6 +47,37 @@ from .configuration_seamless_m4t import SeamlessM4TConfig

logger = logging.get_logger(__name__)


SEAMLESS_M4T_COMMON_CUSTOM_ARGS = r"""
    input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
        Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
        [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
    decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indices of decoder input sequence tokens in the vocabulary.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are decoder input IDs?](../glossary#decoder-input-ids)

        Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
        is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

        For translation and summarization training, `decoder_input_ids` should be provided. If no
        `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
        for denoising pre-training following the paper.
    decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
        be used by default.

        If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
        and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
        information on the default strategy.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
        model's internal embedding lookup matrix.
"""


@dataclass
@auto_docstring(
@@ -58,22 +88,18 @@ logger = logging.get_logger(__name__)
)
class SeamlessM4TGenerationOutput(ModelOutput):
    r"""
    Class defining the generated outputs from [`SeamlessM4TModel`], [`SeamlessM4TForTextToText`],
    [`SeamlessM4TForTextToSpeech`], [`SeamlessM4TForSpeechToSpeech`] and [`SeamlessM4TForSpeechToText`].

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
            The length in samples of each element in the `waveform` batch.
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
            The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
            early due to the `eos_token_id`.
        unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
            The generated translated unit sequences. This is the output of the text-to-units model. The second
            dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
            early due to the `t2u_eos_token_id`.
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
        The length in samples of each element in the `waveform` batch.
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
        The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
        early due to the `eos_token_id`.
    unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
        The generated translated unit sequences. This is the output of the text-to-units model. The second
        dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
        early due to the `t2u_eos_token_id`.
    """

    waveform: Optional[torch.FloatTensor] = None
@@ -82,17 +108,113 @@ class SeamlessM4TGenerationOutput(ModelOutput):
    unit_sequences: Optional[tuple[torch.FloatTensor]] = None


class SeamlessM4TConformerSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states
############ UTILS ################


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
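
As a quick standalone illustration of what this helper produces (toy values; `padding_idx=1` is just an assumption for the example):

```python
import torch

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # 1 is the padding token here
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]]) -- real tokens count up from padding_idx + 1, pads stay at padding_idx
```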


# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
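
A minimal sketch of the label-to-decoder-input shift, with hypothetical token ids (`decoder_start_token_id=2`, `pad_token_id=0`):

```python
import torch

labels = torch.tensor([[101, 102, 103, -100]])  # -100 marks loss-ignored positions
shifted = labels.new_zeros(labels.shape)
shifted[:, 1:] = labels[:, :-1].clone()
shifted[:, 0] = 2                               # decoder_start_token_id
shifted.masked_fill_(shifted == -100, 0)        # any surviving -100 becomes pad_token_id
print(shifted)                                  # tensor([[  2, 101, 102, 103]])
```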


def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
    """
    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
    stops at the corresponding element in `seq_lens`.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
        seq_lens (`torch.Tensor` of shape `(batch)`):
            Each element represents the length of the sequence at the same index in `hidden_states`

    Returns:
        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
    """
    batch_size, mask_seq_len = hidden_states.shape[:2]

    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)

    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)

    mask = hidden_states.new_ones((batch_size, mask_seq_len))

    mask = mask.masked_fill(bool_mask, 0)

    return mask
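
For instance (a standalone re-run of the same logic on toy shapes), a batch with true lengths 5 and 3 over a padded length of 5 gives:

```python
import torch

hidden_states = torch.zeros(2, 5, 8)  # (batch, seq_len, dim)
seq_lens = torch.tensor([5, 3])       # true length of each sequence

indices = torch.arange(5).expand(2, -1)
mask = torch.ones(2, 5).masked_fill(indices >= seq_lens.unsqueeze(1), 0)
print(mask)
# tensor([[1., 1., 1., 1., 1.],
#         [1., 1., 1., 0., 0.]])
```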


def format_speech_generation_kwargs(kwargs):
    """
    Format kwargs for SeamlessM4T models that generate speech, attribute kwargs to either the text generation or the
    speech generation models.

    Args:
        kwargs (`dict`):
            Keyword arguments are of two types:

            - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
              except for `decoder_input_ids` which will only be passed through the text components.
            - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
              text model and speech model respectively. It has the priority over the keywords without a prefix.

            This means you can, for example, specify a generation strategy for one generation but not for the
            other.
    """
    # attribute kwargs to models
    kwargs_text = {}
    kwargs_speech = {}
    for key, value in kwargs.items():
        if key.startswith("text_"):
            key = key[len("text_") :]
            kwargs_text[key] = value
        elif key.startswith("speech_"):
            key = key[len("speech_") :]
            kwargs_speech[key] = value
        elif key == "generation_config":
            kwargs_text[key] = value
        else:
            # If the key is already in a specific config, then it's been set with a
            # submodule-specific value and we don't override
            if key not in kwargs_text:
                kwargs_text[key] = value
            if key not in kwargs_speech:
                kwargs_speech[key] = value
    return kwargs_text, kwargs_speech
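
A quick sanity sketch of the precedence rules with made-up generation kwargs: prefixed keys are routed to one sub-model and win over unprefixed ones, while unprefixed keys fan out to both.

```python
kwargs = {
    "num_beams": 4,            # unprefixed: offered to both sub-models
    "text_num_beams": 5,       # "text_" prefix: text model only, overrides the unprefixed value
    "speech_do_sample": True,  # "speech_" prefix: speech model only
}
kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
assert kwargs_text == {"num_beams": 5}
assert kwargs_speech == {"num_beams": 4, "do_sample": True}
```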


############ SPEECH ENCODER related code ################


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->SeamlessM4TConformer, feat_extract_activation->speech_encoder_hidden_act
class SeamlessM4TConformerPositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -138,6 +260,7 @@ class SeamlessM4TConformerPositionalConvEmbedding(nn.Module):
        return hidden_states


# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRotaryPositionalEmbedding with Wav2Vec2->SeamlessM4T, num_attention_heads->speech_encoder_attention_heads
class SeamlessM4TConformerRotaryPositionalEmbedding(nn.Module):
    """Rotary positional embedding
    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864
@@ -147,6 +270,7 @@
        super().__init__()
        dim = config.hidden_size // config.speech_encoder_attention_heads
        base = config.rotary_embedding_base

        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.cached_sequence_length = None
@@ -171,6 +295,7 @@
        return self.cached_rotary_positional_embedding
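
The `inv_freq` buffer above is the standard RoPE geometric frequency ladder; a rough standalone sketch of how a cached table of rotation angles can be built from it (hypothetical `dim`/`base`, not values read from the model config):

```python
import torch

dim, base = 64, 10000  # assumed head size and rotary base for the sake of the example
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
positions = torch.arange(8).float()
freqs = torch.outer(positions, inv_freq)  # (seq_len, dim // 2) rotation angles
cos, sin = freqs.cos(), freqs.sin()       # the cos/sin pairs later applied to queries and keys
```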


# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRelPositionalEmbedding with Wav2Vec2->SeamlessM4T
class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
    """Relative positional encoding module."""

@@ -221,6 +346,18 @@ class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
        return relative_position_embeddings


# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSamePadLayer with Wav2Vec2->SeamlessM4T
class SeamlessM4TConformerSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states
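
The trim exists because a "same"-padded 1d convolution with an even kernel yields one extra output frame, while an odd kernel already matches the input length. A self-contained check of that arithmetic:

```python
import torch
import torch.nn as nn

k = 128                                       # even kernel count -> num_pad_remove == 1
conv = nn.Conv1d(4, 4, kernel_size=k, padding=k // 2)
out = conv(torch.randn(1, 4, 50))
print(out.shape)             # torch.Size([1, 4, 51]) -- one frame too many
print(out[:, :, :-1].shape)  # torch.Size([1, 4, 50]) -- back to the input length
```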


class SeamlessM4TConformerFeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -326,7 +463,7 @@ class SeamlessM4TConformerConvolutionModule(nn.Module):


class SeamlessM4TConformerSelfAttention(nn.Module):
    """Construct an SeamlessM4TConformerSelfAttention object.
    """Construct a SeamlessM4TConformerSelfAttention object.
    Can be enhanced with rotary or relative position embeddings.
    """

@@ -352,6 +489,7 @@
            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))

    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -414,6 +552,7 @@

        return hidden_states, probs

    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_rotary_embedding
    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
        batch_size, sequence_length, hidden_size = hidden_states.size()
        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
@@ -433,6 +572,7 @@

        return hidden_states

    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_relative_embeddings
    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
        # 1. project positional embeddings
        # => (batch, head, 2*time1-1, d_k)
@@ -476,6 +616,7 @@
class SeamlessM4TConformerEncoderLayer(GradientCheckpointingLayer):
    """Conformer block based on https://huggingface.co/papers/2005.08100."""

    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4T, attention_dropout->speech_encoder_dropout, torch.nn->nn
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
@@ -633,33 +774,6 @@ class SeamlessM4TConformerEncoder(nn.Module):
        )


def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
    """
    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
    stops at the corresponding element in `seq_lens`.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
        seq_lens (`torch.Tensor` of shape `(batch)`):
            Each element represents the length of the sequence at the same index in `hidden_states`

    Returns:
        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
    """
    batch_size, mask_seq_len = hidden_states.shape[:2]

    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)

    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)

    mask = hidden_states.new_ones((batch_size, mask_seq_len))

    mask = mask.masked_fill(bool_mask, 0)

    return mask


class SeamlessM4TConformerAdapterLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -773,6 +887,10 @@ class SeamlessM4TConformerAdapter(nn.Module):
        return hidden_states


############ TEXT / UNITS related code ################


# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4T
class SeamlessM4TScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
@@ -786,17 +904,7 @@ class SeamlessM4TScaledWordEmbedding(nn.Embedding):
        return super().forward(input_ids) * self.embed_scale
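
`embed_scale` is typically `sqrt(hidden_size)` in this family of seq2seq models (an assumption here; the caller sets it), which keeps token embeddings on a comparable scale to the positional encodings added afterwards. A minimal sketch:

```python
import math
import torch
import torch.nn as nn

class ScaledEmbeddingSketch(nn.Embedding):
    def __init__(self, num_embeddings, embedding_dim, padding_idx, embed_scale=1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids):
        # same lookup as nn.Embedding, scaled at read time
        return super().forward(input_ids) * self.embed_scale

emb = ScaledEmbeddingSketch(100, 16, padding_idx=0, embed_scale=math.sqrt(16))
out = emb(torch.tensor([[5, 7]]))  # raw embeddings multiplied by 4.0
```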


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

@@ -881,6 +989,7 @@
class SeamlessM4TAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4T
    def __init__(
        self,
        embed_dim: int,
@@ -920,6 +1029,9 @@
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -966,10 +1078,10 @@
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save tuple(torch.Tensor, torch.Tensor) of
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
@@ -1030,10 +1142,8 @@

        return attn_output, attn_weights_reshaped, past_key_value

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()


# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4T,DenseActDense->FeedForwardNetwork, d_model->hidden_size
class SeamlessM4TFeedForwardNetwork(nn.Module):
    def __init__(self, config: SeamlessM4TConfig, ffn_dim: int):
        super().__init__()
@@ -1171,7 +1281,7 @@ class SeamlessM4TDecoderLayer(GradientCheckpointingLayer):
            encoder_attention_mask (`torch.FloatTensor`):
                encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
                very large negative values.
            past_key_value (`tuple(torch.FloatTensor)`):
            past_key_value (`Tuple(torch.FloatTensor)`):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -1874,54 +1984,6 @@ class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel):
        )


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


SEAMLESS_M4T_COMMON_CUSTOM_ARGS = r"""
    input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
        Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
        [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
    decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indices of decoder input sequence tokens in the vocabulary.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are decoder input IDs?](../glossary#decoder-input-ids)

        Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
        is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

        For translation and summarization training, `decoder_input_ids` should be provided. If no
        `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
        for denoising pre-training following the paper.
    decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
        be used by default.

        If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
        and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
        information on the default strategy.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
        model's internal embedding lookup matrix.
"""


@auto_docstring(
    custom_intro="""
    Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4TTextToUnit`].
@@ -1945,7 +2007,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
            embed_tokens_decoder (`nn.Embedding`, *optional*):
                input embedding of the decoder.
        """
        # update config - used principally for bos_token_id etc.
        config = copy.deepcopy(config)
        for param, val in config.to_dict().items():
            if param.startswith("t2u_"):
@@ -2069,6 +2131,10 @@
        self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())


############ VOCODER related code ################


# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
class HifiGanResidualBlock(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
        super().__init__()
@@ -2402,6 +2468,9 @@ class SeamlessM4TCodeHifiGan(PreTrainedModel):
        nn.utils.remove_weight_norm(self.hifi_gan.conv_post)


############ WHOLE MODEL related code ################


@auto_docstring(
    custom_intro="""
    The text-to-text SeamlessM4T Model transformer which can be used for T2TT.
@@ -2952,45 +3021,6 @@ class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin):
        return reordered_past


def format_speech_generation_kwargs(kwargs):
    """
    Format kwargs for SeamlessM4T models that generate speech, attribute kwargs to either the text generation or the
    speech generation models.

    Args:
        kwargs (`dict`):
            Keyword arguments are of two types:

            - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
              except for `decoder_input_ids` which will only be passed through the text components.
            - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
              text model and speech model respectively. It has the priority over the keywords without a prefix.

            This means you can, for example, specify a generation strategy for one generation but not for the
            other.
    """
    # attribute kwargs to models
    kwargs_text = {}
    kwargs_speech = {}
    for key, value in kwargs.items():
        if key.startswith("text_"):
            key = key[len("text_") :]
            kwargs_text[key] = value
        elif key.startswith("speech_"):
            key = key[len("speech_") :]
            kwargs_speech[key] = value
        elif key == "generation_config":
            kwargs_text[key] = value
        else:
            # If the key is already in a specific config, then it's been set with a
            # submodule-specific value and we don't override
            if key not in kwargs_text:
                kwargs_text[key] = value
            if key not in kwargs_speech:
                kwargs_speech[key] = value
    return kwargs_text, kwargs_speech


@auto_docstring(
    custom_intro="""
    The text-to-speech SeamlessM4T Model transformer which can be used for T2ST.

src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py

@@ -22,7 +22,7 @@
import copy
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from typing import Optional, Union

import torch
from torch import Tensor, nn
@@ -33,6 +33,7 @@ from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
@@ -49,112 +50,81 @@ logger = logging.get_logger(__name__)

@dataclass
class SeamlessM4Tv2GenerationOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class defining the generated outputs from [`SeamlessM4Tv2Model`], [`SeamlessM4Tv2ForTextToText`],
    [`SeamlessM4Tv2ForTextToSpeech`], [`SeamlessM4Tv2ForSpeechToSpeech`] and [`SeamlessM4Tv2ForSpeechToText`].

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
            The length in samples of each element in the `waveform` batch.
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
            The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
            early due to the `eos_token_id`.
        unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
            The generated translated unit sequences. This is the output of the text-to-units model. The second
            dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
            early due to the `t2u_eos_token_id`.
    """
)
class SeamlessM4Tv2GenerationOutput(ModelOutput):
    r"""
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
        The length in samples of each element in the `waveform` batch.
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
        The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
        early due to the `eos_token_id`.
    unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
        The generated translated unit sequences. This is the output of the text-to-units model. The second
        dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
        early due to the `t2u_eos_token_id`.
    """

    waveform: Optional[torch.FloatTensor] = None
    waveform_lengths: Optional[torch.IntTensor] = None
    sequences: Optional[Tuple[torch.FloatTensor]] = None
    unit_sequences: Optional[Tuple[torch.FloatTensor]] = None
    sequences: Optional[tuple[torch.FloatTensor]] = None
    unit_sequences: Optional[tuple[torch.FloatTensor]] = None

@dataclass
class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
    """
    Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
@auto_docstring(
    custom_intro="""
    Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
    [`SeamlessM4Tv2TextToUnitModel`].
    """
)
class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
        for *masked*
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
        for *masked*
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    padding_mask: Optional[torch.Tensor] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
    loss: Optional[torch.FloatTensor] = None

@dataclass
class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
    """
@auto_docstring(
    custom_intro="""
    Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
            for *masked*
    """
)
class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
    r"""
    padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
        for *masked*
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    padding_mask: Optional[torch.Tensor] = None

@@ -296,7 +266,7 @@ class SeamlessM4Tv2ConformerSelfAttention(nn.Module):
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        # self-attention mechanism
        batch_size, sequence_length, hidden_size = hidden_states.size()

@@ -351,8 +321,8 @@
        return attn_output, attn_weights


class SeamlessM4Tv2ConformerEncoderLayer(nn.Module):
    """Conformer block based on https://arxiv.org/abs/2005.08100."""
class SeamlessM4Tv2ConformerEncoderLayer(GradientCheckpointingLayer):
    """Conformer block based on https://huggingface.co/papers/2005.08100."""

    def __init__(self, config):
        super().__init__()
@@ -423,7 +393,7 @@ class SeamlessM4Tv2ConformerEncoder(nn.Module):
        self.config = config

        self.dropout = nn.Dropout(config.speech_encoder_dropout)
        self.layers = nn.ModuleList(
            [SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
        )

@@ -498,7 +468,7 @@
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            dropout_probability = torch.rand([])

            skip_the_layer = (
@@ -506,21 +476,12 @@
            )
            if not skip_the_layer or synced_gpus:
                # under fsdp or deepspeed zero3 all gpus must run in sync
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        output_attentions,
                        conv_attention_mask,
                    )
                else:
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        output_attentions=output_attentions,
                        conv_attention_mask=conv_attention_mask,
                    )
                layer_outputs = layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                    conv_attention_mask=conv_attention_mask,
                )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
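
The deleted `if self.gradient_checkpointing ...` branch is not lost: once a layer inherits from `GradientCheckpointingLayer`, the checkpointing dispatch happens inside the layer's own `__call__`. Roughly the idea (a sketch only, not the actual implementation in `modeling_layers.py`):

```python
import torch
import torch.nn as nn
import torch.utils.checkpoint

class GradientCheckpointingLayerSketch(nn.Module):
    """Base class whose subclasses define forward(); calls go through checkpointing when enabled."""
    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            # recompute activations in the backward pass instead of storing them
            return torch.utils.checkpoint.checkpoint(super().__call__, *args, use_reentrant=False, **kwargs)
        return super().__call__(*args, **kwargs)
```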

@@ -835,10 +796,10 @@ class SeamlessM4Tv2Attention(nn.Module):
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        is_cross_attention = encoder_hidden_states is not None
@@ -864,10 +825,10 @@
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # if cross_attention save tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # if uni-directional self-attention (decoder) save tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
@@ -920,7 +881,7 @@ class SeamlessM4Tv2FeedForwardNetwork(nn.Module):
        return hidden_states

class SeamlessM4Tv2EncoderLayer(nn.Module):
class SeamlessM4Tv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SeamlessM4Tv2Config, encoder_ffn_dim=None, encoder_attention_heads=None):
        super().__init__()
        encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
@@ -983,7 +944,7 @@ class SeamlessM4Tv2EncoderLayer(nn.Module):
        return outputs


class SeamlessM4Tv2DecoderLayer(nn.Module):
class SeamlessM4Tv2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
        super().__init__()
        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
@@ -1019,7 +980,7 @@
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> torch.Tensor:
@@ -1098,7 +1059,7 @@
        return outputs


class SeamlessM4Tv2TextToUnitDecoderLayer(nn.Module):
class SeamlessM4Tv2TextToUnitDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
        super().__init__()
        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
@@ -1269,7 +1230,7 @@ class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
            subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
                Corresponding text string for each input id.
            merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`):
                Indicates if the space character is merged with the previous subword. If `False`, it will be merged
@@ -1345,7 +1306,7 @@
        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
            subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
                Corresponding text string for each input id.
            char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                Number of characters per input id.
@@ -1443,7 +1404,7 @@ class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
    ) -> Union[tuple, Wav2Vec2BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1558,7 +1519,7 @@ class SeamlessM4Tv2Encoder(SeamlessM4Tv2PreTrainedModel):
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutput]:
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1635,7 +1596,7 @@
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
@@ -1645,19 +1606,11 @@
            if to_drop:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.forward,
                        hidden_states,
                        attention_mask,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        output_attentions=output_attentions,
                    )
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

                hidden_states = layer_outputs[0]

@@ -1745,13 +1698,13 @@ class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel):
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1810,7 +1763,7 @@
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
@@ -1820,27 +1773,15 @@

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask,
                encoder_hidden_states,  # as a positional argument for gradient checkpointing
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
@@ -1935,7 +1876,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
                    decoder_ffn_dim=config.decoder_ffn_dim,
                )
            )
        self.layers = nn.ModuleList(layers)
        self.layer_norm = nn.LayerNorm(config.hidden_size)

        self.gradient_checkpointing = False
@@ -1956,7 +1897,7 @@
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
    ) -> Union[tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
        r"""
        Args:
            char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
@@ -1994,7 +1935,7 @@

        # predict duration
        log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask)
        dur_out = torch.clamp(torch.round((torch.expm1(log_dur_pred))).long(), min=1)
        dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
        dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0)
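
Since the duration head predicts `log(1 + duration)`, `expm1` is the exact inverse; clamping keeps every real character for at least one frame, and the mask zeroes out padding. A toy trace of those three lines:

```python
import torch

log_dur_pred = torch.tensor([[0.0, 1.1, 2.3, 0.5]])  # per-character log1p durations (made-up values)
char_padding_mask = torch.tensor([[1, 1, 1, 0]])     # last slot is padding
dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0)
print(dur_out)  # tensor([[1, 2, 9, 0]])
```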
|
||||
|
||||
# upsample char hidden states according to predicted duration
|
||||
@ -2013,7 +1954,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
|
||||
all_self_attns = () if output_attentions else None
|
||||
|
||||
for idx, decoder_layer in enumerate(self.layers):
|
||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||
# add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
if self.training:
|
||||
@ -2086,12 +2027,12 @@ class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel):
|
||||
char_input_ids: Optional[torch.LongTensor] = None,
|
||||
char_count_per_id: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
|
||||
) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
@ -2162,7 +2103,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
|
||||
embed_tokens_decoder (`nn.Embedding`, *optional*):
|
||||
input embedding of the decoder.
|
||||
"""
|
||||
# update config - used principaly for bos_token_id etc.
|
||||
# update config - used principality for bos_token_id etc.
|
||||
config = copy.deepcopy(config)
|
||||
for param, val in config.to_dict().items():
|
||||
if param.startswith("t2u_"):
|
||||
@ -2201,14 +2142,14 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
|
||||
char_input_ids: Optional[torch.LongTensor] = None,
|
||||
char_count_per_id: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
**kwargs,
|
||||
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
|
||||
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
|
||||
r"""
|
||||
char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
|
||||
Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
|
||||
@ -2524,7 +2465,7 @@ class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):

def forward(
self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
) -> Tuple[torch.Tensor]:
) -> tuple[torch.Tensor]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@ -2603,45 +2544,10 @@ class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):
nn.utils.remove_weight_norm(self.hifi_gan.conv_post)


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id

if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

return shifted_input_ids
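# Minimal usage sketch for shift_tokens_right above (values illustrative):
import torch

labels = torch.tensor([[12, 47, 2, -100]])  # -100 marks positions ignored by the loss
decoder_input_ids = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=3)
print(decoder_input_ids)  # tensor([[ 3, 12, 47,  2]])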


############ WHOLE MODEL related code ################


SEAMLESS_M4T_V2_T2T_START_DOCSTRING = r"""
Generates sequences of token ids.

<Tip warning={true}>

Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

For an overview of generation strategies and code examples, check out the [following
guide](./generation_strategies).

</Tip>
"""


SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING = r"""
SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS = r"""
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
Input audio features. This should be returned by the [`SeamlessM4Tv2FeatureExtractor`] class or the
[`SeamlessM4Tv2Processor`] class. See [`SeamlessM4Tv2FeatureExtractor.__call__`] for details.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.

@ -2661,7 +2567,7 @@ SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING = r"""
be used by default.

If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
information on the default strategy.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
@ -2669,46 +2575,24 @@ SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING = r"""
model's internal embedding lookup matrix.
"""

SEAMLESS_M4T_V2_T2T_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*):
Indices of input sequence tokens in the vocabulary.

Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
tgt_lang (`str`, *optional*):
The language to use as target language for translation.
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which has the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
Retrieval](https://arxiv.org/abs/2010.00904).
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed to avoid deadlocking with
`FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
kwargs (`Dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model.
"""
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id

if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

return shifted_input_ids


############ WHOLE MODEL related code ################


@auto_docstring(
@ -2764,15 +2648,15 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)

@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING)
@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -2781,7 +2665,7 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@ -2862,7 +2746,6 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
encoder_attentions=encoder_outputs.attentions,
)

@auto_docstring(custom_intro=SEAMLESS_M4T_V2_T2T_START_DOCSTRING, custom_args=SEAMLESS_M4T_V2_T2T_INPUTS_DOCSTRING)
def generate(
self,
input_ids=None,
@ -2875,6 +2758,58 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
**kwargs,
):
"""
|
||||
Generates sequences of token ids.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
|
||||
model's default generation configuration. You can override any `generation_config` by passing the corresponding
|
||||
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
|
||||
|
||||
For an overview of generation strategies and code examples, check out the [following
|
||||
guide](./generation_strategies).
|
||||
|
||||
</Tip>
|
||||
|
||||
Parameters:
|
||||
input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`SeamlessM4Tv2Tokenizer`] or [`SeamlessM4Tv2Processor`]. See
|
||||
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
tgt_lang (`str`, *optional*):
|
||||
The language to use as target language for translation.
|
||||
generation_config (`~generation.GenerationConfig`, *optional*):
|
||||
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
|
||||
passed to generate matching the attributes of `generation_config` will override them. If
|
||||
`generation_config` is not provided, the default will be used, which had the following loading
|
||||
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
|
||||
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
|
||||
default values, whose documentation should be checked to parameterize generation.
|
||||
logits_processor (`LogitsProcessorList`, *optional*):
|
||||
Custom logits processors that complement the default logits processors built from arguments and
|
||||
generation config. If a logit processor is passed that is already created with the arguments or a
|
||||
generation config an error is thrown. This feature is intended for advanced users.
|
||||
stopping_criteria (`StoppingCriteriaList`, *optional*):
|
||||
Custom stopping criteria that complement the default stopping criteria built from arguments and a
|
||||
generation config. If a stopping criteria is passed that is already created with the arguments or a
|
||||
generation config an error is thrown. This feature is intended for advanced users.
|
||||
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
|
||||
If provided, this function constraints the beam search to allowed tokens only at each step. If not
|
||||
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
|
||||
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
|
||||
on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
|
||||
for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
|
||||
Retrieval](https://huggingface.co/papers/2010.00904).
|
||||
synced_gpus (`bool`, *optional*, defaults to `False`):
|
||||
Whether to continue running the while loop until max_length (needed to avoid deadlocking with
|
||||
`FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model.
|
||||
|
||||
Return:
|
||||
[`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
|
||||
or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
|
||||
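# Hedged usage sketch for SeamlessM4Tv2ForTextToText.generate; the checkpoint
# name is an assumption, not taken from this diff:
from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large")

inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt")
output_tokens = model.generate(**inputs, tgt_lang="fra", num_beams=4)
print(processor.decode(output_tokens[0], skip_special_tokens=True))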
@ -2933,61 +2868,6 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
return reordered_past


SEAMLESS_M4T_V2_S2T_START_DOCSTRING = r"""
Generates sequences of token ids.

<Tip warning={true}>

Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

For an overview of generation strategies and code examples, check out the [following
guide](./generation_strategies).

</Tip>
"""


SEAMLESS_M4T_V2_S2T_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.

tgt_lang (`str`, *optional*):
The language to use as target language for translation.
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which has the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
Retrieval](https://arxiv.org/abs/2010.00904).
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed to avoid deadlocking with
`FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
kwargs (`Dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model.
"""


@auto_docstring(
custom_intro="""
The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT.
@ -3036,15 +2916,15 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)

@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING)
@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -3053,7 +2933,7 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@ -3141,7 +3021,6 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
encoder_attentions=encoder_outputs.attentions,
)

@auto_docstring(custom_intro=SEAMLESS_M4T_V2_S2T_START_DOCSTRING, custom_args=SEAMLESS_M4T_V2_S2T_INPUTS_DOCSTRING)
def generate(
self,
input_features=None,
@ -3154,6 +3033,55 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin
**kwargs,
):
"""
Generates sequences of token ids.

<Tip warning={true}>

Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

For an overview of generation strategies and code examples, check out the [following
guide](./generation_strategies).

</Tip>

Parameters:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4Tv2FeatureExtractor`] class or the
[`SeamlessM4Tv2Processor`] class. See [`SeamlessM4Tv2FeatureExtractor.__call__`] for details.

tgt_lang (`str`, *optional*):
The language to use as target language for translation.
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which has the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
Retrieval](https://huggingface.co/papers/2010.00904).
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed to avoid deadlocking with
`FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model.

Return:
[`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
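# Hedged usage sketch for SeamlessM4Tv2ForSpeechToText; the checkpoint name
# and the 16 kHz mono input are assumptions:
import torch
from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2ForSpeechToText.from_pretrained("facebook/seamless-m4t-v2-large")

audio = torch.randn(16000)  # stand-in for one second of 16 kHz audio
inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
tokens = model.generate(**inputs, tgt_lang="eng")
print(processor.decode(tokens[0], skip_special_tokens=True))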
@ -3312,15 +3240,15 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)

@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING)
@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -3328,7 +3256,7 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@ -3470,7 +3398,7 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin


Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
- If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
sequence_length)` and `waveform_lengths` which gives the length of each sample.
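# Hedged sketch: unpacking the default (non-intermediate) generate() output of
# the speech-generating models; `model` and `inputs` are assumed prepared as
# for SeamlessM4Tv2ForTextToSpeech:
waveform, waveform_lengths = model.generate(**inputs, tgt_lang="fra")
for wav, n in zip(waveform, waveform_lengths):
    print(wav[:n].shape)  # trim padding using the reported per-sample length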
@ -3675,15 +3603,15 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)

@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING)
@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -3692,7 +3620,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@ -3838,7 +3766,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix


Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
- If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
sequence_length)` and `waveform_lengths` which gives the length of each sample.
@ -4074,7 +4002,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)

@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS_DOCSTRING)
@auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -4082,8 +4010,8 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
@ -4092,7 +4020,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@ -4286,7 +4214,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
other.

Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor], ModelOutput]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor], ModelOutput]`:
- If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.

@ -16,7 +16,7 @@

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from typing import Optional, Union

import torch
from torch import Tensor, nn
@ -26,6 +26,7 @@ from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
BaseModelOutput,
Seq2SeqLMOutput,
@ -80,85 +81,52 @@ class SeamlessM4Tv2GenerationOutput(SeamlessM4TGenerationOutput):


@dataclass
class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
"""
Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
@auto_docstring(
custom_intro="""
Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
[`SeamlessM4Tv2TextToUnitModel`].
"""
)
class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
r"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.

If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
for *masked*
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
hidden_size)` is output.
padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
for *masked*
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss.
"""

last_hidden_state: Optional[torch.FloatTensor] = None
padding_mask: Optional[torch.Tensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
loss: Optional[torch.FloatTensor] = None

@dataclass
class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
"""
@auto_docstring(
custom_intro="""
Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`].

Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.

Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
for *masked*
"""
)
class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
r"""
padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
for *masked*
"""

last_hidden_state: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
padding_mask: Optional[torch.Tensor] = None

@ -305,7 +273,7 @@ class SeamlessM4Tv2ConformerSelfAttention(nn.Module):
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
# self-attention mechanism
batch_size, sequence_length, hidden_size = hidden_states.size()

@ -361,7 +329,7 @@ class SeamlessM4Tv2ConformerSelfAttention(nn.Module):


class SeamlessM4Tv2ConformerEncoderLayer(Wav2Vec2ConformerEncoderLayer):
"""Conformer block based on https://arxiv.org/abs/2005.08100."""
"""Conformer block based on https://huggingface.co/papers/2005.08100."""

def __init__(self, config):
super().__init__(config)
@ -420,7 +388,7 @@ class SeamlessM4Tv2ConformerEncoder(nn.Module):
self.config = config

self.dropout = nn.Dropout(config.speech_encoder_dropout)
self.layers = nn.ModuleList(
[SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
)

@ -495,7 +463,7 @@ class SeamlessM4Tv2ConformerEncoder(nn.Module):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
# add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
dropout_probability = torch.rand([])

skip_the_layer = (
@ -503,21 +471,12 @@ class SeamlessM4Tv2ConformerEncoder(nn.Module):
)
if not skip_the_layer or synced_gpus:
# under fsdp or deepspeed zero3 all gpus must run in sync
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
conv_attention_mask,
)
else:
layer_outputs = layer(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
conv_attention_mask=conv_attention_mask,
)
layer_outputs = layer(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
conv_attention_mask=conv_attention_mask,
)
hidden_states = layer_outputs[0]

if skip_the_layer:
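# Illustrative sketch of the LayerDrop rule used in the loop above: during
# training each layer is skipped with probability `layerdrop`; under FSDP or
# DeepSpeed ZeRO-3 (`synced_gpus`) the layer still runs so all ranks stay in
# sync, and its output is discarded instead.
import torch

layerdrop, training, synced_gpus = 0.1, True, False  # assumed config values
skip_the_layer = training and torch.rand([]).item() < layerdrop
run_the_layer = not skip_the_layer or synced_gpus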
@ -571,10 +530,10 @@ class SeamlessM4Tv2Attention(BartAttention):
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
past_key_value: Optional[tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""

is_cross_attention = encoder_hidden_states is not None
@ -600,10 +559,10 @@ class SeamlessM4Tv2Attention(BartAttention):
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# if cross_attention save tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# if uni-directional self-attention (decoder) save tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
@ -643,7 +602,7 @@ class SeamlessM4Tv2DecoderLayer(SeamlessM4TDecoderLayer):
pass


class SeamlessM4Tv2TextToUnitDecoderLayer(nn.Module):
class SeamlessM4Tv2TextToUnitDecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
super().__init__()
decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
@ -808,7 +767,7 @@ class SeamlessM4Tv2PreTrainedModel(SeamlessM4TPreTrainedModel):
Args:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
Corresponding text string for each input id.
merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`):
Indicates if the space character is merged with the previous subword. If `False`, it will be merged
@ -884,7 +843,7 @@ class SeamlessM4Tv2PreTrainedModel(SeamlessM4TPreTrainedModel):
Args:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
Corresponding text string for each input id.
char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`):
Number of characters per input id.
@ -1042,7 +1001,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
decoder_ffn_dim=config.decoder_ffn_dim,
)
)
self.layers = nn.ModuleList(layers)
self.layer_norm = nn.LayerNorm(config.hidden_size)

self.gradient_checkpointing = False
@ -1063,7 +1022,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
) -> Union[tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
r"""
Args:
char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
@ -1101,7 +1060,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):

# predict duration
log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask)
dur_out = torch.clamp(torch.round((torch.expm1(log_dur_pred))).long(), min=1)
dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0)

# upsample char hidden states according to predicted duration
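# Sketch of duration-based upsampling (the actual helper in this file may
# differ): each character's hidden state is repeated `dur_out` times.
import torch

char_hidden_states = torch.randn(1, 3, 8)   # (batch, chars, hidden)
dur_out = torch.tensor([[1, 2, 3]])         # predicted frames per character
upsampled = torch.repeat_interleave(char_hidden_states[0], dur_out[0], dim=0)
print(upsampled.shape)  # torch.Size([6, 8])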
@ -1120,7 +1079,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
all_self_attns = () if output_attentions else None

for idx, decoder_layer in enumerate(self.layers):
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
# add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.training:
@ -1188,12 +1147,12 @@ class SeamlessM4Tv2TextToUnitModel(SeamlessM4TTextToUnitModel):
char_input_ids: Optional[torch.LongTensor] = None,
char_count_per_id: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -1262,14 +1221,14 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4TTextToUnitForCo
char_input_ids: Optional[torch.LongTensor] = None,
char_count_per_id: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
r"""
char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
@ -1407,7 +1366,7 @@ class SeamlessM4Tv2CodeHifiGan(SeamlessM4TCodeHifiGan, nn.Module):
# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id
def forward(
self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
) -> Tuple[torch.Tensor]:
) -> tuple[torch.Tensor]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@ -1504,15 +1463,15 @@ SEAMLESS_M4T_V2_T2T_INPUTS_DOCSTRING = r"""
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
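# Sketch of a prefix_allowed_tokens_fn as described above (token ids and
# vocabulary size are hypothetical):
vocab_size = 256102

def force_first_token(batch_id: int, input_ids) -> list[int]:
    # Constrain step 1 to a single token, then allow the full vocabulary.
    if input_ids.shape[-1] < 2:
        return [256047]
    return list(range(vocab_size))

# Passed as: model.generate(..., prefix_allowed_tokens_fn=force_first_token)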
@ -1555,15 +1514,15 @@ SEAMLESS_M4T_V2_S2T_INPUTS_DOCSTRING = r"""
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
@ -1664,7 +1623,7 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4TForTextToSpeech):


Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
- If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
sequence_length)` and `waveform_lengths` which gives the length of each sample.
@ -1874,7 +1833,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4TForSpeechToSpeech):


Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
- If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
sequence_length)` and `waveform_lengths` which gives the length of each sample.
@ -2103,7 +2062,7 @@ class SeamlessM4Tv2Model(SeamlessM4TModel):
other.

Returns:
`Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor], ModelOutput]`:
`Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor], ModelOutput]`:
- If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
- If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.