mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
parent 340500b1a9
commit cbf924b76c
@@ -156,7 +156,7 @@ Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/mo

 <frameworkcontent>
 <pt>
-Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below):
+Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` below):

 ```py
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
@@ -166,7 +166,7 @@ Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the
 ```
 </pt>
 <tf>
-Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below):
+Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `TFAutoClass` below):

 ```py
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
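Editorial note (not part of the diff): the two hunks above document loading a pretrained model and its tokenizer with the Auto classes. A minimal PyTorch sketch of that pattern; the checkpoint name is chosen for illustration.

```py
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Illustrative checkpoint; any sequence-classification model on the Hub works the same way.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```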
@@ -222,7 +222,7 @@ Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als
 Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält:

 * [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token.
-* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen.
+* [attention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen.

 Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben:
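Editorial note (not part of the diff): the hunk above is the German quicktour's description of the tokenizer output and of padding/truncation. A self-contained sketch of the batched call it describes; the checkpoint and sentences are illustrative.

```py
from transformers import AutoTokenizer

# Illustrative checkpoint; the quicktour uses a multilingual sentiment model.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,        # pad to the longest sequence in the batch
    truncation=True,     # truncate to the model's maximum length
    return_tensors="pt", # return PyTorch tensors
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```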
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

 -->
@@ -62,7 +62,7 @@ for _ in range(max_new_tokens):
 # Greedily sample one next token
 next_token_ids = outputs.logits[:, -1:].argmax(-1)
 generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
-# Prepare inputs for the next generation step by leaaving unprocessed tokens, in our case we have only one new token
+# Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token
 # and expanding attn mask for the new token, as explained above
 attention_mask = inputs["attention_mask"]
 attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
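Editorial note (not part of the diff): the hunk above comes from a manual greedy-decoding loop with an explicit KV cache. The sketch below reconstructs a runnable variant of that loop; the `gpt2` checkpoint and the lines not visible in the hunk (the model call and cache handling) are assumptions filled in for illustration.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Small illustrative checkpoint; the documentation this hunk patches uses a larger chat model.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
generated_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
past_key_values = None
max_new_tokens = 5

for _ in range(max_new_tokens):
    outputs = model(
        # After the first step only the newest token is fed; the cache holds the rest.
        input_ids=generated_ids if past_key_values is None else generated_ids[:, -1:],
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        use_cache=True,
    )
    past_key_values = outputs.past_key_values
    # Greedily pick the most likely next token and append it.
    next_token_ids = outputs.logits[:, -1:].argmax(-1)
    generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
    # Grow the attention mask by one position for the newly generated token.
    attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)

print(tokenizer.decode(generated_ids[0]))
```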
@@ -88,7 +88,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", to
 inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

 # `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
-# in the the legacy format
+# in the legacy format
 generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)

 cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
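Editorial note (not part of the diff): a runnable sketch of the legacy-cache round trip shown above. It swaps the Llama-2 checkpoint for `gpt2` so it runs without gated weights; everything else follows the calls visible in the hunk.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

# Small illustrative checkpoint instead of the Llama-2 model used in the docs.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

# Ask generate() for the cache in the legacy tuple format, then wrap it in a DynamicCache.
generation_outputs = model.generate(
    **inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5
)
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
print(type(cache), cache.get_seq_length())
```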
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

 -->
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`.

-Multimodal templates are included in the [Processor](./processors) class and requires an additional `type` key for specifying whether the included content is an image, video, or text.
+Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text.

 This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template
@@ -109,7 +109,7 @@ These inputs are now ready to be used in [`~GenerationMixin.generate`].

 Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).

-- The content `"type"` should be `"video"` to indicate the the content is a video.
+- The content `"type"` should be `"video"` to indicate the content is a video.
 - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).

 > [!WARNING]
@@ -141,7 +141,7 @@ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input

 The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).

-The examples below uses Decord as the backend because it is a bit faster than PyAV.
+The examples below use Decord as the backend because it is a bit faster than PyAV.

 <hfoptions id="sampling">
 <hfoption id="fixed number of frames">
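Editorial note (not part of the diff): the hunks above describe video entries in multimodal chat templates. A hedged sketch of the message layout and `apply_chat_template` call they refer to; the checkpoint, video URL, and frame count are placeholders, and the keyword arguments follow the surrounding guide rather than being verified against every release.

```py
from transformers import AutoProcessor

# Illustrative video-capable checkpoint; any model whose processor ships a video chat template works similarly.
processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

messages = [
    {
        "role": "user",
        "content": [
            # `type` tells the template whether the entry is text, an image, or a video.
            {"type": "video", "url": "https://example.com/clip.mp4"},  # placeholder URL
            {"type": "text", "text": "What is happening in this video?"},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    num_frames=8,                 # sample a fixed number of frames
    video_load_backend="decord",  # as in the docs hunk above
)
```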
@@ -131,7 +131,7 @@ class ResnetModel(PreTrainedModel):
 </hfoption>
 <hfoption id="ResnetModelForImageClassification">

-The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same.
+The `forward` method needs to be rewritten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same.

 > [!TIP]
 > Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.
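Editorial note (not part of the diff): the hunk above concerns rewriting `forward` so it returns a loss when labels are given. A self-contained sketch of that pattern under stated assumptions: the config and class names are invented for illustration, and a torchvision ResNet stands in for the timm backbone used in the guide.

```py
import torch
from torch import nn
from torchvision.models import resnet18
from transformers import PretrainedConfig, PreTrainedModel


class ToyResnetConfig(PretrainedConfig):
    # Hypothetical config, standing in for the guide's ResnetConfig.
    model_type = "toy-resnet"

    def __init__(self, num_classes=10, **kwargs):
        self.num_classes = num_classes
        super().__init__(**kwargs)


class ToyResnetForImageClassification(PreTrainedModel):
    config_class = ToyResnetConfig  # enables AutoClass registration, as the tip above notes

    def __init__(self, config):
        super().__init__(config)
        backbone = resnet18(weights=None)
        backbone.fc = nn.Linear(backbone.fc.in_features, config.num_classes)
        self.model = backbone

    def forward(self, pixel_values, labels=None):
        logits = self.model(pixel_values)
        # The only change versus the plain model class: compute a loss when labels are given.
        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)
        return {"loss": loss, "logits": logits}


config = ToyResnetConfig(num_classes=10)
model = ToyResnetForImageClassification(config)
out = model(torch.randn(2, 3, 224, 224), labels=torch.tensor([0, 3]))
print(out["loss"])
```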
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

 -->
@@ -56,7 +56,7 @@ deepspeed --num_gpus 2 trainer-program.py ...

 ### Order of GPUs

-To select specific GPUs to use and their order, configure the the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:
+To select specific GPUs to use and their order, configure the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:

 ```bash
 CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
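Editorial note (not part of the diff): the same GPU selection can be done from Python by setting the variable before anything initializes CUDA. A small illustrative sketch, not taken from the documentation being patched:

```py
import os

# Must be set before torch (or anything else that initializes CUDA) is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"

import torch

# Physical GPUs 0 and 2 are now remapped to cuda:0 and cuda:1 inside this process.
print(torch.cuda.device_count())
```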
@@ -220,7 +220,7 @@ Pasa tu texto al tokenizador:
 El tokenizador devolverá un diccionario conteniendo:

 * [input_ids](./glossary#input-ids): representaciones numéricas de los tokens.
-* [atttention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos.
+* [attention_mask](.glossary#attention-mask): indica cuáles tokens deben ser atendidos.

 Como con el [`pipeline`], el tokenizador aceptará una lista de inputs. Además, el tokenizador también puede rellenar (pad, en inglés) y truncar el texto para devolver un lote (batch, en inglés) de longitud uniforme:
@@ -23,7 +23,7 @@ Abbiamo integrato di recente `BetterTransformer` per fare inferenza più rapidam

 ## PyTorch JIT-mode (TorchScript)

-TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programmma TorchScript può esere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python.
+TorchScript è un modo di creare modelli serializzabili e ottimizzabili da codice PyTorch. Ogni programma TorchScript può esere salvato da un processo Python e caricato in un processo dove non ci sono dipendenze Python.
 Comparandolo con l'eager mode di default, jit mode in PyTorch normalmente fornisce prestazioni migliori per l'inferenza del modello da parte di metodologie di ottimizzazione come la operator fusion.

 Per una prima introduzione a TorchScript, vedi la Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
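Editorial note (not part of the diff): the Italian hunk above introduces TorchScript. A hedged sketch of tracing a Transformers model with `torch.jit.trace`; the checkpoint is illustrative, and `torchscript=True` is used so the model returns tuple outputs that tracing can handle.

```py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
# `torchscript=True` makes the model return tuples, which torch.jit.trace can handle.
model = AutoModelForSequenceClassification.from_pretrained(model_id, torchscript=True)
model.eval()

inputs = tokenizer("TorchScript example", return_tensors="pt")
traced = torch.jit.trace(model, (inputs["input_ids"], inputs["attention_mask"]))
torch.jit.save(traced, "traced_model.pt")

# The traced program can later be loaded in a process without the original Python code.
loaded = torch.jit.load("traced_model.pt")
logits = loaded(inputs["input_ids"], inputs["attention_mask"])[0]
```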
@@ -222,7 +222,7 @@ Passe o texto para o tokenizer:
 O tokenizer retornará um dicionário contendo:

 * [input_ids](./glossary#input-ids): representações numéricas de seus tokens.
-* [atttention_mask](.glossary#attention-mask): indica quais tokens devem ser atendidos.
+* [attention_mask](.glossary#attention-mask): indica quais tokens devem ser atendidos.

 Assim como o [`pipeline`], o tokenizer aceitará uma lista de entradas. Além disso, o tokenizer também pode preencher e truncar o texto para retornar um lote com comprimento uniforme:
@@ -918,7 +918,7 @@ def add_model_to_main_init(
 new_model_patterns (`ModelPatterns`): The patterns for the new model.
 frameworks (`List[str]`, *optional*):
 If specified, only the models implemented in those frameworks will be added.
-with_processsing (`bool`, *optional*, defaults to `True`):
+with_processing (`bool`, *optional*, defaults to `True`):
 Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
 """
 with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
@@ -94,7 +94,7 @@ VideoInput = Union[
 list["np.ndarray"],
 list["torch.Tensor"],
 list[list["PIL.Image.Image"]],
-list[list["np.ndarrray"]],
+list[list["np.ndarray"]],
 list[list["torch.Tensor"]],
 ] # noqa
@@ -83,7 +83,7 @@ class AlignProcessor(ProcessorMixin):
 arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` arguments to
 EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
-to the doctsring of the above two methods for more information.
+to the docstring of the above two methods for more information.

 Args:
 images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
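Editorial note (not part of the diff): most of the remaining hunks fix the same `doctsring`/`kwrags` typos in processor `__call__` docstrings. For context, a sketch of how such a processor call looks in practice; the CLIP checkpoint and dummy image are illustrative stand-ins.

```py
import numpy as np
from PIL import Image
from transformers import AutoProcessor

# Illustrative checkpoint; the pattern is the same for ALIGN, CLIP, LLaVA, and the other processors patched here.
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # dummy image
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    padding=True,
    return_tensors="pt",
)
print(inputs.keys())  # input_ids, attention_mask, pixel_values
```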
@@ -68,7 +68,7 @@ class AltCLIPProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
 `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -123,7 +123,7 @@ FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(

 FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
 [
-# Model for Image-classsification
+# Model for Image-classification
 ("beit", "FlaxBeitForImageClassification"),
 ("dinov2", "FlaxDinov2ForImageClassification"),
 ("regnet", "FlaxRegNetForImageClassification"),
@@ -39,7 +39,7 @@ class BambaConfig(PretrainedConfig):
 `inputs_ids` passed when calling [`BambaModel`]
 tie_word_embeddings (`bool`, *optional*, defaults to `False`):
 Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
-model has a output word embedding layer.
+model has an output word embedding layer.
 hidden_size (`int`, *optional*, defaults to 4096):
 Dimension of the hidden representations.
 intermediate_size (`int`, *optional*, defaults to 14336):
@@ -85,7 +85,7 @@ class BambaConfig(PretrainedConfig):
 mamba_n_heads (`int`, *optional*, defaults to 128):
 The number of mamba heads used in the v2 implementation.
 mamba_d_head (`int`, *optional*, defaults to `"auto"`):
-Head embeddding dimension size
+Head embedding dimension size
 mamba_n_groups (`int`, *optional*, defaults to 1):
 The number of the mamba groups used in the v2 implementation.
 mamba_d_state (`int`, *optional*, defaults to 256):
@@ -190,12 +190,12 @@ def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"):
 output_new_model = output_new_model_total.logits[:, [-1], :]

 else:
-prediction_codeboook_channel = 3
+prediction_codebook_channel = 3
 n_codes_total = 8
 vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int)

-output_new_model_total = model(prediction_codeboook_channel, vec)
-output_old_model = bark_model(prediction_codeboook_channel, vec)
+output_new_model_total = model(prediction_codebook_channel, vec)
+output_old_model = bark_model(prediction_codebook_channel, vec)

 output_new_model = output_new_model_total.logits
@@ -87,7 +87,7 @@ class ChameleonProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -78,7 +78,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -46,7 +46,7 @@ class ClapProcessor(ProcessorMixin):
 and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to
 encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
 ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
-doctsring of the above two methods for more information.
+docstring of the above two methods for more information.

 Args:
 text (`str`, `List[str]`, `List[List[str]]`):
@@ -63,7 +63,7 @@ class CLIPProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -63,7 +63,7 @@ class CLIPSegProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of
+ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of
 the above two methods for more information.

 Args:
@@ -48,7 +48,7 @@ class ClvpProcessor(ProcessorMixin):
 def __call__(self, *args, **kwargs):
 """
 Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
-argument to [`~ClvpTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
+argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
 information.
 """
@@ -100,11 +100,11 @@ class ColPaliProcessor(PaliGemmaProcessor):
 wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
 both text and images at the same time.

-When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
+When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
 [`~LlamaTokenizerFast.__call__`].
-When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
+When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
 [`~SiglipImageProcessor.__call__`].
-Please refer to the doctsring of the above two methods for more information.
+Please refer to the docstring of the above two methods for more information.

 Args:
 images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -140,11 +140,11 @@ class ColPaliProcessor(ProcessorMixin):
 wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
 both text and images at the same time.

-When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
+When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
 [`~LlamaTokenizerFast.__call__`].
-When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
+When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
 [`~SiglipImageProcessor.__call__`].
-Please refer to the doctsring of the above two methods for more information.
+Please refer to the docstring of the above two methods for more information.

 Args:
 images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -1303,7 +1303,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
 n_ctx (`int`, *optional*):
 Number of tokens or lyrics tokens provided in a single pass.
 embed_dim (`int`, *optional*):
-Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
+Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension,
 if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
 audio_conditioning (`bool`, *optional*, defaults to `False`):
 Whether or not the prior supports conditionning on audio.
@@ -1921,7 +1921,7 @@ class JukeboxPrior(PreTrainedModel):

 def set_metadata_lyric_tokens(self, labels):
 """
-Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
+Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens.
 """
 if self.nb_relevant_lyric_tokens > 0:
 tokens_list = torch.zeros(
@@ -2147,7 +2147,7 @@ class JukeboxPrior(PreTrainedModel):

 def get_encoder_states(self, lyric_tokens, sample=False):
 """
-Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
+Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
 the lyric encoder.
 """
 if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
@@ -49,7 +49,7 @@ class MCTCTProcessor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
 [`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context
 [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
-[`~AutoTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
+[`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
 """
 # For backward compatibility
 if self._in_target_context_manager:
@@ -50,7 +50,7 @@ class Speech2Text2Processor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
 [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
 [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
-Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two
+Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
 methods for more information.
 """
 # For backward compatibility
@@ -86,7 +86,7 @@ class DonutProcessor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
 [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
 [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
-[`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
+[`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
 """
 # For backward compatibility
 legacy = kwargs.pop("legacy", True)
@@ -95,7 +95,7 @@ class Emu3Processor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -481,7 +481,7 @@ class FuyuProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
 encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -66,7 +66,7 @@ class GitProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -103,7 +103,7 @@ class LlavaProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -106,7 +106,7 @@ class LlavaNextProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -123,7 +123,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
 LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. To prepare the video(s),
 this method forwards the `videos` and `kwrags` arguments to LlavaNextVideoImageProcessor's
-[`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring
+[`~LlavaNextVideoImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -114,7 +114,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -81,7 +81,7 @@ class MgpstrProcessor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
 [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
 arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
-refer to the doctsring of the above methods for more information.
+refer to the docstring of the above methods for more information.
 """
 if images is None and text is None:
 raise ValueError("You need to specify either an `images` or `text` input to process.")
@@ -53,7 +53,7 @@ class MusicgenProcessor(ProcessorMixin):
 def __call__(self, *args, **kwargs):
 """
 Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
-argument to [`~T5Tokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
+argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
 information.
 """
 # For backward compatibility
@@ -54,7 +54,7 @@ class MusicgenMelodyProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
 and `kwargs` arguments to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] if `audio` is not
 `None` to pre-process the audio. It also forwards the `text` and `kwargs` arguments to
-PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the doctsring of the above two methods for more information.
+PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.

 Args:
 audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -82,7 +82,7 @@ class OneFormerProcessor(ProcessorMixin):
 `task_inputs` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `task_inputs` is not
 `None` to encode. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
 OneFormerImageProcessor's [`~OneFormerImageProcessor.__call__`] if `images` is not `None`. Please refer to the
-doctsring of the above two methods for more information.
+docstring of the above two methods for more information.

 Args:
 task_inputs (`str`, `List[str]`):
@@ -96,7 +96,7 @@ class Owlv2Processor(ProcessorMixin):
 Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
 `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -110,7 +110,7 @@ class OwlViTProcessor(ProcessorMixin):
 Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
 `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -162,7 +162,7 @@ class PaliGemmaProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
@@ -119,7 +119,7 @@ class PixtralProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -80,7 +80,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
 and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
-WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the doctsring
+WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -80,7 +80,7 @@ class RegNetConvLayer(nn.Module):

 class RegNetEmbeddings(nn.Module):
 """
-RegNet Embedddings (stem) composed of a single aggressive convolution.
+RegNet Embeddings (stem) composed of a single aggressive convolution.
 """

 def __init__(self, config: RegNetConfig):
@@ -47,7 +47,7 @@ class SeamlessM4TProcessor(ProcessorMixin):
 and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
 `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwrags` arguments to
 SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
-to the doctsring of the above two methods for more information.
+to the docstring of the above two methods for more information.

 Args:
 text (`str`, `List[str]`, `List[List[str]]`):
@@ -59,7 +59,7 @@ class SiglipProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` argument to
-SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -79,7 +79,7 @@ class Siglip2Processor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` argument to
-Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+Siglip2ImageProcessor's [`~Siglip2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -51,7 +51,7 @@ class Speech2TextProcessor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
 [`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
 [`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to Speech2TextTokenizer's
-[`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
+[`~Speech2TextTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
 information.
 """
 # For backward compatibility
@@ -81,7 +81,7 @@ class TrOCRProcessor(ProcessorMixin):
 When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
 [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
 [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
-[`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
+[`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
 """
 # For backward compatibility
 if self._in_target_context_manager:
@@ -51,7 +51,7 @@ class TvpProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
-TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring of
+TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the docstring of
 the above two methods for more information.

 Args:
@@ -59,7 +59,7 @@ class TvpProcessor(ProcessorMixin):
 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
+videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]`,:
 `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
 of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
 each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
@@ -103,7 +103,7 @@ class VideoLlavaProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+VideoLlavaImageProcessor's [`~VideoLlavaImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -66,7 +66,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to VisionTextDualEncoderTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
 `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
-AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
 of the above two methods for more information.

 Args:
@@ -81,7 +81,7 @@ class Wav2Vec2BertProcessor(ProcessorMixin):
 Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
 and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not
 `None` to pre-process the audio. To prepare the target sequences(s), this method forwards the `text` and `kwargs` arguments to
-PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the doctsring of the above two methods for more information.
+PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.

 Args:
 audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -127,7 +127,7 @@ class Wav2Vec2BertProcessor(ProcessorMixin):
 """
 If `input_features` is not `None`, this method forwards the `input_features` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.pad`] to pad the input features.
 If `labels` is not `None`, this method forwards the `labels` and `kwargs` arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`] to pad the label(s).
-Please refer to the doctsring of the above two methods for more information.
+Please refer to the docstring of the above two methods for more information.
 """
 if input_features is None and labels is None:
 raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")
@@ -48,7 +48,7 @@ class WhisperProcessor(ProcessorMixin):
 def __call__(self, *args, **kwargs):
 """
 Forwards the `audio` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
-argument to [`~WhisperTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
+argument to [`~WhisperTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
 information.
 """
 # For backward compatibility
@@ -65,14 +65,14 @@ class XCLIPProcessor(ProcessorMixin):
 and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
 VideoMAEImageProcessor's [`~VideoMAEImageProcessor.__call__`] if `videos` is not `None`. Please refer to the
-doctsring of the above two methods for more information.
+docstring of the above two methods for more information.

 Args:
 text (`str`, `List[str]`, `List[List[str]]`):
 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
+videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]`,:
 `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
 of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
 each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
@@ -337,7 +337,7 @@ class OnnxConfig(ABC):
 " `preprocessor` instead.",
 FutureWarning,
 )
-logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
+logger.warning("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
 preprocessor = tokenizer
 if isinstance(preprocessor, PreTrainedTokenizerBase):
 # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
@@ -118,7 +118,7 @@ def export_pytorch(
 " `preprocessor` instead.",
 FutureWarning,
 )
-logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
+logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
 preprocessor = tokenizer

 if issubclass(type(model), PreTrainedModel):
@@ -221,7 +221,7 @@ def export_tensorflow(
 " `preprocessor` instead.",
 FutureWarning,
 )
-logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
+logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
 preprocessor = tokenizer

 model.config.return_dict = True
@@ -296,7 +296,7 @@ def export(
 " `preprocessor` instead.",
 FutureWarning,
 )
-logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
+logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
 preprocessor = tokenizer

 if is_torch_available():
@@ -335,7 +335,7 @@ def validate_model_outputs(
 " `preprocessor` instead.",
 FutureWarning,
 )
-logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
+logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
 preprocessor = tokenizer

 # generate inputs with a different batch_size and seq_len that was used for conversion to properly test
@@ -193,7 +193,7 @@ class HqqHfQuantizer(HfQuantizer):
 unexpected_keys: List[str],
 ):
 """
-Each nn.Linear layer is processsed here.
+Each nn.Linear layer is processed here.
 We first check if the corresponding module state_dict contains already HQQ quantized parameters.
 If not, we create a temp linear layer with the module state_dict params and use it for quantization
 """
@@ -355,7 +355,7 @@ class ModelOutput(OrderedDict):

 if is_modeloutput_subclass and not is_dataclass(self):
 raise TypeError(
-f"{self.__module__}.{self.__class__.__name__} is not a dataclasss."
+f"{self.__module__}.{self.__class__.__name__} is not a dataclass."
 " This is a subclass of ModelOutput and so must use the @dataclass decorator."
 )
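Editorial note (not part of the diff): the hunk above fixes the error message raised when a `ModelOutput` subclass is not a dataclass. A hedged sketch of a correctly decorated subclass; the class and field names are invented for illustration.

```py
from dataclasses import dataclass
from typing import Optional

import torch
from transformers.utils import ModelOutput


@dataclass
class ToyClassifierOutput(ModelOutput):
    # Subclasses of ModelOutput must use @dataclass, otherwise the TypeError shown above is raised.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None


out = ToyClassifierOutput(logits=torch.randn(2, 3))
print(out.logits.shape, out["logits"].shape)  # attribute and dict-style access both work
```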
@ -241,19 +241,19 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||
pass
|
||||
@ -311,7 +311,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||
|
||||
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
|
||||
@ -333,7 +333,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_single(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
@ -355,7 +355,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
|
||||
@ -382,7 +382,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_batch(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
|
||||
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!.
|
||||
prompts = [
|
||||
@ -408,7 +408,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
@require_bitsandbytes
|
||||
def test_small_model_integration_test_llama_batched_regression(self):
|
||||
# Let' s make sure we test the preprocessing to replace what is used
|
||||
# Let's make sure we test the preprocessing to replace what is used
|
||||
model_id = "rhymes-ai/Aria"
|
||||
|
||||
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
|
||||
@ -442,7 +442,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
|
||||
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
|
||||
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
|
||||
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
|
||||
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
|
||||
@ -460,7 +460,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
model = model.eval()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
|
||||
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
|
||||
]
|
||||
|
@ -253,7 +253,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
def test_mismatching_num_image_tokens(self):
|
||||
"""
|
||||
Tests that VLMs through an error with explicit message saying what is wrong
|
||||
when number of images don't match number of image tokens in the text.
|
||||
when number of images doesn't match number of image tokens in the text.
|
||||
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
||||
"""
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@ -306,19 +306,19 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
model(**input_dict)

@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing(self):
pass

@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass

@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@ -345,7 +345,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_small_model_integration_test(self):
# Let' s make sure we test the preprocessing to replace what is used
# Let's make sure we test the preprocessing to replace what is used
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)

prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
@ -364,7 +364,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_small_model_integration_test_llama_single(self):
# Let' s make sure we test the preprocessing to replace what is used
# Let's make sure we test the preprocessing to replace what is used
model_id = "llava-hf/llava-1.5-7b-hf"

model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
@ -386,7 +386,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_small_model_integration_test_llama_batched(self):
# Let' s make sure we test the preprocessing to replace what is used
# Let's make sure we test the preprocessing to replace what is used
model_id = "llava-hf/llava-1.5-7b-hf"

model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
@ -413,7 +413,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_small_model_integration_test_batch(self):
# Let' s make sure we test the preprocessing to replace what is used
# Let's make sure we test the preprocessing to replace what is used
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!.
prompts = [
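The comment about padding is the crux of this batched test: the two prompts differ in length and in image count, so the processor has to pad the text to a common length while stacking the pixel inputs per image. A rough sketch of that behaviour, assuming the usual processor call signature and using tiny in-memory images instead of the test's downloaded fixtures:

```py
import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

prompts = [
    "<image>\nUSER: What are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
    "<image>\nUSER: Describe the image.\nASSISTANT:",
]
# Dummy black images stand in for the real test images.
images = [Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8)) for _ in prompts]

batch = processor(images=images, text=prompts, padding=True, return_tensors="pt")
print(batch["input_ids"].shape)     # both rows padded to the longest sequence
print(batch["pixel_values"].shape)  # one pixel tensor per image in the batch
```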
@ -441,7 +441,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
def test_small_model_integration_test_llama_batched_regression(self):
# Let' s make sure we test the preprocessing to replace what is used
# Let's make sure we test the preprocessing to replace what is used
model_id = "llava-hf/llava-1.5-7b-hf"

# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
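The regression comment refers to the attention backend: multi-image, multi-prompt batches regressed under SDPA, so the test pins the implementation to "eager" when loading. A brief sketch of how that is selected at load time (checkpoint name taken from the surrounding hunks; 4-bit loading assumes bitsandbytes is installed):

```py
from transformers import LlavaForConditionalGeneration

# Force the original eager attention path instead of SDPA so the batched
# multi-image case behaves as it did before the regression.
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    load_in_4bit=True,
    attn_implementation="eager",
)
```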
@ -478,7 +478,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

prompt1 = "<image>\n<image>\nUSER: What's the the difference of two images?\nASSISTANT:"
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
@ -496,7 +496,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
model = model.eval()

EXPECTED_OUTPUT = [
"\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
"\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while",
"\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small",
"\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. The llama is the",
]
@ -617,7 +617,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
generate_ids = model.generate(**inputs, max_new_tokens=50)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

EXPECTED_GENERATION = "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
EXPECTED_GENERATION = "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip
self.assertEqual(output, EXPECTED_GENERATION)

@slow

@ -237,7 +237,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs through an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text.
when number of images doesn't match number of image tokens in the text.
Also we need to test multi-image cases when one prompr has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

@ -234,7 +234,7 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs through an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text.
when number of images doesn't match number of image tokens in the text.
Also we need to test multi-image cases when one prompr has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

@ -231,7 +231,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs through an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text.
when number of images doesn't match number of image tokens in the text.
Also we need to test multi-image cases when one prompr has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()

@ -458,7 +458,7 @@ class GPTQTestExllamaV2(unittest.TestCase):

def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the the generated tokens with the expected tokens
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
self.check_inference_correctness(self.quantized_model)

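`check_inference_correctness` is not shown in this hunk; the idea it implements is simply to run greedy generation on the quantized model and compare the decoded text against a reference. A standalone sketch of that kind of check, using a small public checkpoint as an illustrative stand-in for the GPTQ/ExLlamaV2 model under test:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "sshleifer/tiny-gpt2"  # illustrative stand-in, not the quantized model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello my name is", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10, do_sample=False)
text = tokenizer.decode(out[0], skip_special_tokens=True)

# The real test compares `text` against hard-coded expected generations; here we
# only check that greedy decoding is deterministic across two runs.
out_again = model.generate(**inputs, max_new_tokens=10, do_sample=False)
assert text == tokenizer.decode(out_again[0], skip_special_tokens=True)
```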
@ -1090,7 +1090,7 @@ class ProcessorTesterMixin:
]
]

def dummmy_sample_indices_fn(metadata, **fn_kwargs):
def dummy_sample_indices_fn(metadata, **fn_kwargs):
# sample only the first two frame always
return [0, 1]

@ -1099,7 +1099,7 @@ class ProcessorTesterMixin:
add_generation_prompt=True,
tokenize=True,
return_dict=True,
sample_indices_fn=dummmy_sample_indices_fn,
sample_indices_fn=dummy_sample_indices_fn,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)

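The two hunks above rename `dummmy_sample_indices_fn` to `dummy_sample_indices_fn`; the callback itself is the interesting part: the chat-template call lets the caller decide which video frames to keep by returning frame indices. A minimal sketch of the callback and of how a processor might apply it (the `sample_frames` helper below is an assumption for illustration, not the mixin's code):

```py
from typing import Any, Dict, List


def dummy_sample_indices_fn(metadata: Dict[str, Any], **fn_kwargs) -> List[int]:
    # Sample only the first two frames, regardless of the video metadata.
    return [0, 1]


def sample_frames(frames: List[Any], metadata: Dict[str, Any], sample_indices_fn) -> List[Any]:
    # Assumed shape of the processor-side logic: ask the callback for indices,
    # then keep just those frames.
    indices = sample_indices_fn(metadata)
    return [frames[i] for i in indices]


frames = ["frame0", "frame1", "frame2", "frame3"]
print(sample_frames(frames, {"fps": 1}, dummy_sample_indices_fn))  # ['frame0', 'frame1']
```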
@ -429,7 +429,7 @@ class ImageFeatureExtractionTester(unittest.TestCase):
self.assertEqual(len(videos_list), 1)
self.assertTrue(np.array_equal(videos_list[0][0], images))

# Test a 4d array of images is converted to a a list of 1 video
# Test a 4d array of images is converted to a list of 1 video
images = np.random.randint(0, 256, (4, 16, 32, 3))
videos_list = make_batched_videos(images)
self.assertIsInstance(videos_list[0], list)
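The assertions in this hunk pin down the contract: a single 4D array of frames becomes a list containing one video. A small stand-in that reproduces just that behaviour (not the library's `make_batched_videos`, which also handles nested lists and other input types):

```py
import numpy as np


def batched_videos_sketch(images):
    # A 4D array (num_frames, height, width, channels) is treated as one video,
    # returned as a list of videos where each video is a list of frames.
    if isinstance(images, np.ndarray) and images.ndim == 4:
        return [list(images)]
    raise TypeError("expected a 4D numpy array of frames")


images = np.random.randint(0, 256, (4, 16, 32, 3))
videos_list = batched_videos_sketch(images)
assert len(videos_list) == 1
assert isinstance(videos_list[0], list)
assert np.array_equal(np.stack(videos_list[0]), images)
```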