From 018855de636538aeaf9f49c596f9682431d87f53 Mon Sep 17 00:00:00 2001 From: Drew Ross Date: Thu, 26 Jun 2025 15:54:48 -0500 Subject: [PATCH 1/9] Update PEGASUS-X model card (#38971) * Update PEGASUS-X model card * Add cache_implementation argument in quantization code example * Update CLI example * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Remove TensorFlow and Flax badges --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/pegasus_x.md | 114 ++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md index 379e0362bb7..d581b2e9a38 100644 --- a/docs/source/en/model_doc/pegasus_x.md +++ b/docs/source/en/model_doc/pegasus_x.md @@ -14,35 +14,115 @@ rendered properly in your Markdown viewer. --> -# PEGASUS-X - -
-PyTorch -FlashAttention +
+
+ PyTorch + FlashAttention +
-## Overview +# PEGASUS-X -The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://huggingface.co/papers/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. +[PEGASUS-X](https://huggingface.co/papers/2208.04347) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [Pegasus](./pegasus) model with staggered block-local attention, global encoder tokens, and additional pretraining on long text sequences, enabling it to handle inputs of up to 16,000 tokens. PEGASUS-X matches the performance of much larger models while using fewer parameters. -PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder. +You can find all the original PEGASUS-X checkpoints under the [Google](https://huggingface.co/google/models?search=pegasus-x) organization. -The abstract from the paper is the following: +> [!TIP] +> This model was contributed by [zphang](https://huggingface.co/zphang). +> +> Click on the PEGASUS-X models in the right sidebar for more examples of how to apply PEGASUS-X to different language tasks. -*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.* +The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line. -This model was contributed by [zphang](https://huggingface.co/zphang). The original code can be found [here](https://github.com/google-research/pegasus). + + -## Documentation resources +```py +import torch +from transformers import pipeline -- [Translation task guide](../tasks/translation) -- [Summarization task guide](../tasks/summarization) +pipeline = pipeline( + task="summarization", + model="google/pegasus-x-large", + torch_dtype=torch.bfloat16, + device=0 +) +pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. +Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. 
In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems. +These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure. +This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""") +``` + + - +```py +import torch +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM -PEGASUS-X uses the same tokenizer as [PEGASUS](pegasus). +tokenizer = AutoTokenizer.from_pretrained( + "google/pegasus-x-large" +) +model = AutoModelForSeq2SeqLM.from_pretrained( + "google/pegasus-x-large", + torch_dtype=torch.bfloat16, + device_map="auto", +) - +input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. +Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems. +These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure. +This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +output = model.generate(**input_ids, cache_implementation="static") +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + + + +```bash +echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/pegasus-x-large --device 0 +``` + + + +Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. + +The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4. 
+ +```py +import torch +from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4" +) +model = AutoModelForSeq2SeqLM.from_pretrained( + "google/pegasus-x-large", + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=quantization_config +) + +tokenizer = AutoTokenizer.from_pretrained( + "google/pegasus-x-large" +) + +input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. +Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems. +These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure. +This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +output = model.generate(**input_ids, cache_implementation="static") +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +## Notes + +- PEGASUS-X also uses the [`PegasusTokenizer`]. ## PegasusXConfig From 84e8696caebea4cc8afb16a62d5eaae29f01fdd9 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 26 Jun 2025 14:21:54 -0700 Subject: [PATCH 2/9] [docs] @auto_docstring (#39011) * refactor * feedback --- docs/source/en/_toctree.yml | 2 +- docs/source/en/auto_docstring.md | 215 +++++++++++++------------ docs/source/en/modular_transformers.md | 3 + 3 files changed, 112 insertions(+), 108 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index e3fce128d33..26f4602df82 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -22,7 +22,7 @@ - local: add_new_model title: Legacy model contribution - local: auto_docstring - title: Document your models + title: Documenting a model - local: attention_interface title: Customizing attention function title: Models diff --git a/docs/source/en/auto_docstring.md b/docs/source/en/auto_docstring.md index 19058c00eb2..298a501dbf4 100644 --- a/docs/source/en/auto_docstring.md +++ b/docs/source/en/auto_docstring.md @@ -14,43 +14,26 @@ rendered properly in your Markdown viewer. --> -# Utilizing the @auto_docstring Decorator +# Documenting a model -The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. 
It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions. +The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings, and only focus on documenting new arguments. ---- +This guide describes how to use the `@auto_docstring` decorator and how it works. -## 📜 How it Works +## @auto_docstring -The `@auto_docstring` decorator constructs docstrings by: - -1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function. -2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`). -3. **Overriding or Adding Arguments Descriptions:** - * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions. - * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. -4. **Adding Classes and Functions Introduction:** - * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring. - * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source. -5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`. -6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers. -7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block. -8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. 
Currently, this functionality is only supported for `FastImageProcessorKwargs`. - - ---- - -## 🚀 How to Use @auto_docstring - -### 1. Importing the Decorator -Import the decorator into your modeling file: +Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`). ```python from ...utils import auto_docstring ``` -### 2. Applying to Classes -Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions. +Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it. + + + + +Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring. ```python from transformers.modeling_utils import PreTrainedModel @@ -73,9 +56,7 @@ class MyAwesomeModel(PreTrainedModel): # ... other methods ``` -#### Advanced Class Decoration: - -Arguments can be passed directly to `@auto_docstring` for more control: +Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments. ```python @auto_docstring( @@ -93,7 +74,7 @@ class MySpecialModel(PreTrainedModel): # ... ``` -Or: +You can also choose to only use `custom_intro` and define the custom arguments directly in the class. ```python @auto_docstring( @@ -111,8 +92,10 @@ class MySpecialModel(PreTrainedModel): # ... ``` -### 3. Applying to Functions (e.g., `forward` method) -Apply the decorator above method definitions, such as the `forward` method. + + + +Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature. ```python @auto_docstring @@ -131,9 +114,10 @@ Apply the decorator above method definitions, such as the `forward` method. # ... ``` -#### Advanced Function Decoration: +Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments. + +The `Returns` and `Examples` parts of the docstring can also be manually specified. -Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified: ```python MODEL_COMMON_CUSTOM_ARGS = r""" @@ -180,100 +164,117 @@ class MyModel(PreTrainedModel): # ... ``` ---- + + -### ✍️ Documenting Arguments: Approach & Priority +## Documenting arguments -1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):** - * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`. +There are some rules for documenting different types of arguments and they're listed below. + +- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`. + + If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding. 
+ + +- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class. + + ```py + argument_name (`type`, *optional*, defaults to `X`): + Description of the argument. + Explain its purpose, expected shape/type if complex, and default behavior. + This can span multiple lines. + ``` -2. **New or Custom Arguments:** - * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters). - * **Format:** - ``` - argument_name (`type`, *optional*, defaults to `X`): - Description of the argument. - Explain its purpose, expected shape/type if complex, and default behavior. - This can span multiple lines. - ``` * Include `type` in backticks. - * Add "*optional*" if the argument is not required (has a default value). - * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`). + * Add *optional* if the argument is not required or has a default value. + * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`. -3. **Overriding Standard Arguments:** - * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence. - * The `labels` argument is often customized per model and typically requires a specific docstring. + These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. -4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):** - * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. + ```py + class MyModel(PreTrainedModel): + # ... + @auto_docstring( + custom_intro=""" + This is a custom introduction for the function. + """ + custom_args=r""" + common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`): + Description of common_arg_1 + """ + ) + ``` ---- +## Checking the docstrings -### Usage with [modular files](./modular_transformers) +Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria. -When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator: +* Ensures `@auto_docstring` is applied to relevant mode classes and public methods. +* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments or if they lack a local description are flagged. +* Reminds you to complete placeholders like `` and ``. +* Ensures docstrings are formatted according to the expected docstring style. -- **For standalone models in modular files:** - Apply the `@auto_docstring` decorator just as you would in regular modeling files. 
- -- **For models inheriting from other library models:** - - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file. - - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class. - - > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file. - - -**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly. - ---- - -## ✅ Checking Your Docstrings with `check_auto_docstrings` - -The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI). - -#### What it Checks: - -* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO) -* **Argument Completeness & Consistency:** - * Flags arguments in the signature that are not known standard arguments and lack a local description. - * Ensures documented arguments exist in the signature. (TODO) - * Verifies that types and default values in the docstring match the signature. (TODO) -* **Placeholder Detection:** Reminds you to complete placeholders like `` or ``. -* **Formatting:** Adherence to the expected docstring style. - -#### Running the Check Locally: - -Run this check locally before committing. The common command is: +You can run this check locally - before committing - by running the following command. ```bash make fix-copies ``` -Alternatively, to only perform docstrings and auto-docstring checks, you can use: +`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks. ```bash python utils/check_docstrings.py # to only check files included in the diff without fixing them -# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff -# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files +# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff +# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files ``` -#### Workflow with the Checker: +## modular_model.py files -1. Add `@auto_docstring(...)` to the class or method. -2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block. -3. Run `make fix-copies` (or the `check_docstrings.py` utility). - * For unrecognized arguments lacking documentation, the utility will create placeholder entries. -4. Manually edit these placeholders with accurate types and descriptions. -5. Re-run the check to ensure all issues are resolved. +When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`. ---- +- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file. 
- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.

  If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.

> [!WARNING]
> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.

## How it works

The `@auto_docstring` decorator automatically generates docstrings by:

1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
3. Adding argument descriptions in one of two ways as shown below.

    | method | description | usage |
    |---|---|---|
    | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |

4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.

    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.

5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.

6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.

7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.

    For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specify a custom `Returns` field in a function's docstring.

8. Unrolling kwargs typed with the unpack operator.
For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.

    Currently only supported for [`FastImageProcessorKwargs`].

## Best practices

Follow the best practices below to help maintain consistent and informative documentation for Transformers!

* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
* Document new or custom arguments clearly.
* Run `check_docstrings` locally and iteratively.
diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md
index a7224994da3..76d77e2ffd5 100644
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@@ -540,6 +540,9 @@ This makes it very easy to switch decorators and makes it explicit that the only

 ## Docstring variables

+> [!TIP]
+> Refer to the [Documenting a model](./auto_docstring) guide for more information about how you can use the `@auto_docstring` decorator to help automatically generate consistent docstring arguments.
+
 If an object defined in both the modular and modeling file from which it inherits, the modular definition has precedence unless for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable can use the definition found in the source file without copying the whole docstring, by simply setting the variable to `None` in the modular file. This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2).

From a52478253bbe522a420e88ea3940d4d98a935300 Mon Sep 17 00:00:00 2001
From: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Date: Thu, 26 Jun 2025 14:40:45 -0700
Subject: [PATCH 3/9] [docs] Tensor parallelism (#38241)

* updates
* feedback
* badges
* fix?
* fix?
* fix?
* fix?
--- docs/source/en/_toctree.yml | 2 +- docs/source/en/model_doc/cohere.md | 1 + docs/source/en/model_doc/cohere2.md | 1 + docs/source/en/model_doc/gemma.md | 1 + docs/source/en/model_doc/gemma2.md | 1 + docs/source/en/model_doc/glm.md | 1 + docs/source/en/model_doc/granite.md | 1 + docs/source/en/model_doc/llama.md | 1 + docs/source/en/model_doc/llama2.md | 1 + docs/source/en/model_doc/llama3.md | 1 + docs/source/en/model_doc/llama4.md | 1 + docs/source/en/model_doc/mistral.md | 1 + docs/source/en/model_doc/mixtral.md | 1 + docs/source/en/model_doc/olmo.md | 1 + docs/source/en/model_doc/phi.md | 1 + docs/source/en/model_doc/phi3.md | 1 + docs/source/en/model_doc/qwen2.md | 1 + docs/source/en/model_doc/qwen2_moe.md | 1 + docs/source/en/model_doc/qwen2_vl.md | 1 + docs/source/en/model_doc/starcoder2.md | 1 + docs/source/en/perf_infer_gpu_multi.md | 392 ++++++++++++------------- docs/source/en/perf_train_gpu_many.md | 2 + 22 files changed, 209 insertions(+), 206 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 26f4602df82..f569a09e588 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -97,7 +97,7 @@ - local: perf_infer_gpu_one title: GPU - local: perf_infer_gpu_multi - title: Distributed GPU inference + title: Distributed inference - local: perf_infer_cpu title: CPU - local: tf_xla diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 21ae73c9477..08087b14c46 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -3,6 +3,7 @@ PyTorch FlashAttention SDPA + Tensor parallelism
diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index 3b0b6e1740a..24f64966639 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -4,6 +4,7 @@ PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index 416d3ac85cf..63e4d0409fd 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -23,6 +23,7 @@ rendered properly in your Markdown viewer. "> FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index 50c08803000..84f11b1eb24 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. "> FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index bf5b95ac14f..4a1618459b0 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 0f54db1bd2e..bdc71c2997a 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism # Granite diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index bcdca5583a6..183775bcadb 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. "> FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/llama2.md b/docs/source/en/model_doc/llama2.md index 5365fa1767f..a2e697e89d1 100644 --- a/docs/source/en/model_doc/llama2.md +++ b/docs/source/en/model_doc/llama2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer. PyTorch Flax + Tensor parallelism diff --git a/docs/source/en/model_doc/llama3.md b/docs/source/en/model_doc/llama3.md index 0bb5e8160c9..ab5c4862c49 100644 --- a/docs/source/en/model_doc/llama3.md +++ b/docs/source/en/model_doc/llama3.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +Tensor parallelism ```py3 diff --git a/docs/source/en/model_doc/llama4.md b/docs/source/en/model_doc/llama4.md index 8e2cd3a2786..07f0919fba3 100644 --- a/docs/source/en/model_doc/llama4.md +++ b/docs/source/en/model_doc/llama4.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention + Tensor parallelism
diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 331449eeacd..f41a486dbbe 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. "> FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 38c0c98ed0b..e0688f35bef 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/model_doc/olmo.md b/docs/source/en/model_doc/olmo.md index c0d227cb549..efa56ce0af8 100644 --- a/docs/source/en/model_doc/olmo.md +++ b/docs/source/en/model_doc/olmo.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md index 1fff19ef829..10f53eb583e 100644 --- a/docs/source/en/model_doc/phi.md +++ b/docs/source/en/model_doc/phi.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index 41753bff5bc..77444d7955b 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 1d0c2b9a527..899d9dddf59 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA + Tensor parallelism diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 0030449a51c..b25ff9b7a3b 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism # Qwen2MoE diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index 39ddbdc006a..926cb5bc4dd 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +Tensor parallelism
## Overview diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 61e70b18fd8..ecb405f4d21 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch FlashAttention SDPA +Tensor parallelism ## Overview diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 37a41c51a4a..f269960d3fc 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -13,21 +13,19 @@ rendered properly in your Markdown viewer. --> -# Tensor parallelism in transformers +# Distributed inference -[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice. -This document assumes that you are already familiar with the basics of tensor parallelism. If you are not, please refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism. +When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice. + +However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple GPUs to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case. > [!TIP] -> Tensor parallelism is very communication intensive, therefore it is reccomended to use it on a single machine with multiple GPUs, utilizing fast intra-node communication. For multi-node training, methods as pipeline or data parallelism are more efficient (depending on your use case). +> Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism to learn more. -Tensor parallelism requires slight changes to the model parameters, therefore in transformers, we support some of the popular models out of the box. - -> [!TIP] -> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below. +Check the list below for models that natively support tensor parallelism. Open a GitHub issue or pull request to add support for a model.
-Supported models +Show supported models * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2) * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2) @@ -43,19 +41,74 @@ Tensor parallelism requires slight changes to the model parameters, therefore in
-## Using 🤗 transformers +This guide shows how to enable tensor parallelism with Transformers and different partitioning strategies. -Transformers provides a simple interface to use for tensor parallelism. We provide multiple classes implementing different partitioning -strategies and a simple entrypoint to parallelize `nn.Module` instance. You won't have to interact with this interface directly, everything is done in `PretrainedModel.from_pretrained` method for you. This section will first talk about the partitioning strategies -we support, then the user interface you will be interacting with, and finally it will teach you how to extend it with your own partitioning -strategies. +## Partitioning a model -### Partitioning strategies +Transformers supports tensor parallelism if a model has a `tp_plan`. There are two plans to partition a model. -In transformers, partitioning strategies reside in a class `ParallelInterface` which works like a mapping from string to the strategy implementation. +- The `auto` tensor parallelism plan partitions a model (see the supported models above) based on a predefined configuration. +- You can also manually specify your own partitioning plan and pass it to the `tp_plan` parameter in [`~PreTrainedModel.from_pretrained`]. + + -```python +```py +import os +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs + +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto") +print(model._tp_plan) + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") +prompt = "Can I help" +inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) + +# distributed run +outputs = model(inputs) +``` + +Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU. + +```bash +torchrun --nproc-per-node 4 demo.py +``` + + + + +Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses a combination of column and row partitioning. Refer to the [Partitioning strategies](#partitioning-strategies) section to learn about other supported partitioning strategies. + +> [!WARNING] +> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about the partitioning strategies, the resulting model can be very slow, even failing or incorrect. Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) to learn more. + +```py +from transformers import AutoModelForCausalLM + +tp_plan = { + "model.layers.*.self_attn.q_proj": "colwise", + "model.layers.*.self_attn.k_proj": "colwise", + "model.layers.*.self_attn.v_proj": "colwise", + "model.layers.*.self_attn.o_proj": "rowwise", + ... +} + +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan) +print(model._tp_plan) +``` + + + + +## Partitioning strategies + +All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. 
You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available. + +```py class ParallelInterface(MutableMapping): """ Dict-like object keeping track of allowed attention functions. You can easily add a new attention function @@ -77,66 +130,32 @@ class ParallelInterface(MutableMapping): } ``` -We support the following strategies: +Refer to the table below to learn more about each strategy. -- `ColwiseParallel` - A simple column-wise partitioning, being able to handle both weights and biases, does exactly what we've discussed before. -- `RowwiseParallel` - Again, row-wise partitioning as dicussed before, supports weights and biases, on top of that it also supports `nn.Embedding` modules. -- `SequenceParallel` - Sequence parallel implementation, for support of `LayerNorm` and `Dropout` layers. Also supports Python implementation of `RMSNorm` (see [this](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34)) -- `PackedColwiseParallel` - A variant of column-wise partitioning, however it works on packed weights (i.e. `up_proj` and `gate_proj` being packed together). For more details, see [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) -- `PackedRowwiseParallel` - A variant of row-wise partitioning, works on packed weights, for more details check the comment linked above. -- `GatherParallel` - A very simple class, that only makes the outputs of the module to be gathered across devices. -- `IsolatedParallel` - This is a special case, where we want to *isolate* the module from the rest of the devices (world). This is used for Experts in MoE layers, basically creating Expert parallelism of sorts. -- `ReplicateParallel` - Many `torch.distributed` APIs break if model is partially sharded, so this class is used to replicate the module across all devices. +| Strategy | Description | +|---|---| +| `ColwiseParallel` | Column-wise partitioning of weights and biases. | +| `RowwiseParallel` | Row-wise partitioning of weights and biases. Also supports partitioning `nn.Embedding` modules. | +| `SequenceParallel` | Sequence parallel implementation to support `LayerNorm` and `Dropout` layers. Also supports Python implementation of [RMSNorm](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34). | +| `PackedColwiseParallel` | Variant of `ColwiseParallel` to support packed weights (for example, packing `up_proj` and `gate_proj` together). Refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details. | +| `PackedRowwiseParallel` | Variant of `RowwiseParallel` to support packed weights (refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details). | +| `GatherParallel` | Gather outputs of the module across devices. | +| `IsolatedParallel` | Used for Experts in Mixture-of-Experts (MoE) layers to isolates module from other devices. | +| `ReplicateParallel` | Replicate modules across all devices to prevent `torch.distributed` APIs from breaking due to a partially sharded model. | -### Sharding a model +### Packed strategies -We provide two ways to shard a model, first one is to use `auto` tensor parallelism plan, which will automatically shard the model based on our predefined configuration. 
This requires the model to have predefined tensor parallel plan in transformers. +Weight packing packs multiple linear layers into a single, bigger layer. Packed strategies, `PackedColwiseParallel` and `PackedRowwiseParallel`, are used to shard packed weights. The more basic `ColwiseParallel` or `RowwiseParallel` will incorrectly shard the packed weights. -```python -from transformers import AutoModelForCausalLM +The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` module and requires the `PackedRowwiseParallel` strategy to shard `gate_up_proj`. -# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs -model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies - -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto") - -print(model._tp_plan) -``` - -> [!TIP] -> For a list of models that support tensor parallelism, see the [Supported models](#supported-models) section above. - -The second way is to manually specify your own partitioning plan. - -```python -from transformers import AutoModelForCausalLM - -tp_plan = { - "model.layers.*.self_attn.q_proj": "colwise", - "model.layers.*.self_attn.k_proj": "colwise", - "model.layers.*.self_attn.v_proj": "colwise", - "model.layers.*.self_attn.o_proj": "rowwise", - ... -} - -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan) - -print(model._tp_plan) -``` - -You might have noticed that there are some special cases in the `ParallelInterface` mapping, let's now talk about them. This will help you understand their purpose and help with extending to other strategies. - -### PackedRowwiseParallel -This class is a special case of `RowwiseParallel`, it's used to shard packed weights. Weight packing is a common technique used in models. It's a technique where we pack multiple linear layers into a single, bigger one. - -For example in `Llama4` model, we pack `up_proj` and `gate_proj` into a single `gate_up_proj` module. ```python class Llama4TextExperts(nn.Module): ... self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim)) ``` -Then in forward, we can use batch matrix multiplication to compute the output of the `gate_up_proj` module. +Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module. ```python def forward(self, hidden_states): @@ -145,185 +164,148 @@ def forward(self, hidden_states): gate, up = gate_up.chunk(2, dim=-1) # Split the output into gate and up ``` -In this case, we need to use the `PackedRowwiseParallel` strategy to shard the `gate_up_proj` module, as using a simple `RowwiseParallel` will shard the layers wrongly. - > [!TIP] -> If this is a bit difficult to wrap your head around, check out [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an amazing visual representation of why `Packed*` needs to be used. +> Refer to [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an visual representation of why `Packed*` needs to be used. 
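The snippet below is a minimal sketch of how a packed module could be wired into a manual `tp_plan`. The module path and checkpoint are illustrative assumptions, and `local_packed_rowwise` is one of the packed-aware keys exposed through the `ParallelInterface` mapping (the local variants are described in the next section).

```python
import torch
from transformers import AutoModelForCausalLM

# Illustrative sketch: shard a packed projection with a packed-aware strategy.
# The module path below is an assumption for a Llama 4-style MoE block; a plain
# "rowwise" entry here would split the packed gate/up weights incorrectly.
tp_plan = {
    "model.layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
}

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    torch_dtype=torch.bfloat16,
    tp_plan=tp_plan,
)
```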
+### Local strategies -### `local*` strategies +Local strategies (`local_colwise`, `local_rowwise`, `local_packed_rowwise`) don't use [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) because it isn't supported for some operations such as [torch.chunk](https://docs.pytorch.org/docs/stable/generated/torch.chunk.html). Instead, local strategies use the basic [torch.Tensor](https://docs.pytorch.org/docs/stable/tensors.html) and performs some of the distributed logic manually. -You could have noticed that there are `local*` strategies, which use the same layers as `*` strategy, but don't use `DTensor` at all. -This is because `DTensor` is not supported for some of the operations: such as `torch.chunk`. Therefore, sometimes we need to use the `local*` strategies, which use vanilla `torch.Tensor` and do some of the distributed logic manually. - - -> [!WARNING] -> Manually specifying your own partitiong plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about this, the resulting model can be very slow, even failing or incorrect. Again, refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) which can teach you everything required. +## Custom partitioning strategies -### Extending the interface with your own partitioning strategies +A custom partitioning strategy should inherit from [`TensorParallelLayer`](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. -This is a very advanced topic, which requires a good understanding of distributed collectives and the model architecture. -Your custom partitioning strategy should inherit from `TensorParallelLayer` defined in [integrations/tensor_parallel.py](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement: `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. Then it should be registered in the `ParallelInterface` mapping, so our dispatching logic can find it when specified in the `tp_plan`. +Then it needs to be registered in the `ParallelInterface` mapping so the dispatching logic can find it when specified in `tp_plan`. -Let's go through this workflow step by step, on an already existing example: `ColwiseParallel`. +The example below shows how to implement `ColwiseParallel` with this workflow. -1. Inherit from `TensorParallelLayer` and initialization +1. Inherit from `TensorParallelLayer`. In the `__init__` method, define `input_layouts` and `output_layouts` to describe how the input and output tensors should be placed on devices. The `desired_input_layouts` attribute is used to specify how the input *should* be placed on devices. 
-```python -class ColwiseParallel(TensorParallelLayer): - def __init__( + ```python + class ColwiseParallel(TensorParallelLayer): + def __init__( + self, + *, + input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer + output_layouts: Optional[Placement] = None, # The output layout we want to achieve + use_local_output: bool = True, # Whether to use local output or not + use_dtensor=True, # Whether to use DTensor or not + ): + self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer + self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding + self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs + self.use_local_output = use_local_output + self.use_dtensor = use_dtensor + ``` + +2. Implement the `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn` methods. + + The `partition_tensor` method partitions the tensor and fills `empty_param` with the partitioned tensor. Use the utility function `get_tensor_shard` to help you get the correct shard of the original parameter for a given rank and `get_packed_weights` to help with packed weights. + + ```python + def partition_tensor( self, - *, - input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer - output_layouts: Optional[Placement] = None, # The output layout we want to achieve - use_local_output: bool = True, # Whether to use local output or not - use_dtensor=True, # Whether to use DTensor or not - ): - self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer - self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding - self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs - self.use_local_output = use_local_output - self.use_dtensor = use_dtensor -``` + param, # Full tensor of the parameter + empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor + param_type, # Type of the parameter, `bias` or `weight` + param_casting_dtype, # The type to cast the parameter to + to_contiguous, # Whether to convert the tensor to a contiguous memory layout + rank, # The rank of the current device + device_mesh, # The device mesh + ) -> nn.Parameter: # Return the partitioned parameter + ... + ``` -In the `__init__` method, we define these attributes, where `input_layouts` and `output_layouts` describing, how the input and output tensors should be placed on the devices. `desired_input_layouts` is used to specify, how the input *SHOULD* be placed on the devices. + The `_prepare_input_fn` and `_prepare_output_fn` methods are used in the [pre-forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks. They redistribute the inputs and outputs to the desired layout as specified in the `__init__`. -2a. Implement `partition_tensor` method + ```python + def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh): + ... + # Do some custom logic, cast to DTensor etc. + ... + return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh) + def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): + ... + # Do some custom logic, cast to DTensor etc. 
+ ... + return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh) + ``` -```python -def partition_tensor( - self, - param, # Full tensor of the parameter - empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor - param_type, # Type of the parameter, `bias` or `weight` - param_casting_dtype, # The type to cast the parameter to - to_contiguous, # Whether to convert the tensor to a contiguous memory layout - rank, # The rank of the current device - device_mesh, # The device mesh -) -> nn.Parameter: # Return the partitioned parameter - ... -``` +3. Register the strategy to [`ParallelInterface`] to enable it for use with `tp_plan`. -This method is used to partition the tensor, and fill the `empty_param` with the partitioned tensor. -We provide some utility functions to help you with this, such as `get_tensor_shard` which will get you the correct shard of the original parameter for this rank or `get_packed_weights` to help with packed weights. + ```python + from transformers.integrations.tensor_parallel import ParallelInterface -2b. Implement `_prepare_input_fn` and `_prepare_output_fn` methods + ParallelInterface.register_strategy("colwise_custom", ColwiseParallel) + tp_plan = { + "model.layers.*.self_attn.q_proj": "colwise_custom", + ... + } + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan) + ``` -These methods are used as [`pre-forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [`forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks respectively. Their purpose is to re-distribute the inputs and outputs to the desired layout, passed in the `__init__` method. +## Benchmarks -```python -def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh): - ... - # Do some custom logic, cast to DTensor etc. - ... - return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh) +Tensor parallelism can considerably speedup inference, especially for inputs with large batch sizes or long sequences. -def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): - ... - # Do some custom logic, cast to DTensor etc. - ... - return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh) -``` - -3. Register the strategy -Congratulations! You've implemented your own partitioning strategy. Now, to use it with your own `tp_plan`, you need to register it in the `ParallelInterface` mapping. - -```python -from transformers.integrations.tensor_parallel import ParallelInterface - -ParallelInterface.register_strategy("colwise_custom", ColwiseParallel) -``` - -And now you can use it in your `tp_plan` as such: - -```python -tp_plan = { - "model.layers.*.self_attn.q_proj": "colwise_custom", - ... -} - -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan) -``` - - -## Full example - -Let's go through a full example of inference with tensor parallelism. 
-```python -import os -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - - -# enable tensor parallelism -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Meta-Llama-3-8B-Instruct", - tp_plan="auto", -) - -# prepare input tokens -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") -prompt = "Can I help" -inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) - -# distributed run -outputs = model(inputs) -``` - -Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU. - -```bash -torchrun --nproc-per-node 4 demo.py -``` - -You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences. - -For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups. +Refer to the chart below for the expected speedup for a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512.
-## Tensor parallelism in-depth -Our implementation of tensor parallelism is framework-agnostic in design, but the specific implementations we've developed rely on the torch.distributed package. We heavily utilize abstractions such as `DeviceMesh` or `DTensor` to provide a simple and extensible interface to the user. +## Design implementation + +The Transformers tensor parallelism implementation is framework-agnostic, but for specific implementations, we rely on [DeviceMesh](https://docs.pytorch.org/tutorials/recipes/distributed_device_mesh.html) and [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) from [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) to provide a simple and extensible interface. ### DeviceMesh -Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, therefore we can create a `DeviceMesh` with multiple submeshes: + +Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, so you can create a `DeviceMesh` with multiple sub-meshes. + ```python from torch.distributed.device_mesh import init_device_mesh # Create a 1D mesh of 4 GPUs device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"]) ``` -Then, most of the `torch.distributed` defined parallelization strategies can be applied to a mesh itself, or its submesh, automatically handling the communication patterns. + +Most of the `torch.distributed` defined parallelization strategies can be applied to the mesh itself, or its sub-mesh, and it automatically handles the communication patterns. ### DTensor -Abbreviation for Distributed Tensor, `DTensor` is a tensor subclass that handles the distributed logic on-top of the usual tensor operations. Most of the model weights in case of tensor parallelism are stored as `DTensor`s (with some exceptions, more on that later). -The most important part of DTensor, that is crucial to understand, is the `placement` attribute. It's an attribute that tells PyTorch how is the tensor placed on the devices of the `DeviceMesh`. +`DTensor` (Distributed Tensor) is a tensor subclass that handles the distributed logic on top of the usual tensor operations. Most of the model weights in tensor parallelism are stored as `DTensor`s. -It can have the following values: +The most important part of DTensor is the `placement` attribute because it tells PyTorch how a tensor is placed on the devices in `DeviceMesh`. The `placement` attribute can take the following values. -- `Shard(dimension)` - Annotates that this `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. For example, if we would like to shard weights for column-wise partitioning, we would do: -```python -weight = ... -weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension -bias = ... -bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension -``` +- `Shard(dimension)` - Indicates how a `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. The example below demonstrates how to shard weights over different dimensions for column-wise partitioning. -To give another example, for row-wise partitioning, we would do: -```python -weight = ... 
-weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension -bias = ... -bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs -``` + ```python + weight = ... + weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension + bias = ... + bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension + ``` -- `Replicate()` - Annotates that this `DTensor` is replicated across the `DeviceMesh`. Very straight-forward, only creates a full copy of the tensor on each device. -- `Partial()` - This placement is mostly of no interest to us, it's used to annotate that this tensor is pending a reduction operation. + This example demonstrates how to shard weights over different dimensions for row-wise partitioning. + + ```python + weight = ... + weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension + bias = ... + bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs + ``` + +- `Replicate()` - Indicates a `DTensor` is replicated across the `DeviceMesh`. It only creates a full copy of the tensor on each device. + + ```py + bias = ... + bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs + ``` + +- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers). \ No newline at end of file diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md index 3dd0845e671..7fdbb9d8afe 100644 --- a/docs/source/en/perf_train_gpu_many.md +++ b/docs/source/en/perf_train_gpu_many.md @@ -91,6 +91,8 @@ Tensor parallelism distributes large tensor computations across multiple GPUs. T Tensor parallelism is effective for training large models that don't fit into the memory of a single GPU. It is also faster and more efficient because each GPU can process its tensor slice in parallel, and it can be combined with other parallelism methods. Like other parallelism methods though, tensor parallelism adds communication overhead between GPUs. +Refer to the [Tensor parallelism](./perf_infer_gpu_multi) guide to learn how to use it for inference. + ## Hybrid parallelism Parallelism methods can be combined to achieve even greater memory savings and more efficiently train models with billions of parameters. 
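The tensor parallelism documentation above introduces `DeviceMesh` and the `DTensor` placements in separate fragments. The sketch below stitches them into one small script, showing the column-wise partitioning of a single linear layer the way the docs describe it. It is only an illustration, not part of the patch: it assumes a recent PyTorch where `torch.distributed.tensor` is a public module (for `distribute_tensor`, `Shard`, and `Replicate`), four available GPUs, and a launch under `torchrun`, for example `torchrun --nproc-per-node 4 sketch.py`; the layer sizes and the `sketch.py` name are made up for the example.

```python
import os

import torch
from torch import nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard, distribute_tensor

# torchrun sets LOCAL_RANK; pin each process to its own GPU
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# 1D tensor-parallel mesh over 4 GPUs, as in the DeviceMesh example above
device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])

# A toy projection layer standing in for something like q_proj; the sizes are arbitrary
linear = nn.Linear(16, 32, device="cuda")

# Column-wise partitioning as in the documentation: shard the weight over dimension 0
# (the output features) and shard the bias over its only dimension, so each rank ends
# up with 8 of the 32 output rows and the matching 8 bias entries
weight = distribute_tensor(linear.weight.data, device_mesh["tp"], placements=[Shard(0)])
bias = distribute_tensor(linear.bias.data, device_mesh["tp"], placements=[Shard(-1)])

# Each rank now holds an 8x16 local shard of the 32x16 weight and 8 of the 32 bias entries
print(weight.placements, weight.to_local().shape, bias.to_local().shape)
```

The row-wise case in the same documentation diff flips this: the weight is sharded along dimension 1 and the bias is replicated on every rank.
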
From 1ccc73dee9018dad5dcbadff31851d7c663b8b51 Mon Sep 17 00:00:00 2001 From: eustlb <94853470+eustlb@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:27:42 +0200 Subject: [PATCH 4/9] [Whisper] fix shape mismatch in tests (#39074) fix shape mismatch --- tests/models/whisper/test_modeling_whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 1b4641f5d49..860ec88b847 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2040,7 +2040,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): [50365, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50629, 50682, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50870, 50911, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 949, 505, 11, 51245, 51287, 1034, 4680, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51494, 51523, 634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 51799, 51815, 50365, 293, 393, 4411, 50431] ]) # fmt: on - torch.testing.assert_close(generated_ids[0], EXPECTED_OUTPUT) + torch.testing.assert_close(generated_ids, EXPECTED_OUTPUT) EXPECTED_TRANSCRIPT = [ { From 0d66ef77921fc77644fe698f2c7c3f49cdd0ffc0 Mon Sep 17 00:00:00 2001 From: Yaswanth Gali <82788246+yaswanth19@users.noreply.github.com> Date: Fri, 27 Jun 2025 15:44:09 +0530 Subject: [PATCH 5/9] Cleanup Attention class for Siglip and dependent models (#39040) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cleanup attention class * More models * more models * Changes * make style * Should fix CI * This should work 🙏 --- .../models/altclip/modeling_altclip.py | 1 + .../models/clipseg/modeling_clipseg.py | 1 + src/transformers/models/emu3/modeling_emu3.py | 13 ++----------- src/transformers/models/git/modeling_git.py | 1 + src/transformers/models/idefics/vision.py | 1 + .../models/idefics2/modeling_idefics2.py | 13 ++----------- .../models/idefics3/modeling_idefics3.py | 13 ++----------- .../models/siglip/modeling_siglip.py | 19 +++---------------- .../models/siglip2/modeling_siglip2.py | 18 +++--------------- .../models/smolvlm/modeling_smolvlm.py | 13 ++----------- .../models/t5gemma/modeling_t5gemma.py | 13 ++----------- .../models/t5gemma/modular_t5gemma.py | 13 ++----------- .../models/x_clip/modeling_x_clip.py | 1 + 13 files changed, 23 insertions(+), 97 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index a8c319f5ec2..8f6f0ff7fbc 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -623,6 +623,7 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index c30d92fcdbf..732712c517c 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -275,6 +275,7 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = 
attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 7a3a177c432..3487138234b 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -606,7 +606,7 @@ class Emu3VQVAEAttentionBlock(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -622,13 +622,7 @@ class Emu3VQVAEAttentionBlock(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -644,9 +638,6 @@ class Emu3VQVAEAttentionBlock(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index a116ecb5517..805192cf5a1 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -620,6 +620,7 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index d75d61545ec..c92bd7ba9c4 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -185,6 +185,7 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 792d5fe3f46..9757a42049f 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -219,7 +219,7 @@ class Idefics2VisionAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -235,13 +235,7 @@ class Idefics2VisionAttention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -257,9 +251,6 @@ class Idefics2VisionAttention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index bb57db42229..a2e0bc78d0f 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -216,7 +216,7 @@ class Idefics3VisionAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -232,13 +232,7 @@ class Idefics3VisionAttention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -254,9 +248,6 @@ class Idefics3VisionAttention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index e56d5bfc89a..b8d6d50f9ae 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -21,7 +21,6 @@ from typing import Any, Callable, Optional, Union import numpy as np import torch -import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from torch.nn.init import _calculate_fan_in_and_fan_out @@ -31,13 +30,10 @@ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int +from ...utils import ModelOutput, auto_docstring, can_return_tuple, torch_int from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig -logger = logging.get_logger(__name__) - - def _trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf @@ -372,7 +368,7 @@ class SiglipAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: 
Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -388,13 +384,7 @@ class SiglipAttention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -410,9 +400,6 @@ class SiglipAttention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index bb147b1ce2c..876a84e0259 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -35,13 +35,10 @@ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging +from ...utils import ModelOutput, auto_docstring, can_return_tuple from .configuration_siglip2 import Siglip2Config, Siglip2TextConfig, Siglip2VisionConfig -logger = logging.get_logger(__name__) - - @dataclass @auto_docstring( custom_intro=""" @@ -266,7 +263,7 @@ class Siglip2Attention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -282,13 +279,7 @@ class Siglip2Attention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -304,9 +295,6 @@ class Siglip2Attention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index f775c371c3d..1b128a0fb63 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -186,7 +186,7 @@ class SmolVLMVisionAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -202,13 +202,7 @@ class SmolVLMVisionAttention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -224,9 +218,6 @@ class SmolVLMVisionAttention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index a7d60d2fa78..feccf6d7d9f 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -1008,8 +1008,6 @@ class T5GemmaModel(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - - **flash_attn_kwargs: flash attention related parameters. """ use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1084,10 +1082,6 @@ class T5GemmaEncoderModel(T5GemmaPreTrainedModel): output_hidden_states: Optional[bool] = None, **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> BaseModelOutput: - r""" - **flash_attn_kwargs: flash attention related parameters. - """ - encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, @@ -1162,7 +1156,6 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1234,7 +1227,7 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): @auto_docstring class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for sequence classification. When set to False, only encoder is used. """ @@ -1286,7 +1279,6 @@ class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1382,7 +1374,7 @@ class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): @auto_docstring class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for token classification. When set to False, only encoder is used. """ @@ -1435,7 +1427,6 @@ class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index b3dbe761a22..ae69ae99100 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -955,8 +955,6 @@ class T5GemmaModel(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - - **flash_attn_kwargs: flash attention related parameters. """ use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1031,10 +1029,6 @@ class T5GemmaEncoderModel(T5GemmaPreTrainedModel): output_hidden_states: Optional[bool] = None, **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> BaseModelOutput: - r""" - **flash_attn_kwargs: flash attention related parameters. 
- """ - encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, @@ -1109,7 +1103,6 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored @@ -1181,7 +1174,7 @@ class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): @auto_docstring class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for sequence classification. When set to False, only encoder is used. """ @@ -1233,7 +1226,6 @@ class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1329,7 +1321,7 @@ class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): @auto_docstring class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None): - """ + r""" is_encoder_decoder (`Optional`, *optional*): Whether use encoder_decoder for token classification. When set to False, only encoder is used. """ @@ -1382,7 +1374,6 @@ class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 7a90c695dc3..0e043f354ee 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -240,6 +240,7 @@ def eager_attention_forward( attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights From 540a10848c26ebec9a0e749d3808333bdae08167 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 27 Jun 2025 12:28:10 +0200 Subject: [PATCH 6/9] fix `Gemma3nProcessorTest` (#39068) * fix * fix * oups forgot style --------- Co-authored-by: ydshieh Co-authored-by: Cyril Vallez --- tests/models/gemma3n/test_processing_gemma3n.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/models/gemma3n/test_processing_gemma3n.py b/tests/models/gemma3n/test_processing_gemma3n.py index 1d30a80c489..ffedb2b98aa 100644 --- a/tests/models/gemma3n/test_processing_gemma3n.py +++ b/tests/models/gemma3n/test_processing_gemma3n.py @@ -36,7 +36,7 @@ if is_speech_available(): class Gemma3nProcessorTest(unittest.TestCase): def setUp(self): # TODO: update to google? - self.model_id = "Google/gemma-3n-E4B-it" + self.model_id = "hf-internal-testing/namespace-google-repo_name-gemma-3n-E4B-it" self.tmpdirname = tempfile.mkdtemp(suffix="gemma3n") self.maxDiff = None @@ -71,6 +71,9 @@ class Gemma3nProcessorTest(unittest.TestCase): self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + # `disable_grouping` is a new attribute that got added on main while gemma3n was being released - so was + # not part of the saved processor + del processor.feature_extractor.disable_grouping self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) @@ -94,6 +97,9 @@ class Gemma3nProcessorTest(unittest.TestCase): self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertIsInstance(processor.tokenizer, GemmaTokenizerFast) + # `disable_grouping` is a new attribute that got added on main while gemma3n was being released - so was + # not part of the saved processor + del processor.feature_extractor.disable_grouping self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, Gemma3nAudioFeatureExtractor) From 371c4711136386075bfb272692860c1d4ee9c1d2 Mon Sep 17 00:00:00 2001 From: BUI Van Tuan <37981884+bvantuan@users.noreply.github.com> Date: Fri, 27 Jun 2025 12:39:37 +0200 Subject: [PATCH 7/9] Fix initialization of OneFormer (#38901) * fix initialization of OneFormer * remove redundant initializations * remove redundant initializations * remove redundant initializations * keep BC --- .../models/oneformer/modeling_oneformer.py | 51 ++++------------ .../oneformer/test_modeling_oneformer.py | 58 +++++++++++++++---- 2 files changed, 57 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 05e22056a51..28eadd3a489 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ 
b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2773,7 +2773,6 @@ class OneFormerPreTrainedModel(PreTrainedModel): elif isinstance(module, OneFormerTransformerDecoder): nn.init.xavier_uniform_(module.query_input_projection.weight, gain=xavier_std) nn.init.constant_(module.query_input_projection.bias, 0) - module.query_input_projection._is_hf_initialized = True elif isinstance(module, OneFormerPixelDecoderEncoderMultiscaleDeformableAttention): nn.init.constant_(module.sampling_offsets.weight.data, 0.0) thetas = torch.arange(module.n_heads, dtype=torch.int64).float() * (2.0 * math.pi / module.n_heads) @@ -2793,24 +2792,9 @@ class OneFormerPreTrainedModel(PreTrainedModel): nn.init.constant_(module.value_proj.bias.data, 0.0) nn.init.xavier_uniform_(module.output_proj.weight.data) nn.init.constant_(module.output_proj.bias.data, 0.0) - elif isinstance(module, OneFormerPixelDecoderEncoderOnly): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) elif isinstance(module, OneFormerPixelDecoder): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) nn.init.normal_(module.level_embed, std=0) - elif isinstance(module, OneFormerTransformerDecoderSelfAttentionLayer): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p, gain=xavier_std) - elif isinstance(module, OneFormerTransformerDecoderCrossAttentionLayer): - for p in module.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p, gain=xavier_std) - elif isinstance(module, OneFormerTransformerDecoderFFNLayer): + elif isinstance(module, OneFormerTransformerDecoderLayer): for p in module.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p, gain=xavier_std) @@ -2818,21 +2802,6 @@ class OneFormerPreTrainedModel(PreTrainedModel): for p in module.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p, gain=xavier_std) - elif isinstance(module, OneFormerPixelLevelModule): - for submodule in module.modules(): - if isinstance(submodule, (nn.Conv2d, nn.Linear)): - submodule.weight.data.normal_(mean=0.0, std=std) - if submodule.bias is not None: - submodule.bias.data.zero_() - elif isinstance(module, OneFormerTextContextDecoder): - for submodule in module.modules(): - if isinstance(submodule, nn.Linear): - nn.init.trunc_normal_(submodule.weight, std=0.02) - if isinstance(submodule, nn.Linear) and submodule.bias is not None: - nn.init.constant_(submodule.bias, 0) - elif isinstance(submodule, nn.LayerNorm): - nn.init.constant_(submodule.bias, 0) - nn.init.constant_(submodule.weight, 1.0) elif isinstance(module, OneFormerTextTransformer): proj_std = (module.width**-0.5) * ((2 * module.num_layers) ** -0.5) attn_std = module.width**-0.5 @@ -2848,16 +2817,11 @@ class OneFormerPreTrainedModel(PreTrainedModel): if hasattr(module, "reference_points"): nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) nn.init.constant_(module.reference_points.bias.data, 0.0) - elif isinstance(module, OneFormerTaskModel): + elif isinstance(module, OneFormerMLPPredictionHead): for submodule in module.modules(): - if isinstance(module, OneFormerMLPPredictionHead): - for submodule in module.modules(): - if isinstance(submodule, nn.Linear): - nn.init.xavier_uniform_(submodule.weight, gain=xavier_std) - nn.init.constant_(submodule.bias, 0) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) + if isinstance(submodule, nn.Linear): + nn.init.xavier_uniform_(submodule.weight, gain=xavier_std) + nn.init.constant_(submodule.bias, 0) 
elif isinstance(module, nn.MultiheadAttention): module.in_proj_weight.data.normal_(mean=0.0, std=std) module.in_proj_bias.data.zero_() @@ -2865,10 +2829,15 @@ class OneFormerPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + elif isinstance(module, OneFormerLoss): + module.logit_scale.data.fill_(np.log(1 / self.config.contrastive_temperature)) @auto_docstring diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 0ce791dd3c9..58a93a8c4fa 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -13,14 +13,13 @@ # limitations under the License. """Testing suite for the PyTorch OneFormer model.""" -import copy import inspect import unittest import numpy as np from tests.test_modeling_common import floats_tensor -from transformers import OneFormerConfig, is_torch_available, is_vision_available +from transformers import AutoModelForImageClassification, OneFormerConfig, is_torch_available, is_vision_available from transformers.testing_utils import ( is_flaky, require_timm, @@ -35,7 +34,7 @@ from transformers.testing_utils import ( from transformers.utils import cached_property from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin +from ...test_modeling_common import ModelTesterMixin, _config_zero_init from ...test_pipeline_mixin import PipelineTesterMixin @@ -51,14 +50,6 @@ if is_vision_available(): from PIL import Image -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - return configs_no_init - - class OneFormerModelTester: def __init__( self, @@ -375,6 +366,7 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.is_training = True config.contrastive_temperature = 1 configs_no_init = _config_zero_init(config) @@ -382,12 +374,56 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: + if ( + "self_attn.sampling_offsets.bias" in name + or "self_attn.value_proj.weight" in name + or "self_attn.output_proj.weight" in name + or "self_attn.in_proj_weight" in name + or "self_attn.out_proj.weight" in name + or "mlp.fc1.weight" in name + or "mlp.fc2.weight" in name + or "text_mapper.text_encoder.positional_embedding" in name + or "text_mapper.text_encoder.token_embedding.weight" in name + ): + continue self.assertIn( ((param.data.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + def test_initialization_pretrained_backbone(self): + backbone_name = "microsoft/resnet-18" + + # load OneFormerConfig config with a pretrained backbone + config = OneFormerConfig( + backbone=backbone_name, + 
use_pretrained_backbone=True, + ) + + # load pretrained backbone + backbone_model = AutoModelForImageClassification.from_pretrained(backbone_name, device_map=torch_device) + + def params_match(params1, params2): + return all((p1 == p2).all() for p1, p2 in zip(params1, params2)) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device).eval() + if model.__class__.__name__ == "OneFormerModel": + self.assertTrue( + params_match( + backbone_model.base_model.encoder.parameters(), + model.pixel_level_module.encoder.encoder.parameters(), + ) + ) + elif model.__class__.__name__ == "OneFormerForUniversalSegmentation": + self.assertTrue( + params_match( + backbone_model.base_model.encoder.parameters(), + model.model.pixel_level_module.encoder.encoder.parameters(), + ) + ) + def test_training(self): if not self.model_tester.is_training: self.skipTest(reason="model_tester.is_training is set to False") From cb17103bd5e31373e090f2f37602dcc992c017e4 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Fri, 27 Jun 2025 13:51:46 +0200 Subject: [PATCH 8/9] Uninstallling Flash attention from quantization docker (#39078) * update * revert --- docker/transformers-quantization-latest-gpu/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index c860dabd6ac..ad9cf891e25 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -93,6 +93,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs RUN python3 -m pip uninstall -y kernels +# Uninstall flash-attn installed by autoawq, it causes issues here : https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131 +RUN python3 -m pip uninstall -y flash-attn + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
RUN cd transformers && python3 setup.py develop From 0106a50a6bcf6eb0d4ef28dfda68e8becc3531e3 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Fri, 27 Jun 2025 20:01:53 +0800 Subject: [PATCH 9/9] fix a bunch of XPU UT failures on stock PyTorch 2.7 and 2.8 (#39069) * fix a bunch of XPU UT failures on stock PyTorch 2.7 and 2.8 Signed-off-by: YAO Matrix * qwen3 Signed-off-by: YAO Matrix * quanto Signed-off-by: YAO Matrix * models Signed-off-by: YAO Matrix * fix style Signed-off-by: YAO Matrix * idefics2 Signed-off-by: YAO Matrix --------- Signed-off-by: YAO Matrix --- tests/models/aria/test_modeling_aria.py | 34 +++++++++++-------- .../aya_vision/test_modeling_aya_vision.py | 5 +-- tests/models/gpt2/test_modeling_gpt2.py | 1 + .../models/idefics2/test_modeling_idefics2.py | 1 + .../test_modeling_llava_onevision.py | 7 ++-- tests/models/mixtral/test_modeling_mixtral.py | 2 ++ .../models/qwen2_vl/test_modeling_qwen2_vl.py | 25 +++++++++----- tests/models/qwen3/test_modeling_qwen3.py | 1 + .../quanto_integration/test_quanto.py | 8 +++-- 9 files changed, 53 insertions(+), 31 deletions(-) diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py index 1a2c72a72bf..747963aa50e 100644 --- a/tests/models/aria/test_modeling_aria.py +++ b/tests/models/aria/test_modeling_aria.py @@ -30,6 +30,7 @@ from transformers import ( ) from transformers.models.idefics3 import Idefics3VisionConfig from transformers.testing_utils import ( + Expectations, backend_empty_cache, require_bitsandbytes, require_torch, @@ -483,23 +484,26 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase): device=model.device, dtype=model.dtype ) - EXPECTED_OUTPUT = { - "cpu": [ - "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", - "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a grassy hill. The alpaca has", - ], # cpu output - "cuda": [ - "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", - "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The", - ], # cuda output - "xpu": [ - "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", - "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a grassy hill. The alpaca has", - ], # xpu output - } + EXPECTED_OUTPUTS = Expectations( + { + ("cpu", None): [ + "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", + "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a grassy hill. 
The alpaca has", + ], + ("cuda", None): [ + "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", + "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The", + ], + ("xpu", 3): [ + "<|im_start|>user\n \n \n USER: What's the difference of two images?\n ASSISTANT: \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with", + "<|im_start|>user\n \n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The", + ], + } + ) # fmt: skip + EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() generate_ids = model.generate(**inputs, max_new_tokens=20) outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertListEqual(outputs, EXPECTED_OUTPUT[model.device.type]) + self.assertListEqual(outputs, EXPECTED_OUTPUT) def test_tokenizer_integration(self): model_id = "rhymes-ai/Aria" diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py index eaa5aebe846..5cde1f216ec 100644 --- a/tests/models/aya_vision/test_modeling_aya_vision.py +++ b/tests/models/aya_vision/test_modeling_aya_vision.py @@ -422,7 +422,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): expected_outputs = Expectations( { - ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.", + ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.", # 4-bit ("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n", ("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.", @@ -434,6 +434,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): @slow @require_torch_accelerator + @require_deterministic_for_xpu def test_small_model_integration_generate_chat_template(self): processor = AutoProcessor.from_pretrained(self.model_checkpoint) model = self.get_model() @@ -458,7 +459,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): expected_outputs = Expectations( { - ("xpu", 3): "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,", + ("xpu", 3): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,', # 4-bit ("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,', ("cuda", 8): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. 
The cats,', diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 64ebd236a23..d0796468c39 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -823,6 +823,7 @@ class GPT2ModelLanguageGenerationTest(unittest.TestCase): ("rocm", None): 'Today is a nice day and we can do this again."\n\nDana said that she will', ("rocm", (9, 5)): "Today is a nice day and if you don't know anything about the state of play during your holiday", ("cuda", None): "Today is a nice day and if you don't know anything about the state of play during your holiday", + ("xpu", 3): "Today is a nice day and if you don't know anything about the state of play during your holiday", } ) # fmt: skip EXPECTED_OUTPUT = expected_outputs.get_expectation() diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index f8f2ac414d1..6ce19ddfade 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -624,6 +624,7 @@ class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): expected_generated_texts = Expectations( { + ("xpu", 3): "In this image, we see the Statue of Liberty, the Hudson River,", ("cuda", None): "In this image, we see the Statue of Liberty, the Hudson River,", ("rocm", (9, 5)): "In this image, we see the Statue of Liberty, the New York City", } diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 9915d47e0e2..f482f0a0680 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -389,16 +389,15 @@ class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase): EXPECTED_DECODED_TEXTS = Expectations( { + ("xpu", 3): 'user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different model or method. The models are color-coded and labeled with their respective names. The axes are labeled with terms such as "VQA," "GQA," "MQA," "VQAv2," "MM-Vet," "LLaVA-Bench," "LLaVA-1', ("cuda", 7): 'user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different model or method. The models are color-coded and labeled with their respective names. The axes are labeled with terms such as "VQA," "GQA," "MQA," "VQAv2," "MM-Vet," "LLaVA-Bench," "LLaVA-1', ("cuda", 8): 'user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different model or method. The models are color-coded and labeled with their respective names. The axes are labeled with terms such as "VQA," "GQA," "MQA," "VIZ," "TextVQA," "SQA-IMG," and "MQE." 
The radar chart shows', } ) # fmt: skip EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation() + DECODED_TEXT = self.processor.decode(output[0], skip_special_tokens=True) - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) + self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT) @slow @require_bitsandbytes diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 3b53e1cfa53..94ceb0e4a70 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -194,6 +194,7 @@ class MixtralIntegrationTest(unittest.TestCase): # fmt: off EXPECTED_LOGITS_LEFT_UNPADDED = Expectations( { + ("xpu", 3): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7070, 0.2461]]).to(torch_device), ("cuda", 7): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2275, 0.6054], [0.2656, -0.7070, 0.2460]]).to(torch_device), ("cuda", 8): torch.Tensor([[0.2207, 0.5234, -0.3828], [0.8203, -0.2285, 0.6055], [0.2656, -0.7109, 0.2451]]).to(torch_device), ("rocm", 9): torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2285, 0.6055], [0.2637, -0.7109, 0.2451]]).to(torch_device), @@ -203,6 +204,7 @@ class MixtralIntegrationTest(unittest.TestCase): EXPECTED_LOGITS_RIGHT_UNPADDED = Expectations( { + ("xpu", 3): torch.Tensor([[0.2178, 0.1270, -0.1641], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]]).to(torch_device), ("cuda", 7): torch.Tensor([[0.2167, 0.1269, -0.1640], [-0.3496, 0.2988, -1.0312], [0.0688, 0.7929, 0.8007]]).to(torch_device), ("cuda", 8): torch.Tensor([[0.2178, 0.1270, -0.1621], [-0.3496, 0.3008, -1.0312], [0.0693, 0.7930, 0.7969]]).to(torch_device), ("rocm", 9): torch.Tensor([[0.2197, 0.1250, -0.1611], [-0.3516, 0.3008, -1.0312], [0.0684, 0.7930, 0.8008]]).to(torch_device), diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 5299b6a2c11..72669fd390f 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -28,6 +28,7 @@ from transformers import ( is_vision_available, ) from transformers.testing_utils import ( + Expectations, backend_empty_cache, require_flash_attn, require_torch, @@ -482,15 +483,23 @@ class Qwen2VLIntegrationTest(unittest.TestCase): # it should not matter whether two images are the same size or not output = model.generate(**inputs, max_new_tokens=30) + DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True) - EXPECTED_DECODED_TEXT = [ - 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices', - 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets' - ] # fmt: skip - self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) + EXPECTED_DECODED_TEXTS = Expectations( + { + ("xpu", 3): [ + 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. 
Labradors are known for their friendly and intelligent nature, making them popular choices', + 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices', + ], + ("cuda", None): [ + 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices', + 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets', + ], + } + ) # fmt: skip + EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation() + + self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT) @slow @require_flash_attn diff --git a/tests/models/qwen3/test_modeling_qwen3.py b/tests/models/qwen3/test_modeling_qwen3.py index 3f3f5bae083..5f961ac79e0 100644 --- a/tests/models/qwen3/test_modeling_qwen3.py +++ b/tests/models/qwen3/test_modeling_qwen3.py @@ -207,6 +207,7 @@ class Qwen3IntegrationTest(unittest.TestCase): def test_speculative_generation(self): EXPECTED_TEXT_COMPLETIONS = Expectations( { + ("xpu", 3): "My favourite condiment is 100% peanut butter. I love it so much that I can't help but use it", ("cuda", 7): "My favourite condiment is 100% natural. It's a little spicy and a little sweet, but it's the", ("cuda", 8): "My favourite condiment is 100% peanut butter. I love it so much that I can't help but use it", } diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py index 766faafbbfa..a4e0b478697 100644 --- a/tests/quantization/quanto_integration/test_quanto.py +++ b/tests/quantization/quanto_integration/test_quanto.py @@ -223,7 +223,9 @@ class QuantoQuantizationTest(unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: with self.assertRaises(ValueError) as e: self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False) - self.assertIn("The model is quantized with quanto and is not serializable", str(e.exception)) + self.assertIn( + "The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception) + ) # TODO: replace by the following when it works # quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( # tmpdirname, torch_dtype=torch.float32, device_map="cpu" @@ -237,7 +239,9 @@ class QuantoQuantizationTest(unittest.TestCase): with tempfile.TemporaryDirectory() as tmpdirname: with self.assertRaises(ValueError) as e: self.quantized_model.save_pretrained(tmpdirname) - self.assertIn("The model is quantized with quanto and is not serializable", str(e.exception)) + self.assertIn( + "The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception) + ) # quantized_model_from_saved = AutoModelForCausalLM.from_pretrained( # tmpdirname, torch_dtype=torch.float32, device_map="cpu" # )
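
The last patch replaces many hard-coded expected values with the device-keyed `Expectations` helper from `transformers.testing_utils`. As the diffs use it, the helper maps `(device_type, version)` keys, where `None` matches any version of that device type, to expected values, and `get_expectation()` returns the entry for the device the test is currently running on. The sketch below isolates that pattern so the idiom is easier to see; the keys mirror ones that appear in the diffs, while the string values are placeholders rather than real model outputs.

```python
from transformers.testing_utils import Expectations

# Expected values keyed by (device_type, version); None matches any version of
# that device type. The strings here are placeholders, not real model outputs.
EXPECTED_TEXTS = Expectations(
    {
        ("cpu", None): "placeholder output observed on CPU",
        ("cuda", 7): "placeholder output observed on compute capability 7.x GPUs",
        ("cuda", 8): "placeholder output observed on compute capability 8.x GPUs",
        ("xpu", 3): "placeholder output observed on XPU",
    }
)

# Resolves to the single value that matches the current device at test time
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
```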