Merge branch 'main' into add-aimv2-model

Yaswanth Gali 2025-04-30 19:29:58 +05:30 committed by GitHub
commit b053947742
785 changed files with 22509 additions and 4213 deletions

View File

@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
"TRANSFORMERS_IS_CI": True,
"PYTEST_TIMEOUT": 120,
"RUN_PIPELINE_TESTS": False,
# will be adjusted in `CircleCIJob.to_dict`.
"RUN_FLAKY": True,
}
# Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -126,6 +128,8 @@ class CircleCIJob:
def to_dict(self):
env = COMMON_ENV_VARIABLES.copy()
# Do not run tests decorated by @is_flaky on pull requests
env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
env.update(self.additional_env)
job = {
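
A minimal sketch of how that check behaves, assuming CircleCI's standard behavior of setting `CIRCLE_PULL_REQUEST` only on pull request builds (the URL below is a made-up example, not a real PR):

```python
import os

# Illustration only: CIRCLE_PULL_REQUEST is set by CircleCI on PR builds,
# so the expression is True (flaky tests enabled) only on non-PR builds.
os.environ["CIRCLE_PULL_REQUEST"] = "https://github.com/example/repo/pull/1"  # made-up URL
print(os.environ.get("CIRCLE_PULL_REQUEST", "") == "")  # False -> flaky tests stay disabled

del os.environ["CIRCLE_PULL_REQUEST"]
print(os.environ.get("CIRCLE_PULL_REQUEST", "") == "")  # True -> flaky tests run
```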

View File

@ -16,7 +16,7 @@ body:
id: system-info
attributes:
label: System Info
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
placeholder: transformers version, platform, python version, ...
validations:
required: true

View File

@ -6,7 +6,7 @@ body:
id: system-info
attributes:
label: System Info
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
render: shell
placeholder: transformers version, platform, python version, ...
validations:

View File

@ -54,7 +54,7 @@ jobs:
- name: Create model files
run: |
. ~/venv/bin/activate
transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
make style
make fix-copies

View File

@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
To get the OS and software versions automatically, run the following command:
```bash
transformers-cli env
transformers env
```
You can also run the same command from the root of the repository:

View File

@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite

View File

@ -121,7 +121,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
> [!TIP]
> You can also chat with a model directly from the command line.
> ```shell
> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
> transformers chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
> ```
```py

View File

@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):
model = benchmark.config.backend["model"]
# Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
benchmark_name = str(Path(benchmark_name).parts[-1])
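
For illustration, a standalone sketch of the substitution above on a made-up hydra override dirname (the model name and input shapes are assumptions):

```python
import re
from pathlib import Path

model = "openai-community/gpt2"  # assumed model name
report_dir = f"backend.model={model},benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5"

# Strip the backend.model override so only the benchmark parameters remain
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
benchmark_name = str(Path(benchmark_name).parts[-1])
print(benchmark_name)
# benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5
```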

View File

@ -293,7 +293,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
max_cache_len=seq_length + 128,
)
# 3nd call
# 3rd call
start = perf_counter()
output = model.generate(**inputs, past_key_values=past_key_values)
end = perf_counter()

View File

@ -95,7 +95,7 @@ how the code should be written :-)
1. The forward pass of your model should be written entirely in the modeling file, and should be completely independent of other
models in the library. If you want to reuse a block from another model, copy the code and paste it in with a
`# Copied from` comment (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
for a good example and [here](pr_checks#check-copies) for more documentation on Copied from).
for a good example and [here](pr_checks#check-copies) for more documentation on Copied from).
2. The code should be fully understandable, even for a non-native speaker. That means you should choose
descriptive variable names and avoid abbreviations. For example, `activation` is preferable to `act`.
Single-letter variable names are strongly discouraged unless they are an index in a for loop.
@ -402,7 +402,7 @@ Otherwise, we begin by creating a new model. We recommend
an existing model:
```bash
transformers-cli add-new-model-like
transformers add-new-model-like
```
You will be prompted with a questionnaire to enter the basic information about your model.

View File

@ -63,7 +63,7 @@ Once you have made sure that the bug has not already been reported, please
To output the operating system and software versions automatically, run the following command:
```bash
transformers-cli env
transformers env
```
You can also run the same command from the root directory of the repository:

View File

@ -149,6 +149,8 @@
title: TPU
- local: perf_train_special
title: Apple Silicon
- local: perf_train_gaudi
title: Intel Gaudi
- local: perf_hardware
title: Build your own machine
title: Hardware
@ -385,6 +387,8 @@
title: BigBirdPegasus
- local: model_doc/biogpt
title: BioGpt
- local: model_doc/bitnet
title: BitNet
- local: model_doc/blenderbot
title: Blenderbot
- local: model_doc/blenderbot-small
@ -497,6 +501,8 @@
title: Helium
- local: model_doc/herbert
title: HerBERT
- local: model_doc/hgnet_v2
title: HGNet-V2
- local: model_doc/ibert
title: I-BERT
- local: model_doc/jamba
@ -689,6 +695,8 @@
title: ConvNeXTV2
- local: model_doc/cvt
title: CvT
- local: model_doc/d_fine
title: D-FINE
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deformable_detr
@ -1015,6 +1023,8 @@
title: Qwen2VL
- local: model_doc/sam
title: Segment Anything
- local: model_doc/sam_hq
title: Segment Anything High Quality
- local: model_doc/shieldgemma2
title: ShieldGemma2
- local: model_doc/siglip

View File

@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.
```bash
transformers-cli add-new-model-like
transformers add-new-model-like
```
## Create a pull request
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting
## Adapt the model code
The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
The `transformers add-new-model-like` command should have generated a model and configuration file.
- `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
- `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement
If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.
Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
```bash
transformers-cli add-fast-image-processor --model-name your_model_name
transformers add-fast-image-processor --model-name your_model_name
```
This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.

View File

@ -25,12 +25,12 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_
This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
## transformers-cli
## transformers CLI
Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
```bash
transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
transformers chat Qwen/Qwen2.5-0.5B-Instruct
```
<div class="flex justify-center">
@ -40,7 +40,7 @@ transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
For a full list of options, run the command below.
```bash
transformers-cli chat -h
transformers chat -h
```
The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
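
As a rough sketch of what that looks like under the hood (not the CLI's actual implementation), a minimal multi-turn loop can be built directly on the pipeline API; the scripted turns below are placeholders:

```python
from transformers import pipeline

generator = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")
messages = []

# Two scripted turns; the CLI instead reads user input and adds commands like `clear` and `exit`
for user_turn in ["What is it like in New York City?", "Tell me more about the food."]:
    messages.append({"role": "user", "content": user_turn})
    result = generator(messages, max_new_tokens=128)
    messages = result[0]["generated_text"]  # full history, including the new assistant reply
    print("Assistant:", messages[-1]["content"])
```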
@ -76,16 +76,16 @@ print(response[0]["generated_text"][-1]["content"])
(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!
So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
some wild stuff, like that Warhol guy's soup cans and all that jazz.
And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.
Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)
And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -107,9 +107,9 @@ print(response[0]["generated_text"][-1]["content"])
```
```txt
(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
(sarcastically) Oh, yeah, real original, Andy.
But, you know, back in the '60s, it was like, a big deal. People were all about challenging the

View File

@ -77,9 +77,9 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
[[autodoc]] TorchAoConfig
## BitNetConfig
## BitNetQuantConfig
[[autodoc]] BitNetConfig
[[autodoc]] BitNetQuantConfig
## SpQRConfig

View File

@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
```
</hfoption>
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput

View File

@ -0,0 +1,121 @@
<!--Copyright 2025 The BitNet Team and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# BitNet
## Overview
Trained on a corpus of 4 trillion tokens, BitNet b1.58 2B4T demonstrates that native 1-bit LLMs can achieve performance comparable to leading open-weight, full-precision models of similar size, while offering substantial advantages in computational efficiency (memory, energy, latency).
➡️ **Technical Report:** [BitNet b1.58 2B4T Technical Report](https://arxiv.org/abs/2504.12285)
➡️ **Official Inference Code:** [microsoft/BitNet (bitnet.cpp)](https://github.com/microsoft/BitNet)
## Model Variants
Several versions of the model weights are available on Hugging Face:
* [**`microsoft/bitnet-b1.58-2B-4T`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T): Contains the packed 1.58-bit weights optimized for efficient inference. **Use this for deployment.**
* [**`microsoft/bitnet-b1.58-2B-4T-bf16`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-bf16): Contains the master weights in BF16 format. **Use this only for training or fine-tuning purposes.**
* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.
### Model Details
* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
* Uses Rotary Position Embeddings (RoPE).
* Uses squared ReLU (ReLU²) activation in FFN layers.
* Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
* No bias terms in linear or normalization layers.
* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8).
* Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass (see the sketch after this list).
* Activations are quantized to 8-bit integers using absmax quantization (per-token).
* **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
* **Parameters:** ~2 Billion
* **Training Tokens:** 4 Trillion
* **Context Length:** Maximum sequence length of **4096 tokens**.
* *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
* **Training Stages:**
1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
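
The quantizers described in the list above can be sketched in a few lines. This mirrors the description (absmean ternary weights, per-token absmax int8 activations), not the library's internal kernels:

```python
import torch

def absmean_ternary(weight: torch.Tensor):
    # Scale by the mean absolute value, then round into {-1, 0, +1}
    scale = weight.abs().mean().clamp(min=1e-5)
    w_q = (weight / scale).round().clamp(-1, 1)
    return w_q, scale  # dequantize with w_q * scale

def absmax_int8_per_token(x: torch.Tensor):
    # Per-token scaling by the maximum absolute value, then round into the int8 range
    scale = 127.0 / x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5)
    x_q = (x * scale).round().clamp(-128, 127)
    return x_q, scale  # dequantize with x_q / scale

w_q, w_scale = absmean_ternary(torch.randn(128, 128))
x_q, x_scale = absmax_int8_per_token(torch.randn(2, 128))
print(w_q.unique(), x_q.min(), x_q.max())
```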
## Usage tips
**VERY IMPORTANT NOTE ON EFFICIENCY**
> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.
>
> The current execution paths within transformers do not contain the specialized, highly optimized computational kernels required to leverage the advantages of the BitNet architecture. Running the model via transformers will likely result in inference speeds and energy usage comparable to, or potentially worse than, standard full-precision models within this framework on both CPU and GPU.
>
> While you might observe reduced memory usage due to the quantized weights, the primary computational efficiency benefits are not accessible through this standard transformers usage path.
>
> For achieving the efficiency benefits demonstrated in the technical paper, you MUST use the dedicated C++ implementation: [bitnet.cpp](https://github.com/microsoft/BitNet).
### Requirements
```bash
pip install transformers
```
### Example
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "microsoft/bitnet-b1.58-2B-4T"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16
)
# Apply the chat template
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "How are you?"},
]
chat_input = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
# Generate response
chat_outputs = model.generate(chat_input, max_new_tokens=50)
response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special_tokens=True) # Decode only the response part
print("\nAssistant Response:", response)
```
## BitNetConfig
[[autodoc]] BitNetConfig
## BitNetModel
[[autodoc]] BitNetModel
- forward
## BitNetForCausalLM
[[autodoc]] BitNetForCausalLM
- forward

View File

@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
output = model.generate(
**input_ids,
**input_ids,
max_new_tokens=256,
cache_implementation="static"
)
@ -92,10 +92,10 @@ print(filled_text)
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
```
</hfoption>
@ -146,7 +146,7 @@ visualizer("""def func(a, b):
- Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
```py
from transformers import LlamaForCausalLM, CodeLlamaTokenizer
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
PROMPT = '''def remove_non_ascii(s: str) -> str:
@ -155,7 +155,7 @@ visualizer("""def func(a, b):
'''
input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
generated_ids = model.generate(input_ids, max_new_tokens=128)
filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
print(PROMPT.replace("<FILL_ME>", filling))
```

View File

@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
messages = [{"role": "user", "content": "How do plants make energy?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
output = model.generate(
input_ids,
max_new_tokens=100,
do_sample=True,
input_ids,
max_new_tokens=100,
do_sample=True,
temperature=0.3,
cache_implementation="static",
)
@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
# pip install -U flash-attn --no-build-isolation
transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
```
</hfoption>
@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
messages = [{"role": "user", "content": "How do plants make energy?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
output = model.generate(
input_ids,
max_new_tokens=100,
do_sample=True,
input_ids,
max_new_tokens=100,
do_sample=True,
temperature=0.3,
cache_implementation="static",
)

View File

@ -0,0 +1,76 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# D-FINE
## Overview
The D-FINE model was proposed in [D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement](https://arxiv.org/abs/2410.13842) by
Yansong Peng, Hebei Li, Peixi Wu, Yueyi Zhang, Xiaoyan Sun, and Feng Wu.
The abstract from the paper is the following:
*We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD).
FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: this https URL.*
This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber).
The original code can be found [here](https://github.com/Peterande/D-FINE).
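
As a toy illustration of the Fine-grained Distribution Refinement idea (not D-FINE's actual implementation), each box edge can be regressed as the expectation of a predicted distribution over discrete offset bins; the bin count and offset range below are assumptions:

```python
import torch

num_bins = 17                                       # assumed number of discrete bins
bin_offsets = torch.linspace(-0.5, 0.5, num_bins)   # assumed candidate offsets (relative units)

# One predicted distribution per box edge (left, top, right, bottom)
edge_logits = torch.randn(4, num_bins)
probs = edge_logits.softmax(dim=-1)

# The expectation over the distribution gives a fine-grained, differentiable refinement
edge_refinements = (probs * bin_offsets).sum(dim=-1)
print(edge_refinements)  # added to the current box edges at each decoder layer
```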
## Usage tips
```python
>>> import torch
>>> from transformers.image_utils import load_image
>>> from transformers import DFineForObjectDetection, AutoImageProcessor
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = load_image(url)
>>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine_x_coco")
>>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine_x_coco")
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=[(image.height, image.width)], threshold=0.5)
>>> for result in results:
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
... score, label = score.item(), label_id.item()
... box = [round(i, 2) for i in box.tolist()]
... print(f"{model.config.id2label[label]}: {score:.2f} {box}")
cat: 0.96 [344.49, 23.4, 639.84, 374.27]
cat: 0.96 [11.71, 53.52, 316.64, 472.33]
remote: 0.95 [40.46, 73.7, 175.62, 117.57]
sofa: 0.92 [0.59, 1.88, 640.25, 474.74]
remote: 0.89 [333.48, 77.04, 370.77, 187.3]
```
## DFineConfig
[[autodoc]] DFineConfig
## DFineModel
[[autodoc]] DFineModel
- forward
## DFineForObjectDetection
[[autodoc]] DFineForObjectDetection
- forward

View File

@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
```
</hfoption>
@ -213,7 +213,3 @@ echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task
</jax>
</frameworkcontent>

View File

@ -45,9 +45,9 @@ import torch
from transformers import pipeline
classifier = pipeline(
task="text-classification",
model="bhadresh-savani/electra-base-emotion",
torch_dtype=torch.float16,
task="text-classification",
model="bhadresh-savani/electra-base-emotion",
torch_dtype=torch.float16,
device=0
)
classifier("This restaurant has amazing food!")
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
"bhadresh-savani/electra-base-emotion",
)
model = AutoModelForSequenceClassification.from_pretrained(
"bhadresh-savani/electra-base-emotion",
"bhadresh-savani/electra-base-emotion",
torch_dtype=torch.float16
)
inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
```
</hfoption>
@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers-cli run --task text-c
```py
# Example of properly handling padding with attention masks
inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
padding=True,
inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
padding=True,
return_tensors="pt")
outputs = model(**inputs) # automatically uses the attention_mask
```
- When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).
## ElectraConfig

View File

@ -41,7 +41,7 @@ import torch
from transformers import pipeline
pipeline = pipeline(
task="text-generation",
task="text-generation",
model="tiiuae/falcon-7b-instruct",
torch_dtype=torch.bfloat16,
device=0
@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
# pip install -U flash-attn --no-build-isolation
transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
```
</hfoption>
@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
## FalconForQuestionAnswering
[[autodoc]] FalconForQuestionAnswering
- forward
- forward

View File

@ -39,7 +39,7 @@ import torch
from transformers import pipeline
pipeline = pipeline(
"text-generation",
"text-generation",
model="tiiuae/falcon-mamba-7b-instruct",
torch_dtype=torch.bfloat16,
device=0
@ -73,10 +73,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
transformers-cli chat --model_name_or_path tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
```
</hfoption>

View File

@ -1,4 +1,5 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@ -14,31 +15,146 @@ rendered properly in your Markdown viewer.
-->
# Gemma
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
## Overview
# Gemma
The Gemma model was proposed in [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by Gemma Team, Google.
Gemma models are trained on 6T tokens, and released with 2 versions, 2b and 7b.
[Gemma](https://huggingface.co/papers/2403.08295) is a family of lightweight language models with pretrained and instruction-tuned variants, available in 2B and 7B parameters. The architecture is based on a transformer decoder-only design. It features Multi-Query Attention, rotary positional embeddings (RoPE), GeGLU activation functions, and RMSNorm layer normalization.
The abstract from the paper is the following:
The instruction-tuned variant was fine-tuned with supervised learning on instruction-following data, followed by reinforcement learning from human feedback (RLHF) to align the model outputs with human preferences.
*This work introduces Gemma, a new family of open language models demonstrating strong performance across academic benchmarks for language understanding, reasoning, and safety. We release two sizes of models (2 billion and 7 billion parameters), and provide both pretrained and fine-tuned checkpoints. Gemma outperforms similarly sized open models on 11 out of 18 text-based tasks, and we present comprehensive evaluations of safety and responsibility aspects of the models, alongside a detailed description of our model development. We believe the responsible release of LLMs is critical for improving the safety of frontier models, and for enabling the next wave of LLM innovations*
You can find all the original Gemma checkpoints under the [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) release.
Tips:
- The original checkpoints can be converted using the conversion script `src/transformers/models/gemma/convert_gemma_weights_to_hf.py`
> [!TIP]
> Click on the Gemma models in the right sidebar for more examples of how to apply Gemma to different language tasks.
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi), [Pedro Cuenca](https://huggingface.co/pcuenq).
The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="google/gemma-2b",
torch_dtype=torch.bfloat16,
device="cuda",
)
pipeline("LLMs generate text through a process known as", max_new_tokens=50)
```
</hfoption>
<hfoption id="AutoModel">
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=50, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```py
#!pip install bitsandbytes
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-7b",
quantization_config=quantization_config,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(
**input_ids,
max_new_tokens=50,
cache_implementation="static"
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.
```py
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
visualizer = AttentionMaskVisualizer("google/gemma-2b")
visualizer("LLMs generate text through a process known as")
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/gemma-attn-mask.png"/>
</div>
## Notes
- The original Gemma models support standard kv-caching used in many transformer-based language models. You can use the default [`DynamicCache`] instance or a tuple of tensors for past key values during generation. This makes it compatible with typical autoregressive generation workflows.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
past_key_values = DynamicCache()
outputs = model.generate(**input_ids, max_new_tokens=50, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
## GemmaConfig

View File

@ -58,7 +58,7 @@ pipe("Explain quantum computing simply. ", max_new_tokens=50)
</hfoption>
<hfoption id="AutoModel">
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
@ -80,16 +80,16 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```
echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model google/gemma-2-2b --device 0
echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```python
@ -118,7 +118,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
```python
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
visualizer = AttentionMaskVisualizer("google/gemma-2b")
visualizer("You are an assistant. Make sure you print me")
visualizer("You are an assistant. Make sure you print me")
```
<div class="flex justify-center">
@ -137,7 +137,7 @@ visualizer("You are an assistant. Make sure you print me")
inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
max_generated_length = inputs.input_ids.shape[1] + 10
past_key_values = HybridCache(config=model.config, max_batch_size=1,
past_key_values = HybridCache(config=model.config, max_batch_size=1,
max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
```

View File

@ -99,10 +99,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model google/gemma-3-1b-pt --device 0
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0
```
</hfoption>

View File

@ -64,10 +64,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model openai-community/gpt2 --device 0
echo -e "Hello, I'm a language model" | transformers run --task text-generation --model openai-community/gpt2 --device 0
```
</hfoption>
@ -82,16 +82,16 @@ import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype="float16",
bnb_4bit_use_double_quant=True
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype="float16",
bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
"openai-community/gpt2-xl",
quantization_config=quantization_config,
device_map="auto"
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")

View File

@ -0,0 +1,46 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# HGNet-V2
## Overview
An HGNet-V2 (High Performance GPU Net) image classification model.
The HGNet architecture was proposed in [HGNET: A Hierarchical Feature Guided Network for Occupancy Flow Field Prediction](https://arxiv.org/abs/2407.01097) by
Zhan Chen, Chen Tang, and Lu Xiong.
The abstract from the HGNET paper is the following:
*Predicting the motion of multiple traffic participants has always been one of the most challenging tasks in autonomous driving. The recently proposed occupancy flow field prediction method has shown to be a more effective and scalable representation compared to general trajectory prediction methods. However, in complex multi-agent traffic scenarios, it remains difficult to model the interactions among various factors and the dependencies among prediction outputs at different time steps. In view of this, we propose a transformer-based hierarchical feature guided network (HGNET), which can efficiently extract features of agents and map information from visual and vectorized inputs, modeling multimodal interaction relationships. Second, we design the Feature-Guided Attention (FGAT) module to leverage the potential guiding effects between different prediction targets, thereby improving prediction accuracy. Additionally, to enhance the temporal consistency and causal relationships of the predictions, we propose a Time Series Memory framework to learn the conditional distribution models of the prediction outputs at future time steps from multivariate time series. The results demonstrate that our model exhibits competitive performance, which ranks 3rd in the 2024 Waymo Occupancy and Flow Prediction Challenge.*
This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber).
The original code can be found [here](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py).
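
A minimal classification sketch is shown below; the checkpoint id is a placeholder assumption, so substitute an actual HGNet-V2 checkpoint from the Hub:

```python
import torch
from transformers import AutoImageProcessor, HGNetV2ForImageClassification
from transformers.image_utils import load_image

image = load_image("http://images.cocodataset.org/val2017/000000039769.jpg")

checkpoint = "your-org/hgnet-v2-checkpoint"  # placeholder, not a real repo id
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = HGNetV2ForImageClassification.from_pretrained(checkpoint)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```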
## HGNetV2Config
[[autodoc]] HGNetV2Config
## HGNetV2Backbone
[[autodoc]] HGNetV2Backbone
- forward
## HGNetV2ForImageClassification
[[autodoc]] HGNetV2ForImageClassification
- forward

View File

@ -75,10 +75,10 @@ output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
```
</hfoption>

View File

@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model huggyllama/llama-7b --device 0
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model huggyllama/llama-7b --device 0
```
</hfoption>

View File

@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
transformers-cli chat --model_name_or_path meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
transformers chat meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
```
</hfoption>
@ -175,4 +175,3 @@ visualizer("Plants create energy through a process known as")
[[autodoc]] LlamaForSequenceClassification
- forward

View File

@ -76,10 +76,10 @@ tokenizer.decode(predictions).split()
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers run --task fill-mask --model allenai/longformer-base-4096 --device 0
```
</hfoption>
@ -147,42 +147,42 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t
## LongformerForMaskedLM
[[autodoc]] LongformerForMaskedLM
[[autodoc]] LongformerForMaskedLM
- forward
## LongformerForSequenceClassification
[[autodoc]] LongformerForSequenceClassification
[[autodoc]] LongformerForSequenceClassification
- forward
## LongformerForMultipleChoice
[[autodoc]] LongformerForMultipleChoice
[[autodoc]] LongformerForMultipleChoice
- forward
## LongformerForTokenClassification
[[autodoc]] LongformerForTokenClassification
[[autodoc]] LongformerForTokenClassification
- forward
## LongformerForQuestionAnswering
[[autodoc]] LongformerForQuestionAnswering
[[autodoc]] LongformerForQuestionAnswering
- forward
## TFLongformerModel
[[autodoc]] TFLongformerModel
[[autodoc]] TFLongformerModel
- call
## TFLongformerForMaskedLM
[[autodoc]] TFLongformerForMaskedLM
[[autodoc]] TFLongformerForMaskedLM
- call
## TFLongformerForQuestionAnswering
[[autodoc]] TFLongformerForQuestionAnswering
[[autodoc]] TFLongformerForQuestionAnswering
- call
## TFLongformerForSequenceClassification
@ -192,10 +192,10 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t
## TFLongformerForTokenClassification
[[autodoc]] TFLongformerForTokenClassification
[[autodoc]] TFLongformerForTokenClassification
- call
## TFLongformerForMultipleChoice
[[autodoc]] TFLongformerForMultipleChoice
[[autodoc]] TFLongformerForMultipleChoice
- call

View File

@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
# Mistral
[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
the scaling costs of large models with performance and efficient inference. This model uses sliding window attention (SWA) trained with an 8K context length and a fixed cache size to handle longer sequences more effectively. Grouped-query attention (GQA) speeds up inference and reduces memory requirements. Mistral also features a byte-fallback BPE tokenizer to improve token handling and efficiency by ensuring characters are never mapped to out-of-vocabulary tokens.
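
These architectural choices can be inspected directly from a checkpoint's configuration; a quick sketch (attribute names follow the public `MistralConfig` and tokenizer APIs):

```python
from transformers import AutoConfig, AutoTokenizer

config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
print(config.sliding_window)       # sliding window attention size
print(config.num_key_value_heads)  # grouped-query attention uses fewer KV heads
print(config.num_attention_heads)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
print(tokenizer.tokenize("🤗"))    # byte fallback keeps rare characters out of <unk>
```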
You can find all the original Mistral checkpoints under the [Mistral AI_](https://huggingface.co/mistralai) organization.
@ -78,10 +78,10 @@ The example below demonstrates how to chat with [`Pipeline`] or the [`AutoModel`
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```python
echo -e "My favorite condiment is" | transformers-cli chat --model_name_or_path mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
echo -e "My favorite condiment is" | transformers chat mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
```
</hfoption>

View File

@ -76,10 +76,10 @@ print(f"The predicted token is: {predicted_token}")
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "The capital of France is [MASK]." | transformers-cli run --task fill-mask --model google/mobilebert-uncased --device 0
echo -e "The capital of France is [MASK]." | transformers run --task fill-mask --model google/mobilebert-uncased --device 0
```
</hfoption>

View File

@ -77,6 +77,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] MobileNetV1ImageProcessor
- preprocess
## MobileNetV1ImageProcessorFast
[[autodoc]] MobileNetV1ImageProcessorFast
- preprocess
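
A short sketch of opting into the fast processor via the auto class; this assumes `use_fast=True` selects the fast implementation when one is available, and uses an existing MobileNetV1 classification checkpoint:

```python
from transformers import AutoImageProcessor
from transformers.image_utils import load_image

image = load_image("http://images.cocodataset.org/val2017/000000039769.jpg")

processor = AutoImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224", use_fast=True)
inputs = processor(images=image, return_tensors="pt")
print(type(processor).__name__, inputs["pixel_values"].shape)
```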
## MobileNetV1Model
[[autodoc]] MobileNetV1Model

View File

@ -79,10 +79,10 @@ print(f"The predicted token is: {predicted_token}")
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model answerdotai/ModernBERT-base --device 0
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model answerdotai/ModernBERT-base --device 0
```
</hfoption>

View File

@ -70,10 +70,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "The future of AI is" | transformers-cli run --task text-generation --model openai-community/openai-gpt --device 0
echo -e "The future of AI is" | transformers run --task text-generation --model openai-community/openai-gpt --device 0
```
</hfoption>

View File

@ -65,10 +65,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers-cli run --task text-classification --model microsoft/phi-1.5 --device 0
echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers run --task text-classification --model microsoft/phi-1.5 --device 0
```
</hfoption>
@ -102,7 +102,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
model = AutoModelForCausalLM.from_pretrained(
"microsoft/phi-1",
@ -110,12 +110,12 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
device_map="auto",
trust_remote_code=True,
attn_implementation="sdpa")
input_ids = tokenizer('''def print_prime(n):
"""
Print all primes between 1 and n
"""''', return_tensors="pt").to("cuda")
output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

View File

@ -73,6 +73,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] PoolFormerImageProcessor
- preprocess
## PoolFormerImageProcessorFast
[[autodoc]] PoolFormerImageProcessorFast
- preprocess
## PoolFormerModel
[[autodoc]] PoolFormerModel

View File

@ -64,6 +64,11 @@ This model was contributed by [Xrenya](https://huggingface.co/Xrenya). The origi
[[autodoc]] PvtImageProcessor
- preprocess
## PvtImageProcessorFast
[[autodoc]] PvtImageProcessorFast
- preprocess
## PvtForImageClassification
[[autodoc]] PvtForImageClassification

View File

@ -64,7 +64,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen2-1.5B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
@ -86,10 +86,10 @@ generated_ids = model.generate(
model_inputs.input_ids,
cache_implementation="static",
max_new_tokens=512,
do_sample=True,
temperature=0.7,
top_k=50,
top_p=0.95
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
@ -100,11 +100,11 @@ print(response)
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
# pip install -U flash-attn --no-build-isolation
transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
transformers chat Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
```
</hfoption>
@ -121,21 +121,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen2-7B",
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config,
attn_implementation="flash_attention_2"
)
inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

View File

@ -59,7 +59,7 @@ model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
conversation = [
conversations = [
{
"role": "system",
"content": [
@ -115,7 +115,7 @@ model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
conversation = [
conversations = [
{
"role": "system",
"content": [

View File

@ -118,7 +118,7 @@ The example below uses [torchao](../quantization/torchao) to only quantize the w
```python
import torch
from transformers import TorchAoConfig, Gemma3ForConditionalGeneration, AutoProcessor
from transformers import TorchAoConfig, Qwen2_5_VLForConditionalGeneration, AutoProcessor
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

View File

@ -0,0 +1,127 @@
# SAM-HQ
## Overview
SAM-HQ (High-Quality Segment Anything Model) was proposed in [Segment Anything in High Quality](https://arxiv.org/pdf/2306.01567.pdf) by Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu.
The model is an enhancement to the original SAM model that produces significantly higher quality segmentation masks while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability.
![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
SAM-HQ introduces several key improvements over the original SAM model:
1. High-Quality Output Token: A learnable token injected into SAM's mask decoder for higher quality mask prediction
2. Global-local Feature Fusion: Combines features from different stages of the model for improved mask details
3. Training Data: Uses a carefully curated dataset of 44K high-quality masks instead of SA-1B
4. Efficiency: Adds only 0.5% additional parameters while significantly improving mask quality
5. Zero-shot Capability: Maintains SAM's strong zero-shot performance while improving accuracy
The abstract from the paper is the following:
*The recent Segment Anything Model (SAM) represents a big leap in scaling up segmentation models, allowing for powerful zero-shot capabilities and flexible prompting. Despite being trained with 1.1 billion masks, SAM's mask prediction quality falls short in many cases, particularly when dealing with objects that have intricate structures. We propose HQ-SAM, equipping SAM with the ability to accurately segment any object, while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability. Our careful design reuses and preserves the pre-trained model weights of SAM, while only introducing minimal additional parameters and computation. We design a learnable High-Quality Output Token, which is injected into SAM's mask decoder and is responsible for predicting the high-quality mask. Instead of only applying it on mask-decoder features, we first fuse them with early and final ViT features for improved mask details. To train our introduced learnable parameters, we compose a dataset of 44K fine-grained masks from several sources. HQ-SAM is only trained on the introduced dataset of 44k masks, which takes only 4 hours on 8 GPUs.*
Tips:
- SAM-HQ produces higher quality masks than the original SAM model, particularly for objects with intricate structures and fine details
- The model predicts binary masks with more accurate boundaries and better handling of thin structures
- Like SAM, the model performs better with input 2D points and/or input bounding boxes
- You can prompt multiple points for the same image and predict a single high-quality mask (see the sketch after this list)
- The model maintains SAM's zero-shot generalization capabilities
- SAM-HQ only adds ~0.5% additional parameters compared to SAM
- Fine-tuning the model is not supported yet
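As mentioned in the tips, you can prompt with several points that all refer to the same object. The snippet below is a minimal sketch of that usage: the coordinates are illustrative, and the processor and post-processing calls mirror the single-point example further down.
```python
import torch
from PIL import Image
import requests
from transformers import SamHQModel, SamHQProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")

img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

# two prompt points on the same object, so they refine a single predicted mask
input_points = [[[450, 600], [500, 650]]]
inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```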
This model was contributed by [sushmanth](https://huggingface.co/sushmanth).
The original code can be found [here](https://github.com/SysCV/SAM-HQ).
Below is an example of how to run mask generation given an image and a 2D point:
```python
import torch
from PIL import Image
import requests
from transformers import SamHQModel, SamHQProcessor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
input_points = [[[450, 600]]] # 2D location of a window in the image
inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model(**inputs)
masks = processor.image_processor.post_process_masks(
outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```
You can also process your own masks alongside the input images in the processor to be passed to the model:
```python
import torch
from PIL import Image
import requests
from transformers import SamHQModel, SamHQProcessor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")
input_points = [[[450, 600]]] # 2D location of a window in the image
inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model(**inputs)
masks = processor.image_processor.post_process_masks(
outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM-HQ:
- Demo notebook for using the model (coming soon)
- Paper implementation and code: [SAM-HQ GitHub Repository](https://github.com/SysCV/SAM-HQ)
## SamHQConfig
[[autodoc]] SamHQConfig
## SamHQVisionConfig
[[autodoc]] SamHQVisionConfig
## SamHQMaskDecoderConfig
[[autodoc]] SamHQMaskDecoderConfig
## SamHQPromptEncoderConfig
[[autodoc]] SamHQPromptEncoderConfig
## SamHQProcessor
[[autodoc]] SamHQProcessor
## SamHQVisionModel
[[autodoc]] SamHQVisionModel
## SamHQModel
[[autodoc]] SamHQModel
- forward

View File

@ -14,225 +14,160 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# SigLIP2
## Overview
The SigLIP2 model was proposed in [SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features](https://huggingface.co/papers/2502.14786) by Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin,
Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, Olivier Hénaff, Jeremiah Harmsen,
Andreas Steiner and Xiaohua Zhai.
[SigLIP2](https://huggingface.co/papers/2502.14786) is a family of multilingual vision-language encoders that builds on the [SigLIP](./siglip) training recipe. It includes decoder-based pretraining, self-distillation, and masked prediction to improve dense prediction tasks (segmentation, depth estimation, etc.). This model is available in two variants:
The model comes in two variants
- NaFlex supports different resolutions and maintains the native image aspect ratio
- FixRes supports fixed resolutions and is backwards compatible with [SigLIP](./siglip)
1) FixRes - model works with fixed resolution images (backward compatible with SigLIP v1)
2) NaFlex - model works with variable image aspect ratios and resolutions (SigLIP2 in `transformers`)
The abstract from the paper is the following:
You can find all the original SigLIP2 checkpoints under the [SigLIP2](https://huggingface.co/collections/google/siglip2-67b5dcef38c175486e240107) collection.
*We introduce SigLIP 2, a family of new multilingual vision-language encoders that build on the success
of the original SigLIP. In this second iteration, we extend the original image-text training objective with
several prior, independently developed techniques into a unified recipe—this includes decoder-based
pretraining, self-supervised losses (self-distillation, masked prediction) and online data curation. With
these changes, SigLIP 2 models outperform their SigLIP counterparts at all model scales in core capabilities,
including zero-shot classification (best SigLIP 2 ViT-g/16 achieves 85.0% ImageNet zero-shot
accuracy), image-text retrieval, and transfer performance when extracting visual representations for
Vision-Language Models (VLMs). Furthermore, the new training recipe leads to significant improvements
on localization and dense prediction tasks. We also train variants which support multiple resolutions
and preserve the inputs native aspect ratio. Finally, we train on a more diverse data-mixture that
includes de-biasing techniques, leading to much better multilingual understanding and improved fair-
ness. To provide users with the ability to trade-off inference cost with performance, we release model
checkpoints at four sizes (ViT-B/86M, L/303M, So400m/400M, and g/1B).*
> [!TIP]
> Click on the SigLIP2 models in the right sidebar for more examples of how to apply SigLIP2 to different image and text tasks.
## Usage tips
The example below demonstrates zero-shot classification with [`Pipeline`] or the [`AutoModel`] class.
- Usage of SigLIP2 is similar to [SigLIP](siglip) and [CLIP](clip). The main difference from CLIP is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
- Training is supported but does not use `torch.distributed` utilities which may limit the scalability of batch size. However, DDP and FSDP work on single-node multi-GPU setups.
- When using the standalone [`GemmaTokenizerFast`] make sure to pass `padding="max_length"` and `max_length=64` as that's how the model was trained.
- The model was trained with *lowercased* text, so make sure you apply the same preprocessing to your text labels.
- To get the same results as the pipeline, a prompt template of "this is a photo of {label}" should be used.
- The NaFlex variant supports processing images at higher resolutions by adjusting the `max_num_patches` parameter in the `Processor`. The default value is `max_num_patches=256`. Increasing `max_num_patches` to 1024 (4x) will approximately double processed image height and width, while preserving the aspect ratio.
<hfoptions id="usage">
<hfoption id="Pipeline">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip2_metrics_table.png"
alt="drawing" width="600"/>
```py
import torch
from transformers import pipeline
This model was contributed by [qubvel](https://huggingface.co/qubvel-hf).
The original code can be found [here](https://github.com/google-research/big_vision/tree/main).
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
## Usage example
There are 2 main ways to use SigLIP2: either using the pipeline API, which abstracts away all the complexity for you, or by using the `Siglip2Model` class yourself.
### FixRes variant
**Pipeline API**
The pipeline allows you to use the model in a few lines of code:
```python
>>> from transformers import pipeline
>>> from PIL import Image
>>> import requests
>>> # load pipe
>>> image_classifier = pipeline(
... task="zero-shot-image-classification",
... model="google/siglip2-base-patch16-224",
... )
>>> # load image
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> # inference
>>> candidate_labels = ["2 cats", "a plane", "a remote"]
>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
>>> print(outputs)
[{'score': 0.1499, 'label': '2 cats'}, {'score': 0.0008, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
pipeline = pipeline(task="zero-shot-image-classification", model="google/siglip2-base-patch16-224", device=0, torch_dtype=torch.bfloat16)
pipeline(image, candidate_labels=candidate_labels)
```
**Using the model yourself**
</hfoption>
<hfoption id="AutoModel (FixRes)">
If you want to do the pre- and postprocessing yourself, here's how to do that:
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
model = AutoModel.from_pretrained("google/siglip2-base-patch16-224", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt")
inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt").to("cuda")
>>> with torch.no_grad():
... outputs = model(**inputs)
with torch.no_grad():
outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
15.0% that image 0 is '2 cats'
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
### NaFlex variant
</hfoption>
<hfoption id="AutoModel (NaFlex)">
NaFlex combines ideas from FlexiViT, i.e. supporting multiple, predefined sequence lengths
with a single ViT model, and NaViT, namely processing images at their native aspect ratio.
This enables processing different types of images at appropriate resolution, e.g. using a
larger resolution to process document images, while at the same time minimizing the impact
of aspect ratio distortion on certain inference tasks, e.g. on OCR.
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel
Given a patch size and target sequence length, NaFlex preprocesses the data by first resizing
the input image such that the height and width after resizing are multiples of the patch size,
while
1. keeping the aspect ratio distortion as small as possible
2. producing a sequence length of at most the desired target sequence length (`max_num_patches`)
The resulting distortion in width and height is at most `(patch_size - 1) / width` and
`(patch_size - 1) / height`, respectively, which tends to be small for common resolutions and aspect ratios.
After resizing, the image is split into a sequence of patches, and a mask with padding information is added.
model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
# default value for `max_num_patches` is 256, but you can increase the resulting image resolution by providing higher values, e.g. `max_num_patches=512`
inputs = processor(text=texts, images=image, padding="max_length", max_num_patches=256, return_tensors="pt").to("cuda")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
with torch.no_grad():
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained("google/siglip2-large-patch16-512", quantization_config=bnb_config, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
# default value for `max_num_patches` is 256, but you can increase resulted image resolution providing
# higher values e.g. `max_num_patches=512`
>>> inputs = processor(text=texts, images=image, max_num_patches=256, return_tensors="pt")
# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt").to("cuda")
>>> with torch.no_grad():
... outputs = model(**inputs)
with torch.no_grad():
outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
21.1% that image 0 is '2 cats'
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
## Resources
## Notes
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP2.
- Training is supported for DDP and FSDP on single-node multi-GPU setups. However, it does not use [torch.distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) utilities which may limit the scalability of batch size.
- When using the standalone [`GemmaTokenizerFast`] make sure to pass `padding="max_length"` and `max_length=64` as that's how the model was trained.
- Model was trained with *lowercased* text, so make sure your text labels are preprocessed the same way.
- To get the same results as the [`Pipeline`], a prompt template of `"This is a photo of {label}."` should be passed to the processor.
- The NaFlex variant processes different types of images at the appropriate resolution (using a larger resolution to process document images for example), while also minimizing the impact of aspect ratio distortion for certain inference tasks like OCR.
- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification)
- Demo notebook for SigLIP2 can be found [here](https://github.com/qubvel/transformers-notebooks/tree/master/notebooks/SigLIP2_inference.ipynb). 🌎
NaFlex resizes the input image so the height and width are multiples of the patch size after resizing. It keeps the aspect ratio distortion as low as possible and produces a sequence length of at most the desired target sequence length (`max_num_patches`). After resizing, the image is split into a sequence of patches and a mask with padding information is added.
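To build intuition for how `max_num_patches` constrains the processed size, here is a small back-of-the-envelope sketch. It is illustrative only and not the processor's exact resizing code.
```py
import math

def naflex_target_size(height, width, patch_size=16, max_num_patches=256):
    """Pick (new_height, new_width) that are multiples of patch_size, roughly keep
    the aspect ratio, and yield at most max_num_patches patches."""
    scale = math.sqrt(max_num_patches * patch_size**2 / (height * width))
    new_height = max(patch_size, math.floor(height * scale / patch_size) * patch_size)
    new_width = max(patch_size, math.floor(width * scale / patch_size) * patch_size)
    return new_height, new_width

print(naflex_target_size(768, 1024))  # (208, 288) -> 13 * 18 = 234 patches <= 256
```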
- Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention.
```py
# pip install -U flash-attn --no-build-isolation
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## Combining SigLIP2 and Flash Attention 2
First, make sure to install the latest version of Flash Attention 2.
```bash
pip install -U flash-attn --no-build-isolation
```
Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
To load and run a model using Flash Attention 2, refer to the snippet below:
```python
>>> import torch
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, AutoModel
>>> device = "cuda" # the device to load the model onto
>>> model = AutoModel.from_pretrained(
... "google/siglip2-so400m-patch14-384",
... attn_implementation="flash_attention_2",
... torch_dtype=torch.float16,
... device_map=device,
... )
>>> processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> with torch.no_grad():
... with torch.autocast(device):
... outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
from transformers import SiglipModel
model = SiglipModel.from_pretrained(
"google/siglip2-so400m-patch14-384",
attn_implementation="flash_attention_2",
torch_dtype=torch.float16,
device_map=device,
)
```
## Siglip2Config
[[autodoc]] Siglip2Config

View File

@ -75,10 +75,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
<hfoption id="transformers CLI">
```bash
echo -e "translate English to French: The weather is nice today." | transformers-cli run --task text2text-generation --model google-t5/t5-base --device 0
echo -e "translate English to French: The weather is nice today." | transformers run --task text2text-generation --model google-t5/t5-base --device 0
```
</hfoption>

View File

@ -53,6 +53,11 @@ The model expects both the image and trimap (concatenated) as input. Use [`ViTMa
[[autodoc]] VitMatteImageProcessor
- preprocess
## VitMatteImageProcessorFast
[[autodoc]] VitMatteImageProcessorFast
- preprocess
## VitMatteForImageMatting
[[autodoc]] VitMatteForImageMatting

View File

@ -0,0 +1,34 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Intel Gaudi
The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) overview.
[`TrainingArguments`], [`Trainer`] and [`Pipeline`] detect and set the backend device to `hpu` if an Intel Gaudi device is available. No additional changes are required to enable training and inference on your device.
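For example, the same [`Pipeline`] code you would run on CPU or CUDA picks the `hpu` backend automatically on a Gaudi machine with the Gaudi PyTorch packages installed. The model below is just a small placeholder for illustration.
```py
from transformers import pipeline

# no device argument needed: the backend resolves to "hpu" when a Gaudi device is available
generator = pipeline("text-generation", model="openai-community/gpt2")
print(generator("Intel Gaudi accelerators are", max_new_tokens=20)[0]["generated_text"])
```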
Some modeling code in Transformers is not optimized for HPU lazy mode. If you encounter any errors, set the environment variable below to use eager mode:
```
PT_HPU_LAZY_MODE=0
```
In some cases, you'll also need to enable int64 support to avoid casting issues with long integers:
```
PT_ENABLE_INT64_SUPPORT=1
```
Refer to the [Gaudi docs](https://docs.habana.ai/en/latest/index.html) for more details.
> [!TIP]
> For training and inference with Gaudi-optimized model implementations, we recommend using [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index).

View File

@ -184,7 +184,7 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
<hfoption>
</hfoption>
<hfoption id="inference xpu">
@ -203,7 +203,7 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
<hfoption>
</hfoption>
<hfoption id="inference cuda">
@ -222,7 +222,7 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
<hfoption>
</hfoption>
<hfoption id="inference backend">
@ -245,7 +245,7 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
<hfoption>
</hfoption>
<hfoption id="format convert">
@ -266,9 +266,9 @@ inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
<hfoption>
</hfoption>
<hfoptions>
</hfoptions>
## Issues

View File

@ -149,7 +149,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
# or int8 weight only quantization
@ -179,7 +179,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
from torchao.quantization import GemliteUIntXWeightOnlyConfig
# For batch size N, we recommend gemlite, which may require autotuning
# default is 4 bit, 8 bit is also supported by passing `bit_width=8`
@ -216,7 +216,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
# quant_config = Int8WeightOnlyConfig()

View File

@ -160,7 +160,48 @@ outputs[0]["generated_text"]
# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
```
## Streaming
If you prefer, you can also load the images separately and pass them to the pipeline like so:
```python
pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM-256M-Instruct")
img_urls = [
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
]
images = [
Image.open(requests.get(img_urls[0], stream=True).raw),
Image.open(requests.get(img_urls[1], stream=True).raw),
]
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "image"},
{"type": "text", "text": "What do you see in these images?"},
],
}
]
outputs = pipe(text=messages, images=images, max_new_tokens=50, return_full_text=False)
outputs[0]["generated_text"]
" In the first image, there are two cats sitting on a plant. In the second image, there are flowers with a pinkish hue."
```
The images will still be included in the `"input_text"` field of the output:
```python
outputs[0]['input_text']
"""
[{'role': 'user',
'content': [{'type': 'image',
'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=622x412>},
{'type': 'image',
'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=5184x3456>},
{'type': 'text', 'text': 'What do you see in these images?'}]}]
"""
```
We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
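The snippet below is a minimal sketch of the [`TextIteratorStreamer`] pattern using a small text-only model for brevity; the full guide applies the same mechanics to IDEFICS-8B.
```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Streaming generation lets you", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so run it in a background thread and consume tokens as they arrive
thread = Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 30})
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()
```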

View File

@ -20,9 +20,9 @@ Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en in
<Tip>
Since 2.3.0, the conversion script is part of the transformers CLI (**transformers-cli**), available in any transformers installation >= 2.3.0.
Since 2.3.0, the conversion script is part of the transformers CLI (**transformers**), available in any transformers installation >= 2.3.0.
The following documentation reflects the format of the **transformers-cli convert** command.
The following documentation reflects the format of the **transformers convert** command.
</Tip>
@ -41,7 +41,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pr
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -60,7 +60,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entr
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -75,7 +75,7 @@ Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
@ -89,7 +89,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entre
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
@ -104,7 +104,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -118,7 +118,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
@ -132,7 +132,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin

View File

@ -15,51 +15,51 @@ rendered properly in your Markdown viewer.
# How to add a model to 🤗 Transformers?
Adding a new model is often difficult and requires deep knowledge of the 🤗 Transformers library and of the model's original repository. At Hugging Face we try to give the community more and more power to add models independently. For some new models that the community wants to add to 🤗 Transformers, we have therefore created a dedicated *call-for-model-addition* that explains step by step how to add the requested model. With this *call-for-model-addition* we want to teach motivated and experienced community contributors how to implement a model in 🤗 Transformers.
If this is something you might be interested in, feel free to check out the current “calls-for-model-addition” [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md) and contact us.
If the model is selected, you will then work together with a Hugging Face member to integrate the model into 🤗 Transformers. In doing so, you will gain a complete understanding, both theoretical and practical, of the proposed model. You will also be the author of an important open-source contribution to 🤗 Transformers. During the implementation you will have the opportunity to:
- gain a better understanding of open-source best practices
- understand the design principles of one of the most popular NLP libraries
- learn how to efficiently test complex NLP models
- learn how to integrate Python utilities such as `black`, `ruff`, and `make fix-copies` into a library to always guarantee clean and readable code
We are also happy if you want to add a model that cannot be found in the “calls-for-model-addition” folder.
The following sections explain in detail how to add a new model. It can also be very useful to check models that have already been added [here](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed) to see whether they resemble the model you would like to add.
To get started, let's take a general look at the Transformers library.
## General overview of 🤗 Transformers
First of all, let's look at 🤗 Transformers in general. 🤗 Transformers is a very opinionated library, so it may happen that you disagree with some of the library's philosophies or design choices. From our experience, however, we have found that the library's fundamental design choices are crucial to using 🤗 Transformers effectively at scale while keeping maintenance costs at an acceptable level.
A good first starting point to better understand the library is to read the [documentation on our philosophy](filosofia).
From there, there are a few choices about the way we work that we try to apply to all models:
- Composition is generally favored over over-abstraction
- Duplicating code is not always bad, especially if it significantly improves the readability and accessibility of the model
- All the files created for the new model should be as self-contained as possible. This means that when someone reads the code of a specific model, they should only have to look at the corresponding `modeling_....py` file without following multiple dependencies.
Most importantly, we consider the library not just a means of delivering a product, *for example* the ability to use BERT for inference, but also as the product itself that we want to keep improving. So, when you add a model, you are not just the person who will use the model; you also represent everyone who will read, try to understand, and modify your model.
With these principles in mind, let's dive into the general design of the library.
@ -67,25 +67,25 @@ Tenendo questi principi in mente, immergiamoci nel design generale della libreri
### Overview of models
To successfully add a model, it is important to understand the interaction between your model and its configuration, [`PreTrainedModel`], and [`PretrainedConfig`]. As an example, we will call the model to be added to 🤗 Transformers `BrandNewBert`.
Let's take a look:
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png"/>
As you can see, we rely on inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` inherits from `BrandNewBertPreTrainedModel` which, in turn, inherits from [`PreTrainedModel`] - simple, right?
As a general rule, we want to make sure that a new model only depends on [`PreTrainedModel`]. The important functionalities that are automatically provided to every new model are [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All the other important functionalities, such as `BrandNewBertModel.forward`, must be fully defined in the new `modeling_brand_new_bert.py` script. We also want to make sure that a model with a specific head layer, such as `BrandNewBertForMaskedLM`, does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` as a component that can be called in the forward pass, to keep the level of abstraction low. Every new model requires a configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in [`PreTrainedModel`], and can therefore be accessed via the `config` attribute for all classes inheriting from `BrandNewBertPreTrainedModel`:
```python
@ -93,35 +93,35 @@ model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
model.config  # the model has access to its config
```
Analogously to the model, the configuration inherits the basic serialization and deserialization functionality from [`PretrainedConfig`]. Note that the configuration and the model are always serialized in two different formats - the model is serialized to a *pytorch_model.bin* file while the configuration goes to *config.json*. Calling [`~PreTrainedModel.save_pretrained`] will automatically call [`~PretrainedConfig.save_pretrained`] as well, so that both the model and the configuration are saved.
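A minimal sketch (using the hypothetical `BrandNewBert` names from this guide) of the design described above - a config class, a pretrained base class, and a head model that composes `BrandNewBertModel` instead of inheriting from it - could look like this:
```python
from torch import nn
from transformers import PretrainedConfig, PreTrainedModel

class BrandNewBertConfig(PretrainedConfig):
    model_type = "brand_new_bert"

    def __init__(self, vocab_size=30522, hidden_size=768, **kwargs):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

class BrandNewBertPreTrainedModel(PreTrainedModel):
    config_class = BrandNewBertConfig
    base_model_prefix = "brand_new_bert"

class BrandNewBertModel(BrandNewBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        # ... the encoder layers would be defined here ...

    def forward(self, input_ids):
        return self.embeddings(input_ids)

class BrandNewBertForMaskedLM(BrandNewBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.brand_new_bert = BrandNewBertModel(config)  # composition, not inheritance
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids):
        hidden_states = self.brand_new_bert(input_ids)
        return self.lm_head(hidden_states)
```
Calling `save_pretrained` on any of these models writes both the weights and the `config.json` described above.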
### Code style
When you write a new model, keep in mind that Transformers is an opinionated library, so there are a few things to consider about how to write your code :-)
1. The forward pass of your model must be written entirely in the model file, and it must be independent of other models in the library. If you want to reuse a block of code from another model, copy and paste the code with a `# Copied from` comment on top of it (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
for a great example).
2. The code must be fully understandable, even by people who do not speak English. This means that variables must have descriptive names and abbreviations should be avoided. For example, `activation` is much better than `act`. Single-letter variable names are strongly discouraged, unless they are an index in a for loop.
3. In general, it is better to have explicit and longer code than short and magical code.
4. Avoid subclassing `nn.Sequential` in PyTorch; subclass `nn.Module` and write the forward pass instead, so that anyone can debug your code by adding prints or breakpoints (see the sketch after this list).
5. Your function signatures should be type-annotated. Beyond that, it is better to prefer well-named variables over annotations to improve the comprehension and readability of the code.
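A minimal sketch of point 4 (and of the naming and typing guidelines above): an explicit `nn.Module` with a written-out forward pass is easy to step through with a debugger, unlike an opaque `nn.Sequential`.
```python
import torch
from torch import nn

class BrandNewBertIntermediate(nn.Module):
    """Explicit feed-forward block: every step is a named line you can print or break on."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.dense_in = nn.Linear(hidden_size, intermediate_size)
        self.activation = nn.GELU()
        self.dense_out = nn.Linear(intermediate_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense_in(hidden_states)
        hidden_states = self.activation(hidden_states)  # a natural spot for a breakpoint
        hidden_states = self.dense_out(hidden_states)
        return hidden_states
```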
### Overview of tokenizers
This section will be added very soon :-(
## Adding a model to 🤗 Transformers step by step
There are different ways to add a model to Hugging Face. Here is a list of community blog posts on how to add a model:
@ -141,11 +141,11 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge
- 1. ☐ (Optional) Understand the theoretical aspects of the model
- 2. ☐ Prepare the transformers dev environment
- 3. ☐ Set up a debugging environment for the original repository
- 4. ☐ Create a script that successfully runs the forward pass using the original repository and checkpoint
- 5. ☐ Successfully add the model skeleton to Transformers
- 6. ☐ Convert the original checkpoints to Transformers checkpoints
- 7. ☐ Successfully run the forward pass in Transformers so that it gives an output identical to the original checkpoint
- 8. ☐ Finish the model tests in Transformers
- 9. ☐ Successfully add the Tokenizer to Transformers
- 10. ☐ Run and pass the end-to-end integration tests
@ -156,22 +156,22 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge
To get started we usually recommend beginning with the theory behind `BrandNewBert`, in order to gain a good understanding of it in general. However, if you prefer to learn the theoretical aspects of the model while *working* on it, it is fine to dive directly into the `BrandNewBert` code. This option can be a good one if your engineering skills are stronger than your theoretical ones, if the `BrandNewBert` paper gives you trouble, or if you simply enjoy programming more than reading scientific papers.
### 1. (Optional) Theoretical aspects of BrandNewBert
Take some time to read through the *BrandNewBert* paper. Some sections of the paper will certainly be very complex, but don't worry! The goal is not to gain a deep theoretical understanding, but to extract the information needed to successfully re-implement the model in 🤗 Transformers. So don't go overboard on the theoretical aspects; focus on the practical ones instead, namely:
- What kind of model is *brand_new_bert*? Is it a BERT-style encoder-only model? A GPT2-style decoder-only model? An encoder-decoder model like BART? Have a look at [model_summary](model_summary) if you are not familiar with the differences between these models
- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2seq tasks such as summarization?
- What are the new additions to the model that make it different from BERT/GPT-2/BART?
- Which existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) are most similar to *brand_new_bert*?
- What kind of tokenizer is used? A SentencePiece tokenizer? A WordPiece tokenizer? Is it the same tokenizer as BERT or BART?
Once you feel you have a good overview of the model architecture, feel free to write to the Hugging Face team with any questions you may have. This may include questions about the model architecture, the attention layer, etc. We will be very happy to help you :)
### 2. Prepare your environment
1. Fork the [repository](https://github.com/huggingface/transformers) by clicking the 'Fork' button on the repository page. This creates a copy of the code under your GitHub account
2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:
@ -190,7 +190,7 @@ source .env/bin/activate
pip install -e ".[dev]"
```
then go back to the main directory:
```bash
cd ..
@ -205,7 +205,7 @@ cd ..
5. To port *brand_new_bert* you will also need access to its original repository:
```bash
git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
cd brand_new_bert
pip install -e .
```
@ -213,16 +213,16 @@ pip install -e .
Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers.
### 3.-4. Run a pretrained checkpoint using the original repo
To begin, you will work on the original *brand_new_bert* repo. As is often the case, the original implementation is very "researchy": documentation may be lacking, something might be missing, and the code can be hard to understand. However, this is exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people stand on the shoulders of giants*, which here translates to taking a working model and rewriting it to make it as **accessible, user-friendly, and readable** as possible. This is the number-one motivation for re-implementing models in 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**.
Getting the original pretrained model to run from the official repository is often the **most difficult** step. In our experience, it is very important to spend some time becoming familiar with the original code base. As an exercise, try to understand the following points:

- Where are the pretrained weights?
- How are the pretrained weights loaded into the corresponding model?
- How can the tokenizer be run independently of the model?
- Try to trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, you only have to reimplement those functions.
- Try to locate the important components of the model: Where is the model class? Are there model sub-classes, *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, *e.g.* *self-attention*, *cross-attention*...?
- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you use an interactive debugger such as *ipdb*, or is an efficient IDE for debugging, such as PyCharm, enough?
@ -230,14 +230,14 @@ Riuscire a far girare il modello pretrained originale dalla repository ufficiale
At this point, it is up to you to decide which debugging environment you prefer. We advise against expensive GPU setups; working on a CPU is an excellent starting point both for digging into the original repository and for starting to write the 🤗 Transformers code. Only at the very end, once the model has been successfully ported to 🤗 Transformers, should you verify that it also works on GPU.

In general, there are two possible debugging environments for running the original model:

- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
- Local Python scripts

The advantage of Jupyter notebooks is that they can be run cell by cell, which helps you split the work into logical components and gives you a faster debugging cycle, since intermediate results can be stored. Notebooks are also often easy to share with other contributors, which can be very helpful if you want to ask the Hugging Face team for help. If you are familiar with Jupyter notebooks, we recommend working with them.

The obvious disadvantage is that, if you are not used to working with notebooks, you may waste a lot of time setting everything up and moving your work into the new environment, and you might not be able to use debugging tools such as `ipdb`.

For any practical code base, it is always a good first step to load a **small** pretrained checkpoint and try to reproduce a single forward pass using a dummy vector of integer input IDs. An example of such a script, in pseudocode, is:
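A minimal sketch of what such a pseudocode script might look like; the class and method names are placeholders taken from this guide, not a real API:

```python
# Pseudocode: load the original checkpoint and run one forward pass on dummy input ids.
model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = [0, 4, 5, 2, 3, 7, 9]  # a dummy vector of input ids
original_output = model.predict(input_ids)
```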
@ -249,42 +249,42 @@ original_output = model.predict(input_ids)
As for the debugging strategy, you can choose between:

- Decomposing the original model into many small components and testing each of them individually
- Decomposing the original model only into the original *tokenizer* and the original *model*, running a forward pass on those, and using intermediate print statements or breakpoints for verification

Again, it is up to you to pick whichever strategy works best for you. Often one is more advantageous than the other, but it all depends on the original code base.

If the original code base allows you to decompose the model into smaller sub-components, *e.g.* if it can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages to taking the more demanding road from the start:

- at a later stage, when comparing the original model to the Hugging Face implementation, you can verify automatically that each component matches its counterpart 1:1
- you get to break one big problem into smaller steps, which helps you structure your work better
- separating the model into logical components gives you a much better overview of the model's design, and therefore a better understanding of the model itself
- towards the end, component-by-component tests help you make sure you do not go back and forth in your implementation, so you can keep changing the code without regressions

[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA are a nice example of how this can be done.

However, if the original code base is very complex, or only allows intermediate components to be run in compiled mode, it might be too time-consuming or even impossible to separate the model into smaller sub-components. A good example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library, which is very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, you will have to rely on print statements.

In any case, whichever strategy you choose, the recommended procedure is to start debugging from the first layer and work your way to the last layer.

It is recommended that you retrieve the outputs of the following layers, either via print statements or sub-components, in the following order (a sketch of how this can be done with forward hooks follows the list):

1. Retrieve the input IDs passed to the model
2. Retrieve the word embeddings
3. Retrieve the input of the first Transformer layer
4. Retrieve the output of the first Transformer layer
5. Retrieve the output of the following `n - 1` Transformer layers
6. Retrieve the output of the whole BrandNewBert Model
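Assuming the original model is an ordinary PyTorch `nn.Module`, one way to collect such intermediate outputs is with forward hooks; this is only a sketch, and the module names you pass in will differ in the real repository:

```python
import torch
from torch import nn


def capture_intermediate_outputs(model: nn.Module, module_names: list[str]):
    """Register forward hooks that store the outputs of the named sub-modules."""
    captured, handles = {}, []
    for name, module in model.named_modules():
        if name in module_names:
            # `name=name` binds the current value of `name` inside the lambda
            handles.append(
                module.register_forward_hook(
                    lambda mod, inputs, output, name=name: captured.__setitem__(name, output)
                )
            )
    return captured, handles


# Example usage (module names are placeholders):
# captured, handles = capture_intermediate_outputs(model, ["embeddings", "encoder.layer.0"])
# model(input_ids)
# print(captured["embeddings"])
# for handle in handles:
#     handle.remove()
```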
@ -303,36 +303,36 @@ Gli output dei seguenti layer di solito dovrebbero essere degli array di float m
[-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
```
We expect every model added to 🤗 Transformers to pass a couple of integration tests. This means that the original model and its 🤗 Transformers implementation have to give the same output up to a precision of 0.001! Since it is normal for the exact same model, written in different libraries, to give slightly different outputs, the accepted tolerance is 1e-3 (0.001). Remember that the two models must give nearly identical outputs, so it is very convenient to compare the intermediate outputs of the 🤗 Transformers version against the intermediate outputs of the original *brand_new_bert* model many times over. Here are some tips to make your debugging environment as efficient as possible:

- Find the best strategy to debug intermediate results. For example, is the original repository written in PyTorch? If so, you will probably have to spend some time writing longer scripts that decompose the original model into smaller sub-components so that the intermediate values can be retrieved. Is the original repo written in Tensorflow 1? Then you may have to rely on TensorFlow's print operation, [tf.print](https://www.tensorflow.org/api_docs/python/tf/print), to output the intermediate values. Is it written in Jax? Then make sure the model is **not jitted** when running the forward pass, *e.g.* see [this link](https://github.com/google/jax/issues/196).
- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debugging cycle. It is not efficient if your pretrained model is so big that a forward pass takes more than 10 seconds. If only very large checkpoints are available, it is good practice to create a dummy model in the new environment with randomly initialized weights, save those weights, and compare them against the 🤗 Transformers version of your model.
- Make sure you are using the simplest way of calling a forward pass in the original repository. Ideally, find the function in the original repository that calls **only** a single forward pass; it is often called `predict`, `evaluate`, `forward` or `__call__`. Make sure you do not debug a function that calls `forward` multiple times, *e.g.* to generate text, such as `autoregressive_sample` or `generate`.
- Try to separate the tokenization from the model's forward pass. If the original repository only shows examples where a string is passed as input, try to find out where in the forward call the string is converted into input ids, and start your debugging from that point. This gives you a great starting point to write a small personal script that feeds input ids to the model instead of an input string.
- Make sure the model in your debugging setup is **not** in training mode, which would often make the model give random outputs because of the multiple dropout layers. Make sure the forward pass in the debugging environment is **deterministic**, so that the dropout layers are not used. Alternatively, use *transformers.utils.set_seed* if the old and new implementations are in the same framework (see the sketch after this list).
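As a minimal sketch of a deterministic debugging forward pass in PyTorch; the tiny `nn.Sequential` model below is just a stand-in for the real *brand_new_bert* implementation:

```python
import torch
from torch import nn
from transformers import set_seed

set_seed(0)  # make any remaining randomness reproducible

# stand-in for the real brand_new_bert model
model = nn.Sequential(nn.Embedding(32, 16), nn.Dropout(0.1), nn.Linear(16, 16))

model.eval()             # disables dropout and other training-only behaviour
with torch.no_grad():    # no gradients are needed while debugging
    input_ids = torch.tensor([[0, 4, 5, 2, 3, 7, 9]])
    output = model(input_ids)

print(output.shape)
```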
The following section gives you more specific details and tips on how you can do this for *brand_new_bert*.
@ -343,7 +343,7 @@ La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare
Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers fork:

```bash
cd transformers
```
@ -355,52 +355,52 @@ Se questo non é il caso, cominciamo con il generare un nuovo modello. Ti consig
an existing model:
```bash
transformers-cli add-new-model-like
transformers add-new-model-like
```
You will be prompted with a questionnaire to fill in the basic information of your model.

**Open a Pull Request on the main huggingface/transformers repo**

Before starting to adapt the automatically generated code, open a "Work in progress (WIP)" pull request, *e.g.* "[WIP] Add *brand_new_bert*", so that the Hugging Face team can work alongside you on integrating the model into 🤗 Transformers.

These are the general steps to follow:

1. Create a branch with a descriptive name from your main branch
```bash
git checkout -b add_brand_new_bert
```
2. Commit the automatically generated code
```bash
git add .
git commit
```
3. Fetch and rebase on the current main
```bash
git fetch upstream
git rebase upstream/main
```
4. Push your changes to your account:
```bash
git push -u origin a-descriptive-name-for-my-changes
```
5. Once you are satisfied with the changes, go to the webpage of your fork on GitHub and click on "Pull request". Make sure to add some members of the Hugging Face team as reviewers, in the panel on the right of the PR page, so that the Hugging Face team also gets notified of future changes.

6. Change the PR into a draft by clicking on "Convert to draft" on the right of the PR page.

From then on, remember to commit every bit of progress and every change, so that it shows up in the PR. Also, remember to keep your work up to date with the current main:
```bash
@ -408,39 +408,39 @@ git fetch upstream
git merge upstream/main
```
In general, all questions you have regarding the model or your implementation should be asked in your PR and discussed/solved there. That way, the Hugging Face team will always be notified when you commit new code or when you have a question. It is often very helpful to point the Hugging Face team to the code you are referring to in your question, so that the team can easily understand your problem or question.

To do so, go to the "Files changed" tab, where you can see all of your changes, go to the line you want to ask a question about, and click on the "+" symbol to add a comment. Whenever a question or problem has been solved, click on the "Resolve" button.

In the same way, the Hugging Face team will open comments and questions when reviewing your code. We recommend asking most questions on the PR page itself. For very general questions that are not very useful to the public, feel free to ping the Hugging Face team directly via Slack or email.
**5. Adapt the generated code for brand_new_bert**

At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be found in
`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.

Now you can finally start coding :). The generated code in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT, if it is an encoder-only model, or as BART, if it is an encoder-decoder model. At this point, remind yourself of what you learned at the beginning about the theoretical aspects of the model: *How is the model I am implementing different from BERT or BART?*. Implementing those changes often means changing the *self-attention* layer, the order of the normalization layers, and so on... Again, it is very useful to look at similar architectures of models that already exist in Transformers to get a better idea of how to implement your model.

**Note** that at this point the code does not have to be fully correct or clean. Rather, it is advised to start with an unclean, copy-pasted first version of the original code in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel all the necessary code is there. From our experience, it is much more efficient to quickly add a first draft of the required code and then improve/correct it iteratively. The only thing that has to work at this point is the following instantiation:
```python
from transformers import BrandNewBertModel, BrandNewBertConfig
@ -448,23 +448,23 @@ from transformers import BrandNewBertModel, BrandNewBertConfig
model = BrandNewBertModel(BrandNewBertConfig())
```
This command will create a model with the default parameters defined in `BrandNewBertConfig()` and random weights, thus making sure that the `init()` methods of all components work correctly.
**6. Write a conversion script**

The next step is to write a conversion script that converts the checkpoint you used to debug *brand_new_bert* in the original repo into a checkpoint compatible with the new *brand_new_bert* implementation in 🤗 Transformers. It is not advised to write the conversion script from scratch; rather, look through the conversion scripts that already exist in 🤗 Transformers and find one that was used for a model similar to yours. Usually, it is enough to copy an existing conversion script and slightly adapt it to your case. Don't hesitate to ask the Hugging Face team about this.
- If you are converting a model from TensorFlow to PyTorch, a good starting point is [this conversion script for BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
- If you are converting a model from PyTorch to PyTorch, [BART's conversion script may be useful](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
In the following, we explain how PyTorch models store the weights of each layer and how layer names are defined. In PyTorch, the name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy PyTorch model called `SimpleModel`:
```python
@ -497,7 +497,7 @@ SimpleModel(
)
```
You can see that the layer names are defined by the names of the class attributes in PyTorch. The weight values of a specific layer can be printed:
@ -530,7 +530,7 @@ tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
0.2220, 0.2358]]).
```
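As a minimal, self-contained sketch of this idea (the attribute names `dense`, `intermediate` and `layer_norm` are purely illustrative):

```python
import torch
from torch import nn


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        # the attribute names below become the parameter names: "dense.weight", "layer_norm.bias", ...
        self.dense = nn.Linear(10, 10)
        self.intermediate = nn.Linear(10, 10)
        self.layer_norm = nn.LayerNorm(10)


model = SimpleModel()
print(model)                    # layer names mirror the attribute names
print(model.dense.weight.data)  # randomly initialized weight values of the `dense` layer
```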
In the conversion script, you should fill those randomly initialized weights with the exact weights of the corresponding layer in the checkpoint. *E.g.*
```python
@ -544,8 +544,8 @@ model_pointer = getattr(model, "dense")
model_pointer.weight.data = torch.from_numpy(pretrained_weight)
```
While doing so, you must verify that each randomly initialized weight of the PyTorch model and its corresponding pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add an `assert` for the shape and to print out the names:
```python
@ -560,19 +560,19 @@ Inoltre, dovrete fare il print sia dei nomi che dei weights per essere sicuri ch
logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```
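A rough, self-contained sketch of the kind of shape/name check meant here; the layer and variable names stand in for those in your actual conversion script:

```python
import numpy as np
import torch
from torch import nn

model_pointer = nn.Linear(3, 2)                              # stands in for one sub-module of the 🤗 model
pretrained_weight = np.random.rand(2, 3).astype(np.float32)  # stands in for the checkpoint array
layer_name = "dense"

assert (
    model_pointer.weight.shape == pretrained_weight.shape
), f"Shape of {layer_name} {model_pointer.weight.shape} does not match checkpoint shape {pretrained_weight.shape}"
model_pointer.weight.data = torch.from_numpy(pretrained_weight)
print(f"Initialized PyTorch weight {layer_name} from the checkpoint")
```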
If either the shape or the name doesn't match, you have probably assigned the wrong checkpoint weight to a randomly initialized layer of the 🤗 Transformers implementation.

An incorrect shape is most likely due to an incorrect setting of the parameters in `BrandNewBertConfig()`. However, it could also be that PyTorch's implementation of a layer requires the weight matrix to be transposed.

Finally, you should also check that **all** required weights are initialized, and print out all checkpoint weights that were not used for initialization, to make sure the model is correctly converted. It is completely normal for the conversion attempt to fail at first, either because of a wrong `BrandNewBertConfig()`, a wrong architecture in the 🤗 Transformers implementation, or a bug in `init()`.

This step should be iterated until the weights match exactly. Once the checkpoint is correctly loaded into 🤗 Transformers, you can save the model in a folder of your choice, `/path/to/converted/checkpoint/folder`, which should then contain both a
`pytorch_model.bin` file and a `config.json` file:
@ -583,9 +583,9 @@ model.save_pretrained("/path/to/converted/checkpoint/folder")
**7. Implement the forward pass**

Once the pretrained weights are correctly loaded into the 🤗 Transformers implementation, you have to make sure the forward pass is correctly implemented. [Here](#3-4-provare-un-pretrained-checkpoint-usando-la-repo-originale), you already created and ran a script that runs a forward pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers implementation instead of the original one. It should look roughly like this:
```python
@ -594,27 +594,27 @@ input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
output = model(input_ids).last_hidden_state
```
It is very likely that the 🤗 Transformers output and the original output are not exactly the same the first time around, especially on the first try. Don't be discouraged - that's normal! First, make sure the forward pass does not raise any errors. It often happens that the wrong dimensions or the wrong data type are used, *e.g.* `torch.long` instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help!

The final part is to make sure the 🤗 Transformers implementation works correctly by verifying that the outputs are equivalent to a precision of `1e-3`. First, check that `outputs.shape` is the same for the 🤗 Transformers implementation and the original one. Next, check that the output values are identical. This is definitely the hardest part; here are some common mistakes that lead to differing outputs:

- Some layers were not added, *e.g.* an *activation* layer was not added, or a residual connection was forgotten
- The word embedding matrix was not tied
- The wrong positional embeddings are used because the original implementation uses an offset
- Dropout is applied during the forward pass. To fix this, make sure *model.training = False* and that no dropout layer is falsely activated during the forward pass, *e.g.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

The best way to fix the problem is usually to look at the forward passes of the original implementation and of the 🤗 Transformers implementation side by side and check for any differences. Ideally, debug/print out the intermediate outputs of both implementations at the exact same positions in the network to find where they diverge. First, make sure the `input_ids` are identical in both scripts. Then work your way from the first layer to the last one. At some point you will notice a difference between the two implementations, which should point you to the bug.

Once the same output is reached, verify the outputs with `torch.allclose(original_output, output, atol=1e-3)`. If everything is fine at this point: congratulations! The following parts will be a walk in the park 😊.
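A quick, self-contained sketch of this final comparison, using random tensors as stand-ins for the two implementations' outputs:

```python
import torch

# Stand-ins for the outputs of the original implementation and of the 🤗 Transformers port.
original_output = torch.randn(1, 9, 768)
output = original_output + 1e-4 * torch.randn(1, 9, 768)

assert output.shape == original_output.shape, "The two implementations return different shapes"
print(f"Maximum absolute difference: {(original_output - output).abs().max():.2e}")
assert torch.allclose(original_output, output, atol=1e-3), "Outputs differ by more than 1e-3"
```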
@ -622,9 +622,9 @@ A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una p
**8. Add the necessary model tests**

At this point, you have successfully added your new model. However, it is very likely that the model does not yet fully comply with the required design. To make sure the implementation is consistent and compatible with 🤗 Transformers, you have to add tests. The Cookiecutter should have automatically added test files for your model, usually under `tests/test_modeling_brand_new_bert.py`. Run this test file to verify that all common tests pass:
```bash
@ -636,8 +636,8 @@ Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia
- a) the community can easily understand your work by looking at the tests specific to *brand_new_bert*,
- b) future changes to your model will not break any important feature of the model.

First, add integration tests. Those integration tests essentially do the same as the debugging scripts you used earlier. A template for these tests, called `BrandNewBertModelIntegrationTests`, already exists in the Cookiecutter; you only have to fill it in. Once these tests pass, run:
```bash
@ -650,7 +650,7 @@ Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1`
</Tip>
Second, all features that are useful and special to *brand_new_bert* should additionally be tested in separate tests under `BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten, but remember that these tests are useful because:
@ -664,7 +664,7 @@ A questo punto avremo bisogno un tokenizer per *brand_new_bert*. Di solito il to
It is important that you find the file with the original tokenizer and manage to load it into 🤗 Transformers.

To make sure the tokenizer works correctly, create a script in the original repository that takes a string as input and returns the `input_ids`. It could look roughly like this:
```python
@ -673,8 +673,8 @@ model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = model.tokenize(input_str)
```
It might take some more digging through the original repository to find the right tokenizer function, and you might even have to modify the original tokenizer so that it outputs the `input_ids`. At that point, an analogous script is needed for the 🤗 Transformers implementation:
```python
@ -687,7 +687,7 @@ tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
input_ids = tokenizer(input_str).input_ids
```
Once both scripts produce the same `input_ids`, a tokenizer test should be added.

The *brand_new_bert* tokenizer test file should contain a couple of hard-coded integration tests.
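A hypothetical helper for that comparison could look like the sketch below; `original_tokenize` stands for whatever tokenization function the original repo exposes, and `hf_tokenizer` for the ported 🤗 Transformers tokenizer:

```python
def compare_tokenizers(original_tokenize, hf_tokenizer, input_str: str) -> list[int]:
    """Check that the original tokenization function and the ported tokenizer agree on `input_str`."""
    original_ids = original_tokenize(input_str)  # e.g. model.tokenize(input_str) in the original repo
    hf_ids = hf_tokenizer(input_str).input_ids   # the 🤗 Transformers tokenizer
    assert original_ids == hf_ids, f"Mismatch:\n  original:     {original_ids}\n  transformers: {hf_ids}"
    return hf_ids
```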
@ -696,22 +696,22 @@ Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-c
Now that you have the tokenizer, you should add end-to-end integration tests for the whole workflow to `tests/test_modeling_brand_new_bert.py` in 🤗 Transformers. Such a test should show on a meaningful text-to-text sample that the 🤗 Transformers implementation works as expected. *E.g.* you could use a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair, and so on. If none of the ported checkpoints has been fine-tuned on such downstream tasks, it is enough to rely on the model tests. As a final step to make sure the model is fully functional, we also advise you to run the tests on GPU. It can happen that a `.to(self.device)` was forgotten somewhere, for example. If you have no access to a GPU, the Hugging Face team can take care of running those tests for you.
**11. Add a docstring**

You are almost done! The last thing to add is a nice docstring and a doc page. The Cookiecutter should have already added a template file called `docs/source/model_doc/brand_new_bert.rst` that you should fill in. The first thing users will do before using your model is read this doc page, so the documentation should be clear and concise. It is very useful for the community to also add some *Tips* that show how the model should be used. Don't hesitate to ask the Hugging Face team about the docstrings.

Next, make sure the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is correct and includes all the necessary inputs and outputs. We have a detailed guide about writing documentation and docstrings.
@ -729,8 +729,8 @@ E che il codice passi i quality check:
make quality
```
Sometimes the quality checks above will fail because some information is missing from a docstring or because of an incorrect name. Again, ask the Hugging Face team if you are stuck; we will be happy to help.

Lastly, it is always a good idea to refactor the code once it is working.
@ -738,10 +738,10 @@ Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎
**12. Upload the models to the model hub**

In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each uploaded checkpoint. Read our [Model sharing and uploading Page](model_sharing) to get familiar with the hub. You usually work hand in hand with the Hugging Face team here to decide on a fitting name for each checkpoint and to get the required access rights to upload the model under the organization of the *brand_new_bert* author.

The `push_to_hub` method, present in all `transformers` models, is a quick and painless way to push your checkpoint to the hub:
```python
@ -754,27 +754,27 @@ brand_new_bert.push_to_hub(
)
```
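A minimal sketch of how this might be used, assuming the hypothetical `BrandNewBertModel` class from this guide:

```python
from transformers import BrandNewBertModel  # hypothetical class name used throughout this guide

brand_new_bert = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
# Pushes the weights and config to the Hub under your namespace (or an organization you belong to).
brand_new_bert.push_to_hub("brand_new_bert")
```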
It is worth spending some time creating a fitting model card for each checkpoint. The model cards should highlight the specific characteristics of that particular checkpoint, *e.g.* which dataset the checkpoint was pretrained or fine-tuned on, and which downstream tasks the model is intended for. It is also good practice to include some code on how to use the model correctly.
**13. (Optional) Add a notebook**

It is very helpful to add a notebook that demonstrates in detail how *brand_new_bert* can be used for inference and/or fine-tuned on a specific task. This is not mandatory for your PR, but it is very useful to the community.
**14. Submit your finished PR**

The very last step: getting your PR merged into main. Usually, the Hugging Face team will have already helped you at this point, but it is worth taking some time to clean up the description and the comments in the code.
### Share your work!!

Now it's time to get some credit from the community for your work! Uploading and implementing a new model is a major contribution to Transformers and the whole NLP community. Your code and the converted pretrained models will certainly be used by hundreds, possibly even thousands, of developers and researchers. Be proud of your work and share your achievement with the community :)

**You have made another model that is super easy to access for everyone in the community! 🤯**
View File
@ -18,10 +18,10 @@ in modelli che possono essere caricati utilizzando i metodi `from_pretrained` de
<Tip>
Since version 2.3.0 the conversion script is part of the transformers CLI (**transformers-cli**), available in any installation
Since version 2.3.0 the conversion script is part of the transformers CLI (**transformers**), available in any installation
of transformers >=2.3.0.

The documentation below reflects the **transformers-cli convert** command format.
The documentation below reflects the **transformers convert** command format.
</Tip>
@ -49,7 +49,7 @@ Questo è un esempio del processo di conversione per un modello `BERT-Base Uncas
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -70,7 +70,7 @@ Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -84,7 +84,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenat
is saved in the same format as the OpenAI pretrained models (see [here](https://github.com/openai/finetune-transformer-lm)):
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
@ -97,7 +97,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allen
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
@ -111,7 +111,7 @@ Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
```bash
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -124,7 +124,7 @@ Ecco un esempio del processo di conversione di un modello XLM pre-allenato:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
@ -137,7 +137,7 @@ Ecco un esempio del processo di conversione di un modello T5 pre-allenato:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin
View File
@ -312,7 +312,7 @@ cd transformers
An existing model:
```bash
transformers-cli add-new-model-like
transformers add-new-model-like
```
You will be prompted with a questionnaire to fill in the basic information of your model.
@ -517,7 +517,7 @@ tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
In the conversion script, you need to fill the randomly initialized weights with the exact weights of the corresponding layer in the checkpoint. For example:
```python
# retrieve matching layer weights, e.g. by
# recursive algorithm
@ -747,5 +747,3 @@ brand_new_bert.push_to_hub("brand_new_bert")
Now it's time to get some credit from the community for your work! Completing a model addition is a major contribution to Transformers and the NLP community. Your code and the ported pretrained models will certainly be used by hundreds, even thousands, of developers and researchers. Be proud of your work and share your achievement with the community.

**You have made another model that is super easy to access for everyone in the community! 🤯**
View File
@ -77,6 +77,8 @@
title: Image feature extraction
- local: tasks/mask_generation
title: Mask generation
- local: tasks/keypoint_detection
title: Keypoint detection
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge distillation for computer vision (image classification)
title: Computer vision
@ -352,8 +354,8 @@
title: (In translation) DistilBERT
- local: in_translation
title: (In translation) DPR
- local: in_translation
title: (In translation) ELECTRA
- local: model_doc/electra
title: ELECTRA
- local: model_doc/encoder-decoder
title: Encoder Decoder Models
- local: in_translation
@ -480,8 +482,8 @@
title: (In translation) RemBERT
- local: in_translation
title: (In translation) RetriBERT
- local: in_translation
title: (In translation) RoBERTa
- local: model_doc/roberta
title: RoBERTa
- local: in_translation
title: (In translation) RoBERTa-PreLayerNorm
- local: in_translation
View File
@ -73,7 +73,7 @@ model.config # model has access to its config
5. Type annotations should be used for function signatures. Other than that, good variable names are far more readable and understandable than type annotations.

### Overview of tokenizers [[overview-of-tokenizers]]

Not quite ready yet :-( This section will be added soon!

## Step-by-step recipe to add a model to 🤗 Transformers [[stepbystep-recipe-to-add-a-model-to-transformers]]
@ -272,7 +272,7 @@ cd transformers
An existing model:
```bash
transformers-cli add-new-model-like
transformers add-new-model-like
```
You will be prompted with a questionnaire to fill in the basic information of your model.
View File
@ -63,7 +63,7 @@ limitations under the License.
To automatically get the operating system and software versions, run the following command:
```bash
transformers-cli env
transformers env
```
You can also run the same command from the root directory of the repository:
View File
@ -1165,7 +1165,7 @@ python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'
### DeepSpeed process killed at startup[[deepspeed-process-killed-at-startup]]

If the DeepSpeed process is killed during launch without a traceback, it usually means the program tried to allocate more CPU memory than your system has, or more than the process is allowed to allocate, so the OS kernel killed the process. In that case, check whether your configuration file has `offload_optimizer`, `offload_param`, or both configured to offload to the CPU.

If you have NVMe and ZeRO-3 set up, experiment with offloading to NVMe ([estimate](https://deepspeed.readthedocs.io/en/latest/memory.html) the memory requirements of your model first).
@ -1211,7 +1211,7 @@ NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(
## Resources[[resources]]

DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it accessible to everyone. To learn more about DeepSpeed, take a look at the [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), the [official documentation](https://www.deepspeed.ai/getting-started/), and the [GitHub repository](https://github.com/deepspeedai/DeepSpeed).

The following papers are also great resources for learning more about ZeRO:
View File
@ -0,0 +1,196 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# ELECTRA[[electra]]
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
## Overview[[overview]]
The ELECTRA model was proposed in the paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
Generators](https://openreview.net/pdf?id=r1xMH1BtvB). ELECTRA is a new pretraining approach which trains two transformer models: a generator and a discriminator. The generator's role is to replace tokens in a sequence, and it is therefore trained as a masked language model. The discriminator, which is the model we are actually interested in, tries to identify which tokens in the sequence were replaced by the generator.
The abstract from the paper is the following:
*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments demonstrate this new pretraining task is more efficient than MLM because it is defined over all input tokens rather than just the small subset that was masked out. As a result, the contextual representations learned by our approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale, where it performs comparably to RoBERTa and XLNet while using less compute, and outperforms them when using the same amount of compute.*
This model was contributed by [lysandre](https://huggingface.co/lysandre). The original code can be found [here](https://github.com/google-research/electra).
## Usage tips[[usage-tips]]
- ELECTRA is the pretraining approach; the underlying architecture is almost identical to BERT. The only difference is the separation of the embedding size and the hidden size: the embedding size is generally smaller, while the hidden size is larger. An additional linear projection layer is used to project the embeddings from their embedding size to the hidden size. When the embedding size and the hidden size are the same, no projection layer is used.
- ELECTRA is a transformer model pretrained with the help of another (small) masked language model. That small model randomly masks part of the input text and inserts new tokens in its place, and ELECTRA has to tell apart the original tokens from the replaced ones. Similar to GAN training, the small generator is trained for a few steps with the objective of restoring the original text (rather than fooling ELECTRA), and then ELECTRA itself is trained.
- The ELECTRA checkpoints saved with [Google Research's implementation](https://github.com/google-research/electra) contain both the generator and the discriminator. The conversion script requires the user to specify which model to export into which architecture. Once converted to the Hugging Face format, these checkpoints can be loaded into any ELECTRA model; in other words, the discriminator can be loaded into an [`ElectraForMaskedLM`] model and the generator into an [`ElectraForPreTraining`] model (in that case the classification head is randomly initialized, since it does not exist in the generator). A minimal loading sketch is shown below.
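For illustration, here is a minimal loading sketch. It assumes the public `google/electra-small-discriminator` and `google/electra-small-generator` checkpoints on the Hub, and loads each one into its usual head (the discriminator into [`ElectraForPreTraining`], whose head scores whether each token was replaced, and the generator into [`ElectraForMaskedLM`]); cross-loading as described above works the same way, only the head weights are then randomly initialized.
```python
from transformers import AutoTokenizer, ElectraForMaskedLM, ElectraForPreTraining

# The discriminator checkpoint: its head predicts, for every token, whether it was replaced.
discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

# The generator checkpoint: a small masked language model that proposes replacement tokens.
generator = ElectraForMaskedLM.from_pretrained("google/electra-small-generator")

# Both checkpoints share the same tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
```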
## Resources[[resources]]
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
## ElectraConfig
[[autodoc]] ElectraConfig
## ElectraTokenizer
[[autodoc]] ElectraTokenizer
## ElectraTokenizerFast
[[autodoc]] ElectraTokenizerFast
## Electra specific outputs
[[autodoc]] models.electra.modeling_electra.ElectraForPreTrainingOutput
[[autodoc]] models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
<frameworkcontent>
<pt>
## ElectraModel
[[autodoc]] ElectraModel
- forward
## ElectraForPreTraining
[[autodoc]] ElectraForPreTraining
- forward
## ElectraForCausalLM
[[autodoc]] ElectraForCausalLM
- forward
## ElectraForMaskedLM
[[autodoc]] ElectraForMaskedLM
- forward
## ElectraForSequenceClassification
[[autodoc]] ElectraForSequenceClassification
- forward
## ElectraForMultipleChoice
[[autodoc]] ElectraForMultipleChoice
- forward
## ElectraForTokenClassification
[[autodoc]] ElectraForTokenClassification
- forward
## ElectraForQuestionAnswering
[[autodoc]] ElectraForQuestionAnswering
- forward
</pt>
<tf>
## TFElectraModel
[[autodoc]] TFElectraModel
- call
## TFElectraForPreTraining
[[autodoc]] TFElectraForPreTraining
- call
## TFElectraForMaskedLM
[[autodoc]] TFElectraForMaskedLM
- call
## TFElectraForSequenceClassification
[[autodoc]] TFElectraForSequenceClassification
- call
## TFElectraForMultipleChoice
[[autodoc]] TFElectraForMultipleChoice
- call
## TFElectraForTokenClassification
[[autodoc]] TFElectraForTokenClassification
- call
## TFElectraForQuestionAnswering
[[autodoc]] TFElectraForQuestionAnswering
- call
</tf>
<jax>
## FlaxElectraModel
[[autodoc]] FlaxElectraModel
- __call__
## FlaxElectraForPreTraining
[[autodoc]] FlaxElectraForPreTraining
- __call__
## FlaxElectraForCausalLM
[[autodoc]] FlaxElectraForCausalLM
- __call__
## FlaxElectraForMaskedLM
[[autodoc]] FlaxElectraForMaskedLM
- __call__
## FlaxElectraForSequenceClassification
[[autodoc]] FlaxElectraForSequenceClassification
- __call__
## FlaxElectraForMultipleChoice
[[autodoc]] FlaxElectraForMultipleChoice
- __call__
## FlaxElectraForTokenClassification
[[autodoc]] FlaxElectraForTokenClassification
- __call__
## FlaxElectraForQuestionAnswering
[[autodoc]] FlaxElectraForQuestionAnswering
- __call__
</jax>
</frameworkcontent>

View File

@ -0,0 +1,230 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# RoBERTa[[roberta]]
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## Overview[[overview]]
The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer and Veselin Stoyanov. It is based on Google's BERT model released in 2018.
It builds on BERT and modifies key hyperparameters, removes the next-sentence prediction pretraining objective, and trains with much larger mini-batches and learning rates.
The abstract from the paper is the following:
*Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.*
This model was contributed by [julien-c](https://huggingface.co/julien-c). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
## Usage tips[[usage-tips]]
- This implementation is the same as [`BertModel`] with a minor tweak to the embeddings, and a setup tailored to RoBERTa pretrained models.
- RoBERTa has the same architecture as BERT but uses a byte-level BPE tokenizer (the same as GPT-2) and a different pretraining scheme.
- RoBERTa doesn't use `token_type_ids`, so you don't need to indicate which token belongs to which segment; just separate your segments with the separation token `tokenizer.sep_token` (or `</s>`). A short tokenizer sketch follows this list.
- RoBERTa is similar to BERT but uses better pretraining tricks:
    * Dynamic masking: tokens are masked differently at each epoch, whereas BERT masks them once and for all.
    * Sentence packing: several sentences are packed together to reach up to 512 tokens, so a sample can span multiple documents.
    * Larger batches: training uses larger mini-batches.
    * Byte-level BPE vocabulary: BPE is applied to bytes rather than characters, which handles unicode characters more flexibly.
- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to that model's page for usage examples.
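As a rough sketch of the tokenizer behaviour described above (the checkpoint name `FacebookAI/roberta-base` is simply the public base checkpoint, used here for illustration):
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# Passing a sentence pair: the two segments are joined with sep tokens (`</s></s>`)
# instead of being distinguished through `token_type_ids` as in BERT.
encoding = tokenizer("A first sentence.", "A second sentence.")

# The decoded string shows the two segments separated by `</s></s>`.
print(tokenizer.decode(encoding["input_ids"]))

# Typically only `input_ids` and `attention_mask` are returned, no `token_type_ids`.
print(encoding.keys())
```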
## Resources[[resources]]
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
<PipelineTag pipeline="text-classification"/>
- A blog post on [Getting Started with Sentiment Analysis on Twitter](https://huggingface.co/blog/sentiment-analysis-twitter) using RoBERTa and the [Inference API](https://huggingface.co/inference-api).
- A blog post on [Opinion Classification with Kili and Hugging Face AutoTrain](https://huggingface.co/blog/opinion-classification-with-kili) using RoBERTa.
- A notebook on how to [fine-tune RoBERTa for sentiment analysis](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb). 🌎
- [`RobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
- [`TFRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).
- [`FlaxRobertaForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb).
- [Text classification task guide](../tasks/sequence_classification)
<PipelineTag pipeline="token-classification"/>
- [`RobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb).
- [`TFRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).
- [`FlaxRobertaForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification).
- [Token classification chapter](https://huggingface.co/course/chapter7/2?fw=pt) of the 🤗 Hugging Face Course.
- [Token classification task guide](../tasks/token_classification)
<PipelineTag pipeline="fill-mask"/>
- A blog post on [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train) with RoBERTa.
- [`RobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
- [`TFRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
- [`FlaxRobertaForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb).
- [Masked language modeling chapter](https://huggingface.co/course/chapter7/3?fw=pt) of the 🤗 Hugging Face Course.
- [Masked language modeling task guide](../tasks/masked_language_modeling)
<PipelineTag pipeline="question-answering"/>
- A blog post on [Accelerated Inference with Optimum and Transformers Pipelines](https://huggingface.co/blog/optimum-inference) with RoBERTa for question answering.
- [`RobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
- [`TFRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
- [`FlaxRobertaForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering).
- [Question answering chapter](https://huggingface.co/course/chapter7/7?fw=pt) of the 🤗 Hugging Face Course.
- [Question answering task guide](../tasks/question_answering)
**Multiple choice**
- [`RobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb).
- [`TFRobertaForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb).
- [Multiple choice task guide](../tasks/multiple_choice)
## RobertaConfig
[[autodoc]] RobertaConfig
## RobertaTokenizer
[[autodoc]] RobertaTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
## RobertaTokenizerFast
[[autodoc]] RobertaTokenizerFast
- build_inputs_with_special_tokens
<frameworkcontent>
<pt>
## RobertaModel
[[autodoc]] RobertaModel
- forward
## RobertaForCausalLM
[[autodoc]] RobertaForCausalLM
- forward
## RobertaForMaskedLM
[[autodoc]] RobertaForMaskedLM
- forward
## RobertaForSequenceClassification
[[autodoc]] RobertaForSequenceClassification
- forward
## RobertaForMultipleChoice
[[autodoc]] RobertaForMultipleChoice
- forward
## RobertaForTokenClassification
[[autodoc]] RobertaForTokenClassification
- forward
## RobertaForQuestionAnswering
[[autodoc]] RobertaForQuestionAnswering
- forward
</pt>
<tf>
## TFRobertaModel
[[autodoc]] TFRobertaModel
- call
## TFRobertaForCausalLM
[[autodoc]] TFRobertaForCausalLM
- call
## TFRobertaForMaskedLM
[[autodoc]] TFRobertaForMaskedLM
- call
## TFRobertaForSequenceClassification
[[autodoc]] TFRobertaForSequenceClassification
- call
## TFRobertaForMultipleChoice
[[autodoc]] TFRobertaForMultipleChoice
- call
## TFRobertaForTokenClassification
[[autodoc]] TFRobertaForTokenClassification
- call
## TFRobertaForQuestionAnswering
[[autodoc]] TFRobertaForQuestionAnswering
- call
</tf>
<jax>
## FlaxRobertaModel
[[autodoc]] FlaxRobertaModel
- __call__
## FlaxRobertaForCausalLM
[[autodoc]] FlaxRobertaForCausalLM
- __call__
## FlaxRobertaForMaskedLM
[[autodoc]] FlaxRobertaForMaskedLM
- __call__
## FlaxRobertaForSequenceClassification
[[autodoc]] FlaxRobertaForSequenceClassification
- __call__
## FlaxRobertaForMultipleChoice
[[autodoc]] FlaxRobertaForMultipleChoice
- __call__
## FlaxRobertaForTokenClassification
[[autodoc]] FlaxRobertaForTokenClassification
- __call__
## FlaxRobertaForQuestionAnswering
[[autodoc]] FlaxRobertaForQuestionAnswering
- __call__
</jax>
</frameworkcontent>

View File

@ -0,0 +1,155 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Keypoint detection [[keypoint-detection]]
[[open-in-colab]]
Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features such as facial features or parts of objects.
Keypoint detection models take an image as input and return the following outputs:
- **Keypoints and scores**: the points of interest together with a confidence score for each point.
- **Descriptors**: a representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties.
In this guide, we will cover how to extract keypoints from images.
For this tutorial, we will use [SuperPoint](./model_doc/superpoint), a foundational model for keypoint detection.
```python
from transformers import AutoImageProcessor, SuperPointForKeypointDetection
processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
```
Let's test the model on the images below.
<div style="display: flex; align-items: center;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
alt="Bee"
style="height: 200px; object-fit: contain; margin-right: 10px;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
alt="Cats"
style="height: 200px; object-fit: contain;">
</div>
```python
import torch
from PIL import Image
import requests
import cv2
url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
images = [image_1, image_2]
```
We can now process our inputs and run inference.
```python
inputs = processor(images, return_tensors="pt").to(model.device, model.dtype)
outputs = model(**inputs)
```
The model output contains the relative keypoints, descriptors, masks and scores for every item in the batch. The mask highlights the areas of the image where keypoints are present.
```python
SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167],
[0.0688, 0.0167],
[0.0172, 0.0188],
...,
[0.5984, 0.9812],
[0.6953, 0.9812]]]),
scores=tensor([[0.0056, 0.0053, 0.0079, ..., 0.0125, 0.0539, 0.0377],
[0.0206, 0.0058, 0.0065, ..., 0.0000, 0.0000, 0.0000]],
grad_fn=<CopySlices>), descriptors=tensor([[[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
...],
grad_fn=<CopySlices>), mask=tensor([[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None)
```
To plot the actual keypoints in the image, we need to post-process the output. To do so, we pass the actual image sizes to `post_process_keypoint_detection` along with the outputs.
```python
image_sizes = [(image.size[1], image.size[0]) for image in images]
outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
```
The outputs are now a list of dictionaries, where each dictionary contains the post-processed keypoints, scores and descriptors for one image.
```python
[{'keypoints': tensor([[ 226, 57],
[ 356, 57],
[ 89, 64],
...,
[3604, 3391]], dtype=torch.int32),
'scores': tensor([0.0056, 0.0053, ...], grad_fn=<IndexBackward0>),
'descriptors': tensor([[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357]],
grad_fn=<IndexBackward0>)},
{'keypoints': tensor([[ 46, 6],
[ 78, 6],
[422, 6],
[206, 404]], dtype=torch.int32),
'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...,grad_fn=<IndexBackward0>),
'descriptors': tensor([[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211],
[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211]}]
```
We can now use these dictionaries to plot the keypoints.
```python
import matplotlib.pyplot as plt
import torch

for i in range(len(images)):
    # Convert the post-processed keypoints and scores to NumPy for plotting;
    # the descriptors are available under outputs[i]["descriptors"] as well.
    keypoints = outputs[i]["keypoints"].detach().numpy()
    scores = outputs[i]["scores"].detach().numpy()
    image = images[i]

    plt.axis('off')
    plt.imshow(image)
    plt.scatter(
        keypoints[:, 0],
        keypoints[:, 1],
        s=scores * 100,
        c='cyan',
        alpha=0.4
    )
    plt.show()
```
You can see the results below.
<div style="display: flex; align-items: center;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee_keypoint.png"
alt="Bee"
style="height: 200px; object-fit: contain; margin-right: 10px;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats_keypoint.png"
alt="Cats"
style="height: 200px; object-fit: contain;">
</div>
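The descriptors returned above are not used in the plot, but they can be compared across images. As a rough, illustrative sketch (not part of the original guide), a naive cosine-similarity matcher between the two example images could look like the following; the 0.9 threshold is an arbitrary choice.
```python
import torch

# Normalize the SuperPoint descriptors of both images so that a dot product
# becomes a cosine similarity.
desc_0 = torch.nn.functional.normalize(outputs[0]["descriptors"], dim=-1)
desc_1 = torch.nn.functional.normalize(outputs[1]["descriptors"], dim=-1)

# Similarity between every keypoint of image 0 and every keypoint of image 1.
similarity = desc_0 @ desc_1.T
best_score, best_idx = similarity.max(dim=1)

# Keep only confident tentative matches (threshold chosen arbitrarily).
keep = best_score > 0.9
matches = list(zip(torch.nonzero(keep).flatten().tolist(), best_idx[keep].tolist()))
print(f"{len(matches)} tentative matches")
```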

View File

@ -21,10 +21,10 @@ que podem ser carregados usando os métodos `from_pretrained` da biblioteca.
<Tip>
Since version 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**), available in any installation of
Since version 2.3.0 the conversion script is now part of the transformers CLI (**transformers**), available in any installation of
transformers >= 2.3.0.
The documentation below reflects the **transformers-cli convert** command format.
The documentation below reflects the **transformers convert** command format.
</Tip>
@ -49,7 +49,7 @@ Aqui está um exemplo do processo de conversão para um modelo `BERT-Base Uncase
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
transformers convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -71,7 +71,7 @@ Aqui está um exemplo do processo de conversão para o modelo `ALBERT Base` pré
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
transformers convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -88,7 +88,7 @@ foi salvo com o mesmo formato do modelo pré-treinado OpenAI (veja [aqui](https:
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
transformers convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
@ -102,7 +102,7 @@ Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
transformers convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
@ -117,7 +117,7 @@ Aqui está um exemplo do processo de conversão para um modelo XLNet pré-treina
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
transformers convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -131,7 +131,7 @@ Aqui está um exemplo do processo de conversão para um modelo XLM pré-treinado
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
transformers convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
@ -145,7 +145,7 @@ Aqui está um exemplo do processo de conversão para um modelo T5 pré-treinado:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
transformers convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin

View File

@ -63,7 +63,7 @@ limitations under the License.
To get the operating system and software versions automatically, run the following command:
```bash
transformers-cli env
transformers env
```
You can also run the same command from the root of the repository:

View File

@ -99,7 +99,7 @@ class ModelArguments:
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"help": "Will use the token generated when running `transformers login` (necessary to use this script "
"with private models)."
},
)

View File

@ -19,6 +19,7 @@ import time
from json import JSONDecodeError
from logging import getLogger
from pathlib import Path
from typing import Optional
import torch
from torch.utils.data import DataLoader
@ -54,7 +55,7 @@ def eval_data_dir(
task="summarization",
local_rank=None,
num_return_sequences=1,
dataset_kwargs: dict = None,
dataset_kwargs: Optional[dict] = None,
prefix="",
**generate_kwargs,
) -> dict:

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_new_imgproc_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Optional, Union
from typing import Dict, List, Optional, Union
import numpy as np
import torch
@ -74,13 +74,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
def __init__(
self,
do_resize: bool = True,
size: dict[str, int] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
**kwargs,
) -> None:
@ -101,7 +101,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
def resize(
self,
image: np.ndarray,
size: dict[str, int],
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
@ -151,15 +151,15 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[dict[str, int]] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
do_convert_rgb: bool = None,
do_convert_rgb: Optional[bool] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> PIL.Image.Image:

View File

@ -5,7 +5,7 @@
# modular_add_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that zamba does not have the `apply_rotary_pos_emb` function!
from typing import Optional
from typing import Optional, Tuple
import torch
from torch import nn
@ -62,5 +62,5 @@ class TestAttention(nn.Module):
def __init__(self):
pass
def forward(self) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
def forward(self) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
_ = apply_rotary_pos_emb(1, 1, 1, 1)

View File

@ -4,27 +4,41 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_dummy.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from functools import partial
from typing import Callable, Optional, Union
from typing import Callable, Optional, Tuple, Union
import torch
from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
is_torch_flex_attn_available,
logging,
)
from .configuration_dummy import DummyConfig
if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask
from ...integrations.flex_attention import make_flex_block_causal_mask
logger = logging.get_logger(__name__)
@use_kernel_forward_from_hub("RMSNorm")
class DummyRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@ -63,45 +77,18 @@ class DummyRotaryEmbedding(nn.Module):
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
"""
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
"""
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached: # growth
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
# This .to() is needed if the model has been moved to a device after being initialized (because
# the buffer is automatically moved, but not the original copy)
self.original_inv_freq = self.original_inv_freq.to(device)
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
# Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False): # Force float32
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
cos = emb.cos() * self.attention_scaling
sin = emb.sin() * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
@ -223,12 +210,12 @@ class DummyAttention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -245,6 +232,7 @@ class DummyAttention(nn.Module):
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
@ -270,7 +258,7 @@ class DummyAttention(nn.Module):
return attn_output, attn_weights
class DummyDecoderLayer(nn.Module):
class DummyDecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: DummyConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
@ -290,11 +278,10 @@ class DummyDecoderLayer(nn.Module):
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
@ -369,6 +356,8 @@ class DummyPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, DummyRMSNorm):
module.weight.data.fill_(1.0)
DUMMY_INPUTS_DOCSTRING = r"""
@ -381,12 +370,15 @@ DUMMY_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
but you can also pass a `BlockMask` object directly here.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
@ -406,20 +398,12 @@ DUMMY_INPUTS_DOCSTRING = r"""
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance, see our
[kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@ -480,10 +464,11 @@ class DummyModel(DummyPreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(DUMMY_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
@ -491,16 +476,14 @@ class DummyModel(DummyPreTrainedModel):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> Union[tuple, BaseModelOutputWithPast]:
) -> BaseModelOutputWithPast:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@ -511,6 +494,10 @@ class DummyModel(DummyPreTrainedModel):
)
use_cache = False
# TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
if not isinstance(past_key_values, (type(None), Cache)):
raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
@ -543,30 +530,17 @@ class DummyModel(DummyPreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
partial(decoder_layer.__call__, **flash_attn_kwargs),
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
position_embeddings,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**flash_attn_kwargs,
)
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
@ -579,26 +553,29 @@ class DummyModel(DummyPreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
output = BaseModelOutputWithPast(
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=past_key_values if use_cache else None,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
return output if return_dict else output.to_tuple()
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"],
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
output_attentions: bool = False,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
if self.config._attn_implementation == "flex_attention":
if isinstance(attention_mask, torch.Tensor):
attention_mask = make_flex_block_causal_mask(attention_mask)
return attention_mask
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
@ -616,7 +593,7 @@ class DummyModel(DummyPreTrainedModel):
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@ -633,7 +610,6 @@ class DummyModel(DummyPreTrainedModel):
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@ -641,7 +617,7 @@ class DummyModel(DummyPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type in ["cuda", "xpu"]
and attention_mask.device.type in ["cuda", "xpu", "npu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
@ -658,7 +634,6 @@ class DummyModel(DummyPreTrainedModel):
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
@ -678,8 +653,6 @@ class DummyModel(DummyPreTrainedModel):
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to plcae the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
@ -691,11 +664,11 @@ class DummyModel(DummyPreTrainedModel):
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit

View File

@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
from typing import Optional, Union
from typing import Optional, Tuple, Union
import torch
from packaging import version
@ -136,9 +136,9 @@ class DummyBertSelfAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
@ -245,9 +245,9 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
# TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
logger.warning_once(
@ -386,9 +386,9 @@ class DummyBertAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
@ -454,9 +454,9 @@ class DummyBertLayer(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
@ -532,12 +532,12 @@ class DummyBertEncoder(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@ -626,6 +626,46 @@ class DummyBertPooler(nn.Module):
return pooled_output
class DummyBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class DummyBertLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = DummyBertPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
def _tie_weights(self):
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
@ -726,6 +766,8 @@ class DummyBertPreTrainedModel(PreTrainedModel):
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, DummyBertLMPredictionHead):
module.bias.data.zero_()
DUMMY_BERT_START_DOCSTRING = r"""

View File

@ -4,28 +4,48 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_from_uppercase_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Optional
from typing import Callable, Optional, Tuple, Union
import torch
from torch import nn
from ...activations import ACT2FN
from ...pytorch_utils import is_torch_greater_or_equal_than_2_2
from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from .configuration_from_uppercase_model import FromUppercaseModelConfig
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import logging
from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig
logger = logging.get_logger(__name__)
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
scaling: float,
dropout: float = 0.0,
output_attentions: bool = True,
**kwargs,
):
attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
attn_output = torch.matmul(attn_weights, value)
attn_output = attn_output.transpose(1, 2).contiguous()
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
class FromUppercaseModelAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
def __init__(self, config: Union[FromUppercaseModelVisionConfig, FromUppercaseModelTextConfig]):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
@ -38,253 +58,71 @@ class FromUppercaseModelAttention(nn.Module):
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.is_causal = False
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.size()
batch_size, seq_length, embed_dim = hidden_states.shape
# get query proj
query_states = self.q_proj(hidden_states) * self.scale
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
queries = self.q_proj(hidden_states)
keys = self.k_proj(hidden_states)
values = self.v_proj(hidden_states)
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
key_states = key_states.view(*proj_shape)
value_states = value_states.view(*proj_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
f" {attn_weights.size()}"
)
# apply the causal_attention_mask first
if causal_attention_mask is not None:
if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
f" {causal_attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if output_attentions:
# this operation is a bit akward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# FROM_UPPERCASE_MODEL text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
attn_weights_reshaped = None
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped
class FromUppercaseModelFlashAttention2(FromUppercaseModelAttention):
"""
FromUppercaseModelAttention flash attention module. This module inherits from `FromUppercaseModelAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
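To make the version note above concrete, here is a minimal standalone sketch (not part of the diff) of the two causal-mask alignments the comment refers to, for a query shorter than the key sequence:

```py
import torch

q_len, kv_len = 3, 5
# bottom-right alignment (flash_attn>=2.1): query i attends to keys [0, kv_len - q_len + i]
bottom_right = torch.ones(q_len, kv_len, dtype=torch.bool).tril(diagonal=kv_len - q_len)
# top-left alignment (flash_attn<2.1): query i attends to keys [0, i] only
top_left = torch.ones(q_len, kv_len, dtype=torch.bool).tril(diagonal=0)
print(bottom_right)
print(top_left)
```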
# Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
output_attentions = False
batch_size, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)
dropout_rate = self.dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states get silently cast to float32. Hence, we need
# to cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended to not cast the LayerNorms
# in fp32.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
target_dtype = self.q_proj.weight.dtype
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attn_output, attn_weights = attention_interface(
self,
queries,
keys,
values,
attention_mask,
q_len,
dropout=dropout_rate,
is_causal=causal_attention_mask is not None,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
output_attentions=output_attentions,
)
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
class FromUppercaseModelSdpaAttention(FromUppercaseModelAttention):
"""
SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`FromUppercaseModelAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
# Adapted from FromUppercaseModelAttention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"FromUppercaseModelModel is using FromUppercaseModelSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not "
"support `output_attentions=True`. Falling back to the manual attention implementation, but specifying "
"the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can "
'be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
# FROM_UPPERCASE_MODEL text model uses both `causal_attention_mask` and `attention_mask`
if attention_mask is not None and causal_attention_mask is not None:
attn_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attn_mask = causal_attention_mask
else:
attn_mask = attention_mask
bsz, tgt_len, embed_dim = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# FROM_UPPERCASE_MODEL text model uses both `causal_attention_mask` and `attention_mask` sequentially.
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attn_mask,
dropout_p=self.dropout if self.training else 0.0,
scale=self.scale,
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, None
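Note that callers never pick one of the eager/SDPA/flash classes above by hand; the backend is selected through `attn_implementation` when the model is loaded. A hedged sketch with a placeholder checkpoint name:

```py
from transformers import AutoModel

# "org/some-checkpoint" is a placeholder, not a real repository.
model = AutoModel.from_pretrained(
    "org/some-checkpoint",
    attn_implementation="sdpa",  # or "eager" / "flash_attention_2"
)
```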
class FromUppercaseModelMLP(nn.Module):
def __init__(self, config):
super().__init__()
@ -300,18 +138,11 @@ class FromUppercaseModelMLP(nn.Module):
return hidden_states
FROM_UPPERCASE_MODEL_ATTENTION_CLASSES = {
"eager": FromUppercaseModelAttention,
"sdpa": FromUppercaseModelSdpaAttention,
"flash_attention_2": FromUppercaseModelFlashAttention2,
}
class FromUppercaseModelEncoderLayer(nn.Module):
def __init__(self, config: FromUppercaseModelConfig):
def __init__(self, config: Union[FromUppercaseModelVisionConfig, FromUppercaseModelTextConfig]):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = FROM_UPPERCASE_MODEL_ATTENTION_CLASSES[config._attn_implementation](config)
self.self_attn = FromUppercaseModelAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = FromUppercaseModelMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@ -322,7 +153,7 @@ class FromUppercaseModelEncoderLayer(nn.Module):
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

View File

@ -4,27 +4,41 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_multimodal1.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from functools import partial
from typing import Callable, Optional, Union
from typing import Callable, Optional, Tuple, Union
import torch
from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
is_torch_flex_attn_available,
logging,
)
from .configuration_multimodal1 import Multimodal1TextConfig
if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask
from ...integrations.flex_attention import make_flex_block_causal_mask
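These imports back the flex-attention path used later in `_update_causal_mask`, where a plain 2D padding mask is converted into a `BlockMask` before being handed to flex attention. A minimal sketch, assuming a torch build with flex attention support:

```py
import torch
from transformers.integrations.flex_attention import make_flex_block_causal_mask

attention_mask_2d = torch.tensor([[1, 1, 1, 0]])  # one sequence, last position is padding
block_mask = make_flex_block_causal_mask(attention_mask_2d)  # BlockMask consumed by flex attention
```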
logger = logging.get_logger(__name__)
@use_kernel_forward_from_hub("RMSNorm")
class Multimodal1TextRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@ -63,45 +77,18 @@ class Multimodal1TextRotaryEmbedding(nn.Module):
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
"""
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
"""
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached: # growth
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
# This .to() is needed if the model has been moved to a device after being initialized (because
# the buffer is automatically moved, but not the original copy)
self.original_inv_freq = self.original_inv_freq.to(device)
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
# Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False): # Force float32
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
cos = emb.cos() * self.attention_scaling
sin = emb.sin() * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
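As context, the `(cos, sin)` pair returned above is consumed downstream with the usual rotate-half convention of Llama-style models; a minimal sketch of that step, assuming heads of shape `[batch, num_heads, seq, head_dim]`:

```py
import torch

def rotate_half(x):
    # split the head dimension in two halves and rotate them
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    cos = cos.unsqueeze(1)  # broadcast over the head dimension
    sin = sin.unsqueeze(1)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```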
@ -223,12 +210,12 @@ class Multimodal1TextAttention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -245,6 +232,7 @@ class Multimodal1TextAttention(nn.Module):
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
@ -270,7 +258,7 @@ class Multimodal1TextAttention(nn.Module):
return attn_output, attn_weights
class Multimodal1TextDecoderLayer(nn.Module):
class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: Multimodal1TextConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
@ -290,11 +278,10 @@ class Multimodal1TextDecoderLayer(nn.Module):
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
@ -369,6 +356,8 @@ class Multimodal1TextPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, Multimodal1TextRMSNorm):
module.weight.data.fill_(1.0)
MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r"""
@ -381,12 +370,15 @@ MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)` or `BlockMask`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
but you can also pass a `BlockMask` object directly here.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
@ -406,20 +398,12 @@ MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r"""
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance, see our
[kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@ -480,10 +464,11 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(MULTIMODAL1_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
@ -491,16 +476,14 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> Union[tuple, BaseModelOutputWithPast]:
) -> BaseModelOutputWithPast:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@ -511,6 +494,10 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
)
use_cache = False
# TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
if not isinstance(past_key_values, (type(None), Cache)):
raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
@ -543,30 +530,17 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
partial(decoder_layer.__call__, **flash_attn_kwargs),
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
position_embeddings,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**flash_attn_kwargs,
)
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**flash_attn_kwargs,
)
hidden_states = layer_outputs[0]
@ -579,26 +553,29 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
output = BaseModelOutputWithPast(
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=past_key_values if use_cache else None,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
return output if return_dict else output.to_tuple()
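Since legacy tuple caches are rejected above, callers now pass an explicit `Cache` instance (or nothing). A hedged usage sketch; the forward call is illustrative only and assumes `model` is an instance of the class above:

```py
import torch
from transformers import DynamicCache

past_key_values = DynamicCache()
input_ids = torch.tensor([[1, 2, 3]])

# outputs = model(input_ids, past_key_values=past_key_values, use_cache=True)
# outputs.past_key_values is the same DynamicCache, updated in place layer by layer.
```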
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"],
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
output_attentions: bool = False,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
if self.config._attn_implementation == "flex_attention":
if isinstance(attention_mask, torch.Tensor):
attention_mask = make_flex_block_causal_mask(attention_mask)
return attention_mask
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
@ -616,7 +593,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@ -633,7 +610,6 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@ -641,7 +617,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type in ["cuda", "xpu"]
and attention_mask.device.type in ["cuda", "xpu", "npu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
@ -658,7 +634,6 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
@ -678,8 +653,6 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to place the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
@ -691,11 +664,11 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
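A standalone worked example of the masking logic above (mirroring, not calling, the helper): an additive `(sequence_length, target_length)` mask filled with the dtype minimum so masked positions vanish after softmax, with `cache_position` shifting the visible window:

```py
import torch

sequence_length, target_length = 4, 6
dtype = torch.float32
cache_position = torch.arange(2, 2 + sequence_length)  # assume 2 tokens already in the cache

min_dtype = torch.finfo(dtype).min
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :]  # broadcastable to [batch, 1, seq, target]
print(causal_mask[0, 0])  # row i blocks every key position beyond cache_position[i]
```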

View File

@ -5,7 +5,7 @@
# modular_multimodal2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Optional, Union
from typing import Callable, Optional, Tuple, Union
import torch
from torch import nn
@ -14,30 +14,48 @@ from transformers.utils import add_start_docstrings
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import is_torch_greater_or_equal_than_2_2
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import (
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
can_return_tuple,
logging,
replace_return_docstrings,
torch_int,
)
from .configuration_multimodal2 import Multimodal2Config, Multimodal2VisionConfig
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig
logger = logging.get_logger(__name__)
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
scaling: float,
dropout: float = 0.0,
output_attentions: bool = True,
**kwargs,
):
attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
attn_output = torch.matmul(attn_weights, value)
attn_output = attn_output.transpose(1, 2).contiguous()
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
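A hedged sketch of calling the eager path above directly with dummy tensors (shapes `[batch, num_heads, seq_len, head_dim]`); it assumes `eager_attention_forward` as defined above is in scope, and any `nn.Module` works as the first argument since only its `.training` flag is read:

```py
import torch
from torch import nn

batch, heads, seq, dim = 2, 4, 7, 16
q = torch.randn(batch, heads, seq, dim)
k = torch.randn(batch, heads, seq, dim)
v = torch.randn(batch, heads, seq, dim)

out, weights = eager_attention_forward(
    nn.Identity(), q, k, v, attention_mask=None, scaling=dim**-0.5, dropout=0.0
)
print(out.shape)      # torch.Size([2, 7, 4, 16]) after the transpose inside
print(weights.shape)  # torch.Size([2, 4, 7, 7])
```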
class Multimodal2VisionAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
def __init__(self, config: Union[Multimodal2VisionConfig, Multimodal2TextConfig]):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
@ -50,250 +68,68 @@ class Multimodal2VisionAttention(nn.Module):
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.is_causal = False
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.size()
batch_size, seq_length, embed_dim = hidden_states.shape
# get query proj
query_states = self.q_proj(hidden_states) * self.scale
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
key_states = key_states.view(*proj_shape)
value_states = value_states.view(*proj_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
f" {attn_weights.size()}"
)
# apply the causal_attention_mask first
if causal_attention_mask is not None:
if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
f" {causal_attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if output_attentions:
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
else:
attn_weights_reshaped = None
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped
class Multimodal2VisionSdpaAttention(Multimodal2VisionAttention):
"""
SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Multimodal2VisionAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
SDPA API.
"""
# Adapted from Multimodal2VisionAttention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Multimodal2VisionModel is using Multimodal2VisionSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not "
"support `output_attentions=True`. Falling back to the manual attention implementation, but specifying "
"the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can "
'be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
queries = self.q_proj(hidden_states)
keys = self.k_proj(hidden_states)
values = self.v_proj(hidden_states)
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# MULTIMODAL2_VISION text model uses both `causal_attention_mask` and `attention_mask`
if attention_mask is not None and causal_attention_mask is not None:
attn_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attn_mask = causal_attention_mask
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
attn_mask = attention_mask
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
bsz, tgt_len, embed_dim = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# MULTIMODAL2_VISION text model uses both `causal_attention_mask` and `attention_mask` sequentially.
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attn_mask,
dropout_p=self.dropout if self.training else 0.0,
scale=self.scale,
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, None
class Multimodal2VisionFlashAttention2(Multimodal2VisionAttention):
"""
Multimodal2VisionAttention flash attention module. This module inherits from `Multimodal2VisionAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
# Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
output_attentions = False
batch_size, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dim x hidden_dim
# therefore we just need to keep the original shape
query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)
dropout_rate = self.dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states get silently cast to float32. Hence, we need
# to cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended to not cast the LayerNorms
# in fp32.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
target_dtype = self.q_proj.weight.dtype
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attn_output, attn_weights = attention_interface(
self,
queries,
keys,
values,
attention_mask,
q_len,
dropout=dropout_rate,
is_causal=causal_attention_mask is not None,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
output_attentions=output_attentions,
)
attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
@ -312,18 +148,92 @@ class Multimodal2VisionMLP(nn.Module):
return hidden_states
MULTIMODAL2_VISION_ATTENTION_CLASSES = {
"eager": Multimodal2VisionAttention,
"sdpa": Multimodal2VisionSdpaAttention,
"flash_attention_2": Multimodal2VisionFlashAttention2,
}
class Multimodal2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: Union[Multimodal2VisionConfig, Multimodal2TextConfig]):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.is_causal = False
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape
queries = self.q_proj(hidden_states)
keys = self.k_proj(hidden_states)
values = self.v_proj(hidden_states)
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# MULTIMODAL2 text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and output_attentions:
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
queries,
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
output_attentions=output_attentions,
)
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights
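The `attention_interface` dispatch above is a plain registry lookup; a small sketch (the exact key set depends on the installed backends):

```py
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

print(sorted(ALL_ATTENTION_FUNCTIONS.keys()))  # e.g. ['flash_attention_2', 'flex_attention', 'sdpa', ...]
sdpa_fn = ALL_ATTENTION_FUNCTIONS["sdpa"]      # same (module, q, k, v, attention_mask, ...) calling convention
```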
class Multimodal2VisionEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = MULTIMODAL2_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
self.self_attn = Multimodal2Attention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Multimodal2VisionMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@ -334,7 +244,7 @@ class Multimodal2VisionEncoderLayer(nn.Module):
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@ -384,6 +294,7 @@ class Multimodal2VisionEncoder(nn.Module):
self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
@can_return_tuple
def forward(
self,
inputs_embeds,
@ -391,8 +302,7 @@ class Multimodal2VisionEncoder(nn.Module):
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
) -> BaseModelOutput:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@ -426,7 +336,6 @@ class Multimodal2VisionEncoder(nn.Module):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@ -459,10 +368,10 @@ class Multimodal2VisionEncoder(nn.Module):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
last_hidden_state=hidden_states,
hidden_states=encoder_states,
attentions=all_attentions,
)
@ -578,6 +487,7 @@ class Multimodal2VisionTransformer(nn.Module):
self.encoder = Multimodal2VisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@can_return_tuple
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
def forward(
@ -585,9 +495,8 @@ class Multimodal2VisionTransformer(nn.Module):
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
) -> Union[tuple, BaseModelOutputWithPooling]:
) -> BaseModelOutputWithPooling:
r"""
Returns:
@ -596,7 +505,6 @@ class Multimodal2VisionTransformer(nn.Module):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
@ -604,20 +512,16 @@ class Multimodal2VisionTransformer(nn.Module):
hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
@ -662,6 +566,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@can_return_tuple
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
def forward(
@ -670,8 +575,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
) -> BaseModelOutputWithPooling:
r"""
Returns:
@ -694,12 +598,10 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
interpolate_pos_encoding=interpolate_pos_encoding,
)

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_my_new_model2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Union
from typing import Callable, List, Optional, Tuple, Union
import torch
from torch import nn
@ -13,14 +13,27 @@ from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
is_torch_flex_attn_available,
logging,
)
from .configuration_my_new_model2 import MyNewModel2Config
if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask
from ...integrations.flex_attention import make_flex_block_causal_mask
logger = logging.get_logger(__name__)
@ -78,45 +91,18 @@ class MyNewModel2RotaryEmbedding(nn.Module):
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
"""
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
"""
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached: # growth
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
# This .to() is needed if the model has been moved to a device after being initialized (because
# the buffer is automatically moved, but not the original copy)
self.original_inv_freq = self.original_inv_freq.to(device)
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
# Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False): # Force float32
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
cos = emb.cos() * self.attention_scaling
sin = emb.sin() * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
@ -222,12 +208,12 @@ class MyNewModel2Attention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -244,6 +230,7 @@ class MyNewModel2Attention(nn.Module):
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
@ -269,7 +256,7 @@ class MyNewModel2Attention(nn.Module):
return attn_output, attn_weights
class MyNewModel2DecoderLayer(nn.Module):
class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: MyNewModel2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
@ -289,11 +276,10 @@ class MyNewModel2DecoderLayer(nn.Module):
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
@ -368,6 +354,8 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, MyNewModel2RMSNorm):
module.weight.data.fill_(1.0)
MY_NEW_MODEL2_INPUTS_DOCSTRING = r"""
@ -380,12 +368,15 @@ MY_NEW_MODEL2_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)` or `BlockMask`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
but you can also pass a `BlockMask` object directly here.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
@ -405,20 +396,12 @@ MY_NEW_MODEL2_INPUTS_DOCSTRING = r"""
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance, see our
[kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@ -479,27 +462,26 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs, # NOOP kwarg for now
) -> Union[tuple, BaseModelOutputWithPast]:
) -> BaseModelOutputWithPast:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@ -549,29 +531,16 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
position_embeddings,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
hidden_states = layer_outputs[0]
@ -584,26 +553,29 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)
output = BaseModelOutputWithPast(
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=past_key_values if use_cache else None,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
return output if return_dict else output.to_tuple()
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"],
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
output_attentions: bool = False,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
if self.config._attn_implementation == "flex_attention":
if isinstance(attention_mask, torch.Tensor):
attention_mask = make_flex_block_causal_mask(attention_mask)
return attention_mask
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
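As a rough illustration of the flex-attention branch above, the same helper can be called directly on a 2D padding mask. This is a sketch under the assumption that a recent PyTorch build with `torch.nn.attention.flex_attention` is installed.

```python
import torch
from transformers.integrations.flex_attention import make_flex_block_causal_mask

# One sequence of length 4 with the last position padded out.
attention_mask = torch.tensor([[1, 1, 1, 0]])
block_mask = make_flex_block_causal_mask(attention_mask)
# `block_mask` is a BlockMask and can be passed as `attention_mask` when
# config._attn_implementation == "flex_attention".
```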
@ -621,7 +593,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@ -638,7 +610,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@ -646,7 +617,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type in ["cuda", "xpu"]
and attention_mask.device.type in ["cuda", "xpu", "npu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
@ -663,7 +634,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
@ -683,8 +653,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to place the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
@ -696,11 +664,11 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
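A standalone, worked instance (not from this diff) of the mask construction above, assuming fp32 and no padding mask, to make the `cache_position` comparison concrete:

```python
import torch

dtype = torch.float32
sequence_length, target_length, batch_size = 3, 5, 1
cache_position = torch.arange(2, 2 + sequence_length)  # the new tokens start at position 2
min_dtype = torch.finfo(dtype).min

causal_mask = torch.full(
    (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
    causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)

print((causal_mask == 0).int()[0, 0])
# [[1, 1, 1, 0, 0],
#  [1, 1, 1, 1, 0],
#  [1, 1, 1, 1, 1]]  -> each query attends to all cached positions up to its cache_position
```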
@ -747,29 +715,28 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
def set_input_embeddings(self, value):
self.model.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, SequenceClassifierOutputWithPast]:
) -> SequenceClassifierOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.model(
transformer_outputs: BaseModelOutputWithPast = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
@ -778,9 +745,8 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = transformer_outputs.last_hidden_state
logits = self.score(hidden_states)
if input_ids is not None:
@ -795,7 +761,7 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
elif input_ids is not None:
# To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
token_indices = torch.arange(input_ids.shape[-1], device=logits.device)
token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
else:
last_non_pad_token = -1
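To make the padding-agnostic pooling above concrete, a small worked example (illustrative, assuming `pad_token_id = 0`):

```python
import torch

input_ids = torch.tensor([[5, 6, 7, 0, 0],   # right-padded
                          [0, 0, 3, 4, 9]])  # left-padded
pad_token_id = 0

non_pad_mask = (input_ids != pad_token_id).int()
token_indices = torch.arange(input_ids.shape[-1], dtype=torch.int32)
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
print(last_non_pad_token)  # tensor([2, 4]) -> index of the rightmost real token in each row
```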
@ -810,10 +776,6 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
if labels is not None:
loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,

View File

@ -5,7 +5,7 @@
# modular_new_task_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from dataclasses import dataclass
from typing import ClassVar, Optional, Union
from typing import ClassVar, List, Optional, Tuple, Union
import torch
from torch import nn
@ -59,10 +59,10 @@ class NewTaskModelCausalLMOutputWithPast(ModelOutput):
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[torch.FloatTensor] = None
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[torch.FloatTensor] = None
@ -113,23 +113,12 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
# important: this ported version of NewTaskModel isn't meant for training from scratch - only
# inference and fine-tuning
std = (
self.config.initializer_range
if hasattr(self.config, "initializer_range")
else self.config.text_config.initializer_range
)
std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
if hasattr(module, "class_embedding"):
module.class_embedding.data.normal_(mean=0.0, std=std)
if isinstance(module, (nn.Linear, nn.Conv2d)):
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
NEW_TASK_MODEL_INPUTS_DOCSTRING = r"""
@ -251,19 +240,22 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
def _update_causal_mask(
self,
attention_mask,
token_type_ids,
past_key_values,
cache_position,
input_tensor,
is_training: bool = False,
token_type_ids=None,
past_key_values=None,
cache_position=None,
input_tensor=None,
is_training: Optional[bool] = None,
):
if self.config.text_config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
return attention_mask
return None
is_training = is_training if is_training is not None else self.training
using_static_cache = isinstance(past_key_values, StaticCache)
min_dtype = torch.finfo(self.dtype).min
if input_tensor is None:
input_tensor = attention_mask
inputs_lead_dim, sequence_length = input_tensor.shape[:2]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@ -298,6 +290,8 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
# First unmask prefix tokens during training
if is_training:
if token_type_ids is None:
raise ValueError("Token type ids must be provided during training")
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
)
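A toy worked example (not from the diff) of the prefix-unmasking step above: positions whose `token_type_id` is 0 (the prefix part) are made fully visible during training, while the rest stays causal.

```python
import torch

min_dtype = torch.finfo(torch.float32).min
causal_mask = torch.triu(torch.full((4, 4), min_dtype), diagonal=1)[None, None]
token_type_ids = torch.tensor([[0, 0, 1, 1]])  # the first two positions form the prefix
mask_length = causal_mask.shape[-1]

causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
    token_type_ids[:, None, None, :mask_length].to(causal_mask.device) == 0, 0
)
print((causal_mask == 0).int()[0, 0])
# [[1, 1, 0, 0],
#  [1, 1, 0, 0],
#  [1, 1, 1, 0],
#  [1, 1, 1, 1]]  -> the prefix is bidirectional, the suffix remains causal
```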
@ -345,7 +339,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
num_logits_to_keep: int = 0,
) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@ -368,19 +362,19 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
>>> import requests
>>> from transformers import AutoProcessor, NewTaskModelForNewTask
>>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf")
>>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf")
>>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
>>> prompt = "answer en Where is the cow standing?"
>>> url = "https://huggingface.co/gv-hf/NewTaskModel-test-224px-hf/resolve/main/cow_beach_1.png"
>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(**inputs, max_length=30)
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"answer en Where is the cow standing?\nbeach"
"Where is the cat standing?\nsnow"
```
Returns:
"""

View File

@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
from typing import Optional, Union
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
@ -139,9 +139,9 @@ class RobertaSelfAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
@ -248,9 +248,9 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
# TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
logger.warning_once(
@ -389,9 +389,9 @@ class RobertaAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
@ -457,9 +457,9 @@ class RobertaLayer(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> tuple[torch.Tensor]:
) -> Tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
@ -535,12 +535,12 @@ class RobertaEncoder(nn.Module):
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@ -629,6 +629,46 @@ class RobertaPooler(nn.Module):
return pooled_output
class RobertaPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class RobertaLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = RobertaPredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
def _tie_weights(self):
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
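A quick shape check of the prediction head defined above (illustrative; the import path assumes the equivalent classes in the main library, and the tiny config values are made up):

```python
import torch
from transformers import RobertaConfig
from transformers.models.roberta.modeling_roberta import RobertaLMPredictionHead

config = RobertaConfig(hidden_size=16, vocab_size=100, hidden_act="gelu")
head = RobertaLMPredictionHead(config)

hidden_states = torch.randn(2, 5, config.hidden_size)  # (batch, seq_len, hidden)
logits = head(hidden_states)
print(logits.shape)  # torch.Size([2, 5, 100]) -> one score per vocabulary token
```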
def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
try:
@ -729,6 +769,8 @@ class RobertaPreTrainedModel(PreTrainedModel):
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, RobertaLMPredictionHead):
module.bias.data.zero_()
ROBERTA_START_DOCSTRING = r"""
@ -861,12 +903,12 @@ class RobertaModel(RobertaPreTrainedModel):
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if

View File

@ -4,26 +4,42 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_super.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Union
from typing import Callable, Optional, Tuple, Union
import torch
from torch import nn
from transformers.modeling_outputs import CausalLMOutputWithPast
from ...activations import ACT2FN
from ...cache_utils import Cache, StaticCache
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
is_torch_flex_attn_available,
logging,
)
from .configuration_super import SuperConfig
if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask
from ...integrations.flex_attention import make_flex_block_causal_mask
logger = logging.get_logger(__name__)
@use_kernel_forward_from_hub("RMSNorm")
class SuperRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
@ -62,45 +78,18 @@ class SuperRotaryEmbedding(nn.Module):
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
"""
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
"""
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached: # growth
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
# This .to() is needed if the model has been moved to a device after being initialized (because
# the buffer is automatically moved, but not the original copy)
self.original_inv_freq = self.original_inv_freq.to(device)
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
# Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False): # Force float32
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
cos = emb.cos() * self.attention_scaling
sin = emb.sin() * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
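For context on how the `cos`/`sin` tensors returned above are consumed, here is a hedged sketch using the llama-style convention for `rotate_half` (a comment later in this diff notes that not every model shares the same definition):

```python
import torch

def rotate_half(x):
    # Rotate half of the hidden dims: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    # cos/sin are (batch, seq_len, head_dim); add a broadcast dim for the heads.
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```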
@ -222,12 +211,12 @@ class SuperAttention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -244,6 +233,7 @@ class SuperAttention(nn.Module):
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
@ -269,7 +259,7 @@ class SuperAttention(nn.Module):
return attn_output, attn_weights
class SuperDecoderLayer(nn.Module):
class SuperDecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: SuperConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
@ -289,11 +279,10 @@ class SuperDecoderLayer(nn.Module):
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
@ -368,6 +357,8 @@ class SuperPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, SuperRMSNorm):
module.weight.data.fill_(1.0)
SUPER_INPUTS_DOCSTRING = r"""
@ -380,12 +371,15 @@ SUPER_INPUTS_DOCSTRING = r"""
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
but you can also pass a `BlockMask` object directly here.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
@ -405,20 +399,12 @@ SUPER_INPUTS_DOCSTRING = r"""
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance, see our
[kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@ -479,6 +465,7 @@ class SuperModel(SuperPreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(SUPER_INPUTS_DOCSTRING)
def forward(
self,
@ -492,7 +479,7 @@ class SuperModel(SuperPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[tuple, BaseModelOutputWithPast]:
) -> Union[tuple, CausalLMOutputWithPast]:
out = super().forward(
input_ids,
attention_mask,
@ -510,16 +497,20 @@ class SuperModel(SuperPreTrainedModel):
def _update_causal_mask(
self,
attention_mask: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"],
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool,
output_attentions: bool = False,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
if self.config._attn_implementation == "flex_attention":
if isinstance(attention_mask, torch.Tensor):
attention_mask = make_flex_block_causal_mask(attention_mask)
return attention_mask
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
@ -537,7 +528,7 @@ class SuperModel(SuperPreTrainedModel):
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@ -554,7 +545,6 @@ class SuperModel(SuperPreTrainedModel):
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@ -562,7 +552,7 @@ class SuperModel(SuperPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type in ["cuda", "xpu"]
and attention_mask.device.type in ["cuda", "xpu", "npu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
@ -579,7 +569,6 @@ class SuperModel(SuperPreTrainedModel):
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
@ -599,8 +588,6 @@ class SuperModel(SuperPreTrainedModel):
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to place the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
@ -612,11 +599,11 @@ class SuperModel(SuperPreTrainedModel):
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit

View File

@ -5,7 +5,7 @@
# modular_switch_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that llama and cohere have different definitions for rotate_half
from typing import Callable, Optional
from typing import Callable, Optional, Tuple
import torch
from torch import nn
@ -123,12 +123,12 @@ class SwitchFunctionAttention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -145,6 +145,7 @@ class SwitchFunctionAttention(nn.Module):
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(

File diff suppressed because it is too large

View File

@ -16,9 +16,7 @@ from transformers.models.clip.modeling_clip import (
CLIPAttention,
CLIPEncoder,
CLIPEncoderLayer,
CLIPFlashAttention2,
CLIPPreTrainedModel,
CLIPSdpaAttention,
CLIPVisionModel,
CLIPVisionTransformer,
)
@ -29,23 +27,6 @@ class Multimodal2VisionAttention(CLIPAttention):
pass
# Check that adding the second base class correctly set the parent, even though in clip it does not have the "Vision" part
class Multimodal2VisionSdpaAttention(CLIPSdpaAttention, Multimodal2VisionAttention):
pass
# Check that adding the second base class correctly set the parent, even though in clip it does not have the "Vision" part
class Multimodal2VisionFlashAttention2(CLIPFlashAttention2, Multimodal2VisionAttention):
pass
MULTIMODAL2_VISION_ATTENTION_CLASSES = {
"eager": Multimodal2VisionAttention,
"sdpa": Multimodal2VisionSdpaAttention,
"flash_attention_2": Multimodal2VisionFlashAttention2,
}
class Multimodal2VisionMLP(CLIPMLP):
pass
@ -53,7 +34,6 @@ class Multimodal2VisionMLP(CLIPMLP):
class Multimodal2VisionEncoderLayer(CLIPEncoderLayer):
def __init__(self, config):
super().__init__()
self.self_attn = MULTIMODAL2_VISION_ATTENTION_CLASSES[config._attn_implementation](config)
self.mlp = Multimodal2VisionMLP(config)

View File

@ -0,0 +1,7 @@
from transformers.models.deformable_detr.modeling_deformable_detr import DeformableDetrModel
# Here, the old and new model have by essence a common "detr" suffix. Make sure everything is correctly named
# in this case (i.e., we do not wrongly detect `Detr` as part of a suffix to remove)
class TestDetrModel(DeformableDetrModel):
pass

View File

@ -55,3 +55,4 @@ markers = [
]
log_cli = 1
log_cli_level = "WARNING"
asyncio_default_fixture_loop_scope = "function"

View File

@ -149,7 +149,7 @@ _deps = [
"psutil",
"pyyaml>=5.1",
"pydantic",
"pytest>=7.2.0,<8.0.0",
"pytest>=7.2.0",
"pytest-asyncio",
"pytest-rerunfailures",
"pytest-timeout",
@ -189,7 +189,7 @@ _deps = [
"tiktoken",
"timm<=1.0.11",
"tokenizers>=0.21,<0.22",
"torch>=2.1",
"torch>=2.1,<2.7", # Installing torch 2.7 results in slower compiled LLMs. Pinned while we investigate.
"torchaudio",
"torchvision",
"pyctcdecode>=0.4.0",
@ -466,7 +466,7 @@ setup(
package_data={"": ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx", "py.typed"]},
zip_safe=False,
extras_require=extras,
entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]},
entry_points={"console_scripts": ["transformers=transformers.commands.transformers_cli:main", "transformers-cli=transformers.commands.transformers_cli:main_cli"]},
python_requires=">=3.9.0",
install_requires=list(install_requires),
classifiers=[

View File

@ -261,7 +261,7 @@ _import_structure = {
"AqlmConfig",
"AutoRoundConfig",
"AwqConfig",
"BitNetConfig",
"BitNetQuantConfig",
"BitsAndBytesConfig",
"CompressedTensorsConfig",
"EetqConfig",
@ -757,7 +757,7 @@ if TYPE_CHECKING:
AqlmConfig,
AutoRoundConfig,
AwqConfig,
BitNetConfig,
BitNetQuantConfig,
BitsAndBytesConfig,
CompressedTensorsConfig,
EetqConfig,

View File

@ -37,15 +37,15 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
Args:
audio (`str` or `np.ndarray`):
The audio to be laoded to the numpy array format.
The audio to be loaded to the numpy array format.
sampling_rate (`int`, *optional*, defaults to 16000):
The samlping rate to be used when loading the audio. It should be same as the
The sampling rate to be used when loading the audio. It should be the same as the
sampling rate the model you will be using was trained with.
timeout (`float`, *optional*):
The timeout value in seconds for the URL request.
Returns:
`np.ndarray`: A numpy artay representing the audio.
`np.ndarray`: A numpy array representing the audio.
"""
requires_backends(load_audio, ["librosa"])
@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
tutorial](https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf)
fft_window_size (`int`, *optional*):
Size of the window on which the Fourier transform is applied. This controls the frequency resolution of the
spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.
Example:

View File

@ -359,7 +359,7 @@ class DynamicCache(Cache):
```
"""
def __init__(self, _distributed_cache_data: Iterable = None) -> None:
def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: List[torch.Tensor] = []
@ -376,7 +376,7 @@ class DynamicCache(Cache):
self.key_cache.append(key_states)
self.value_cache.append(value_states)
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
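An illustrative use of the backwards-compatible indexing documented above (shapes are arbitrary placeholders):

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
# (batch, num_heads, seq_len, head_dim) = (1, 2, 4, 8); the values are placeholders.
cache.update(torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8), layer_idx=0)

key_states, value_states = cache[0]   # now typed as Tuple[torch.Tensor, torch.Tensor]
print(cache[0][0].shape[2])           # 4 -> the cached sequence length, as in the docstring
```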
@ -649,7 +649,7 @@ class OffloadedCache(DynamicCache):
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self):
# Evict the previous layer if necessary
@ -1473,7 +1473,7 @@ class EncoderDecoderCache(Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
@ -1697,7 +1697,7 @@ class HybridCache(Cache):
min(config.sliding_window, max_cache_len),
self.head_dim,
)
device = torch.device(device) if device is not None and isinstance(device, str) else None
device = torch.device(device) if device is not None else None
for i in range(config.num_hidden_layers):
if layer_device_map is not None:
layer_device = layer_device_map[i]
@ -1919,7 +1919,7 @@ class HybridChunkedCache(Cache):
full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2)
full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2)
# Fast decoding path -> here as the effective size is still sliding window, it is extremely important
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed adress
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed address
# in memory (the values are the same as the full states, but not the address!!)
if key_states.shape[-2] == 1:
self.key_cache[layer_idx].copy_(full_key_states)
@ -2031,7 +2031,7 @@ class OffloadedHybridCache(HybridChunkedCache):
self.active_device_layer = 0
def initialise_cache_layer(self, layer_idx, key_states):
"""Overriden to use the correct device if offloaded layer (and pin memory)."""
"""Overridden to use the correct device if offloaded layer (and pin memory)."""
if len(self.key_cache) > layer_idx:
return
@ -2243,7 +2243,7 @@ class OffloadedStaticCache(StaticCache):
The device to offload to. Defaults to CPU.
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is splitted between differents gpus. You can know which layers mapped to which device by
and the model is split between different gpus. You can check which layers are mapped to which device by
checking the associated device_map: `model.hf_device_map`.
Example:

View File

@ -512,7 +512,7 @@ def duplicate_module(
new_model_patterns: ModelPatterns,
dest_file: Optional[str] = None,
add_copied_from: bool = True,
attrs_to_remove: List[str] = None,
attrs_to_remove: Optional[List[str]] = None,
):
"""
Create a new module from an existing one and adapting all function and classes names from old patterns to new ones.

View File

@ -358,7 +358,7 @@ class ChatArguments:
"""
# General settings
model_name_or_path: str = field(metadata={"help": "Name of the pre-trained model."})
model_name_or_path: Optional[str] = field(default=None, metadata={"help": "Name of the pre-trained model."})
user: Optional[str] = field(default=None, metadata={"help": "Username to display in chat interface."})
system_prompt: Optional[str] = field(default=None, metadata={"help": "System prompt."})
save_folder: str = field(default="./chat_history/", metadata={"help": "Folder to save chat history."})
@ -435,9 +435,20 @@ class ChatCommand(BaseTransformersCLICommand):
"""
dataclass_types = (ChatArguments,)
chat_parser = parser.add_parser("chat", help=HELP_STRING, dataclass_types=dataclass_types)
group = chat_parser.add_argument_group("Positional arguments")
group.add_argument(
"model_name_or_path_positional", type=str, nargs="?", default=None, help="Name of the pre-trained model."
)
chat_parser.set_defaults(func=chat_command_factory)
def __init__(self, args):
args.model_name_or_path = args.model_name_or_path_positional or args.model_name_or_path
if args.model_name_or_path is None:
raise ValueError("--model_name_or_path required for chat command.")
self.args = args
@staticmethod

View File

@ -73,7 +73,7 @@ class ConvertCommand(BaseTransformersCLICommand):
finetuning_task_name: str,
*args,
):
self._logger = logging.get_logger("transformers-cli/converting")
self._logger = logging.get_logger("transformers/converting")
self._logger.info(f"Loading model {model_type}")
self._model_type = model_type

View File

@ -37,7 +37,7 @@ except (ImportError, AttributeError):
_serve_dependencies_installed = False
logger = logging.get_logger("transformers-cli/serving")
logger = logging.get_logger("transformers/serving")
def serve_command_factory(args: Namespace):

View File

@ -91,7 +91,7 @@ class TrainCommand(BaseTransformersCLICommand):
train_parser.set_defaults(func=train_command_factory)
def __init__(self, args: Namespace):
self.logger = logging.get_logger("transformers-cli/training")
self.logger = logging.get_logger("transformers/training")
self.framework = "tf" if is_tf_available() else "torch"

View File

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from transformers import HfArgumentParser
from transformers.commands.add_fast_image_processor import AddFastImageProcessorCommand
@ -24,9 +25,17 @@ from transformers.commands.run import RunCommand
from transformers.commands.serving import ServeCommand
def main_cli():
warnings.warn(
"`transformers-cli` is deprecated in favour of `transformers` directly and will be removed in v5.",
DeprecationWarning,
)
main()
def main():
parser = HfArgumentParser(prog="Transformers CLI tool", usage="transformers-cli <command> [<args>]")
commands_parser = parser.add_subparsers(help="transformers-cli command helpers")
parser = HfArgumentParser(prog="Transformers CLI tool", usage="transformers <command> [<args>]")
commands_parser = parser.add_subparsers(help="transformers command helpers")
# Register commands
ChatCommand.register_subcommand(commands_parser)

View File

@ -843,29 +843,20 @@ class PretrainedConfig(PushToHubMixin):
):
serializable_config_dict[key] = value
self._remove_keys_not_serialized(serializable_config_dict)
# Key removed only in diff dict
if "_name_or_path" in serializable_config_dict:
del serializable_config_dict["_name_or_path"]
if hasattr(self, "quantization_config"):
serializable_config_dict["quantization_config"] = (
self.quantization_config.to_dict()
if not isinstance(self.quantization_config, dict)
else self.quantization_config
)
# Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = serializable_config_dict.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(serializable_config_dict)
if "_attn_implementation_internal" in serializable_config_dict:
del serializable_config_dict["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_pp_plan"]
if "_name_or_path" in serializable_config_dict:
del serializable_config_dict["_name_or_path"]
return serializable_config_dict
def to_dict(self) -> dict[str, Any]:
@ -878,18 +869,6 @@ class PretrainedConfig(PushToHubMixin):
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
if "_auto_class" in output:
del output["_auto_class"]
if "_commit_hash" in output:
del output["_commit_hash"]
if "_attn_implementation_internal" in output:
del output["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in output:
del output["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in output:
del output["base_model_pp_plan"]
# Transformers version when serializing the model
output["transformers_version"] = __version__
@ -902,16 +881,14 @@ class PretrainedConfig(PushToHubMixin):
output[key] = value
self._remove_keys_not_serialized(output)
if hasattr(self, "quantization_config"):
output["quantization_config"] = (
self.quantization_config.to_dict()
if not isinstance(self.quantization_config, dict)
else self.quantization_config
)
# pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = output.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(output)
return output
@ -1011,6 +988,31 @@ class PretrainedConfig(PushToHubMixin):
if isinstance(value, dict):
self.dict_torch_dtype_to_str(value)
def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None:
"""
Checks for and removes any keys in the dict that should not be serialized when saving the config.
Runs the check recursively on the dict, removing them from all sub-configs as well.
"""
if hasattr(self, "quantization_config"):
# Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = d.pop("_pre_quantization_dtype", None)
if "_auto_class" in d:
del d["_auto_class"]
if "_commit_hash" in d:
del d["_commit_hash"]
if "_attn_implementation_internal" in d:
del d["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in d:
del d["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in d:
del d["base_model_pp_plan"]
for value in d.values():
if isinstance(value, dict):
self._remove_keys_not_serialized(value)
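A small sanity-check sketch (illustrative, not part of this diff) of the behaviour centralized above: internal bookkeeping keys should not survive serialization, including in nested sub-configs.

```python
from transformers import PretrainedConfig

config = PretrainedConfig()
serialized = config.to_dict()

# Keys handled by _remove_keys_not_serialized should not appear in the serialized dict.
assert "_commit_hash" not in serialized
assert "_attn_implementation_internal" not in serialized
assert "base_model_tp_plan" not in serialized and "base_model_pp_plan" not in serialized
```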
@classmethod
def register_for_auto_class(cls, auto_class="AutoConfig"):
"""

View File

@ -19,6 +19,7 @@ allow to make our dependency on SentencePiece optional.
"""
import warnings
from typing import Optional
from packaging import version
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@ -326,7 +327,9 @@ class OpenAIGPTConverter(Converter):
class GPT2Converter(Converter):
def converted(self, vocab: dict[str, int] = None, merges: list[tuple[str, str]] = None) -> Tokenizer:
def converted(
self, vocab: Optional[dict[str, int]] = None, merges: Optional[list[tuple[str, str]]] = None
) -> Tokenizer:
if not vocab:
vocab = self.original_tokenizer.encoder
if not merges:
@ -395,7 +398,9 @@ class HerbertConverter(Converter):
class Qwen2Converter(Converter):
def converted(self, vocab: dict[str, int] = None, merges: list[tuple[str, str]] = None) -> Tokenizer:
def converted(
self, vocab: Optional[dict[str, int]] = None, merges: Optional[list[tuple[str, str]]] = None
) -> Tokenizer:
if not vocab:
vocab = self.original_tokenizer.encoder
if not merges:

View File

@ -24,7 +24,7 @@ from filelock import FileLock
from torch.utils.data import Dataset
from ...tokenization_utils_base import PreTrainedTokenizerBase
from ...utils import logging
from ...utils import check_torch_load_is_safe, logging
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures
@ -122,6 +122,7 @@ class GlueDataset(Dataset):
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not args.overwrite_cache:
start = time.time()
check_torch_load_is_safe()
self.features = torch.load(cached_features_file, weights_only=True)
logger.info(
f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start

View File

@ -24,7 +24,7 @@ from torch.utils.data import Dataset
from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils import check_torch_load_is_safe, logging
from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
@ -148,6 +148,7 @@ class SquadDataset(Dataset):
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not args.overwrite_cache:
start = time.time()
check_torch_load_is_safe()
self.old_features = torch.load(cached_features_file, weights_only=True)
# Legacy cache files have only features, while new cache files

View File

@ -80,7 +80,7 @@ class DebugUnderflowOverflow:
You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overlow.
64K, and we get an overflow.
As you can see, it's the previous frames that we need to look into when the numbers start becoming very large for
fp16.

View File

@ -55,7 +55,7 @@ deps = {
"psutil": "psutil",
"pyyaml": "pyyaml>=5.1",
"pydantic": "pydantic",
"pytest": "pytest>=7.2.0,<8.0.0",
"pytest": "pytest>=7.2.0",
"pytest-asyncio": "pytest-asyncio",
"pytest-rerunfailures": "pytest-rerunfailures",
"pytest-timeout": "pytest-timeout",
@ -91,7 +91,7 @@ deps = {
"tiktoken": "tiktoken",
"timm": "timm<=1.0.11",
"tokenizers": "tokenizers>=0.21,<0.22",
"torch": "torch>=2.1",
"torch": "torch>=2.1,<2.7",
"torchaudio": "torchaudio",
"torchvision": "torchvision",
"pyctcdecode": "pyctcdecode>=0.4.0",

View File

@ -151,17 +151,24 @@ def get_imports(filename: Union[str, os.PathLike]) -> list[str]:
content = f.read()
imported_modules = set()
import transformers.utils
def recursive_look_for_imports(node):
if isinstance(node, ast.Try):
return # Don't recurse into Try blocks and ignore imports in them
return # Don't recurse into Try blocks and ignore imports in them
elif isinstance(node, ast.If):
test = node.test
for condition_node in ast.walk(test):
if isinstance(condition_node, ast.Call) and getattr(condition_node.func, "id", "").startswith(
"is_flash_attn"
):
# Don't recurse into "if flash_attn_available()" blocks and ignore imports in them
return
if isinstance(condition_node, ast.Call):
check_function = getattr(condition_node.func, "id", "")
if (
check_function.endswith("available")
and check_function.startswith("is_flash_attn")
or hasattr(transformers.utils.import_utils, check_function)
):
# Don't recurse into "if flash_attn_available()" or any "if library_available" blocks
# that appears in `transformers.utils.import_utils` and ignore imports in them
return
elif isinstance(node, ast.Import):
# Handle 'import x' statements
for alias in node.names:
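To illustrate the intent of the extended check above, a hedged sketch (assuming `get_imports` remains importable from `transformers.dynamic_module_utils` and keeps this behaviour): imports guarded by availability checks should not be reported.

```python
import tempfile
import textwrap

from transformers.dynamic_module_utils import get_imports

source = textwrap.dedent(
    """
    import torch
    from transformers.utils import is_torchvision_available

    if is_torchvision_available():
        import torchvision  # guarded import, skipped by the AST walk
    """
)
with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
    tmp.write(source)

print(get_imports(tmp.name))  # expected to include "torch" but not "torchvision"
```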

Some files were not shown because too many files have changed in this diff Show More