Merge branch 'main' into add-aimv2-model

2025-07-31 02:02:21 +06:00 · 2025-04-23 20:42:13 +05:30 · 2025-04-23 20:42:13 +05:30 · 7608977498
commit 7608977498
parent 9af3764771 63c6331387
275 changed files with 10100 additions and 4401 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -56,6 +56,12 @@ body:
          - ray/raytune: @richardliaw, @amogkam
          - Big Model Inference: @SunMarc
          - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
+        
+        Devices/Backends:
+        
+          - AMD ROCm: @ivarflakstad
+          - Intel XPU: @IlyasMoutawwakil
+          - Ascend NPU: @ivarflakstad 

        Documentation: @stevhliu

--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,4 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
+      languages: en
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10


-RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -7,7 +7,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN git lfs install

 RUN uv pip install --no-cache-dir pypi-kenlm
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -84,6 +84,9 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
 # Add AMD Quark for quantization testing
 RUN python3 -m pip install --no-cache-dir amd-quark

+# Add AutoRound for quantization testing
+RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
+
 # Add transformers in editable mode
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -167,6 +167,8 @@
    title: Quantization concepts
  - local: quantization/aqlm
    title: AQLM
+  - local: quantization/auto_round
+    title: AutoRound
  - local: quantization/awq
    title: AWQ
  - local: quantization/bitnet
@ -491,8 +493,6 @@
        title: GraniteMoe
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
-      - local: model_doc/granitevision
-        title: GraniteVision
      - local: model_doc/helium
        title: Helium
      - local: model_doc/herbert
@ -513,8 +513,6 @@
        title: Llama2
      - local: model_doc/llama3
        title: Llama3
-      - local: model_doc/llama4
-        title: Llama4
      - local: model_doc/longformer
        title: Longformer
      - local: model_doc/longt5
@ -543,8 +541,6 @@
        title: MegatronGPT2
      - local: model_doc/mistral
        title: Mistral
-      - local: model_doc/mistral3
-        title: Mistral3
      - local: model_doc/mixtral
        title: Mixtral
      - local: model_doc/mluke
@ -595,8 +591,6 @@
        title: Phi
      - local: model_doc/phi3
        title: Phi-3
-      - local: model_doc/phi4_multimodal
-        title: Phi4 Multimodal
      - local: model_doc/phimoe
        title: PhiMoE
      - local: model_doc/phobert
@ -939,6 +933,8 @@
        title: GIT
      - local: model_doc/got_ocr2
        title: GOT-OCR2
+      - local: model_doc/granitevision
+        title: GraniteVision
      - local: model_doc/grounding-dino
        title: Grounding DINO
      - local: model_doc/groupvit
@ -953,6 +949,8 @@
        title: InstructBLIP
      - local: model_doc/instructblipvideo
        title: InstructBlipVideo
+      - local: model_doc/internvl
+        title: InternVL
      - local: model_doc/janus
        title: Janus
      - local: model_doc/kosmos-2
@ -967,6 +965,8 @@
        title: LayoutXLM
      - local: model_doc/lilt
        title: LiLT
+      - local: model_doc/llama4
+        title: Llama4
      - local: model_doc/llava
        title: Llava
      - local: model_doc/llava_next
@ -981,6 +981,8 @@
        title: MatCha
      - local: model_doc/mgp-str
        title: MGP-STR
+      - local: model_doc/mistral3
+        title: Mistral3
      - local: model_doc/mllama
        title: mllama
      - local: model_doc/nougat
@ -997,6 +999,8 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
+      - local: model_doc/phi4_multimodal
+        title: Phi4 Multimodal
      - local: model_doc/pix2struct
        title: Pix2Struct
      - local: model_doc/pixtral
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -31,7 +31,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("I look forward to", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
 # explicitly set to default length because Llama2 generation length is 4096
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -28,7 +28,7 @@ Most of those are only useful if you are adding new models in the library.

 This context manager is a power user tool intended for model adders.
 It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
-To note, this context manager enforces `torch.inference_mode()`.
+To note, this context manager enforces `torch.no_grad()`.

 ### Rationale

@ -43,6 +43,7 @@ import torch
 from PIL import Image
 import requests
 from transformers import LlavaProcessor, LlavaForConditionalGeneration
+from transformers.model_debugging_utils import model_addition_debugger_context
 torch.random.manual_seed(673)

 # load pretrained model and processor
@ -60,12 +61,153 @@ prompt = "<image>Describe this image."
 inputs = processor(text=prompt, images=random_image, return_tensors="pt")

 # call forward method (not .generate!)
-with model_addition_debugger_context(model, "optional_path_to_your_output_file.json"):
+with model_addition_debugger_context(
+  model,
+  debug_path="optional_path_to_your_directory",
+  do_prune_layers=False # This will output ALL the layers of a model.
+  ):
    output = model.forward(**inputs)

 ```


-[[autodoc]] model_addition_debugger
+### Reading results
+
+The debugger generates two files from the forward call, both with the same base name, 
+but ending either with `_SUMMARY.json` or with `_FULL_TENSORS.json`. 
+
+The first one will contain a summary of each module's _input_ and _output_ tensor values and shapes.
+
+```json
+{
+  "module_path": "MolmoForConditionalGeneration",
+  "inputs": {
+    "args": [],
+    "kwargs": {
+      "input_ids": {
+        "shape": "torch.Size([1, 589])",
+        "dtype": "torch.int64"
+      },
+      "attention_mask": {
+        "shape": "torch.Size([1, 589])",
+        "dtype": "torch.int64"
+      },
+      "pixel_values": {
+        "shape": "torch.Size([1, 5, 576, 588])",
+        "dtype": "torch.float32",
+        "mean": "tensor(-8.9514e-01, device='cuda:0')",
+        "std": "tensor(9.2586e-01, device='cuda:0')",
+        "min": "tensor(-1.7923e+00, device='cuda:0')",
+        "max": "tensor(1.8899e+00, device='cuda:0')"
+      }
+    }
+  },
+  "children": [
+    {
+      "module_path": "MolmoForConditionalGeneration.language_model.model.embed_tokens",
+      "inputs": {
+        "args": [
+          {
+            "shape": "torch.Size([1, 589])",
+            "dtype": "torch.int64"
+          }
+        ]
+      },
+      "outputs": {
+        "shape": "torch.Size([1, 589, 3584])",
+        "dtype": "torch.float32",
+        "mean": "tensor(6.5460e-06, device='cuda:0')",
+        "std": "tensor(2.3807e-02, device='cuda:0')",
+        "min": "tensor(-3.3398e-01, device='cuda:0')",
+        "max": "tensor(3.9453e-01, device='cuda:0')"
+      }
+    },
+    {
+      "module_path": "MolmoForConditionalGeneration.vision_tower",
+      "inputs": {
+        "args": [
+          {
+            "shape": "torch.Size([5, 1, 576, 588])",
+            "dtype": "torch.float32",
+            "mean": "tensor(-8.9514e-01, device='cuda:0')",
+            "std": "tensor(9.2586e-01, device='cuda:0')",
+            "min": "tensor(-1.7923e+00, device='cuda:0')",
+            "max": "tensor(1.8899e+00, device='cuda:0')"
+          }
+        ],
+        "kwargs": {
+          "output_hidden_states": "True"
+        }
+      },
+      "children": [
+        { ... and so on
+```
+
+The `_FULL_TENSORS.json` file will display a full view of all tensors, which is useful
+for comparing two files. 
+```json
+      "pixel_values": {
+        "shape": "torch.Size([1, 5, 576, 588])",
+        "dtype": "torch.float32",
+        "value": [
+          "tensor([[[[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
+          "",
+          "         [[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-1.4857e+00, -1.4820e+00, -1.2100e+00,  ..., -6.0979e-01, -5.9650e-01, -3.8527e-01],",
+          "          [-1.6755e+00, -1.7221e+00, -1.4518e+00,  ..., -7.5577e-01, -7.4658e-01, -5.5592e-01],",
+          "          [-7.9957e-01, -8.2162e-01, -5.7014e-01,  ..., -1.3689e+00, -1.3169e+00, -1.0678e+00]],",
+          "",
+          "         [[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-3.0322e-01, -5.0645e-01, -5.8436e-01,  ..., -6.2439e-01, -7.9160e-01, -8.1188e-01],",
+          "          [-4.4921e-01, -6.5653e-01, -7.2656e-01,  ..., -3.4702e-01, -5.2146e-01, -5.1326e-01],",
+          "          [-3.4702e-01, -5.3647e-01, -5.4170e-01,  ..., -1.0915e+00, -1.1968e+00, -1.0252e+00]],",
+          "",
+          "         [[-1.1207e+00, -1.2718e+00, -1.0678e+00,  ..., 1.2013e-01, -1.3126e-01, -1.7197e-01],",
+          "          [-6.9738e-01, -9.1166e-01, -8.5454e-01,  ..., -5.5050e-02, -2.8134e-01, -4.2793e-01],",
+          "          [-3.4702e-01, -5.5148e-01, -5.8436e-01,  ..., 1.9312e-01, -8.6235e-02, -2.1463e-01],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
+          "",
+          "         [[-1.0039e+00, -9.5669e-01, -6.5546e-01,  ..., -1.4711e+00, -1.4219e+00, -1.1389e+00],",
+          "          [-1.0039e+00, -9.5669e-01, -6.5546e-01,  ..., -1.7193e+00, -1.6771e+00, -1.4091e+00],",
+          "          [-1.6317e+00, -1.6020e+00, -1.2669e+00,  ..., -1.2667e+00, -1.2268e+00, -8.9720e-01],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]]]], device='cuda:0')"
+        ],
+        "mean": "tensor(-8.9514e-01, device='cuda:0')",
+        "std": "tensor(9.2586e-01, device='cuda:0')",
+        "min": "tensor(-1.7923e+00, device='cuda:0')",
+        "max": "tensor(1.8899e+00, device='cuda:0')"
+      },
+```
+
+### Comparing between implementations
+
+Once the forward passes of two models have been traced by the debugger, one can compare the `json` output files. See below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong. 
+
+
+![Files difference when debugging](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/files_difference_debugging.png)
+
+
+### Limitations and scope
+
+This feature will only work for torch-based models, and would require more work and a case-by-case approach for, say, `jax`-based models that are usually compiled. Models relying heavily on external kernel calls may work, but the trace will probably miss some things. Regardless, any Python implementation that aims at mimicking another implementation can be traced once instead of being rerun N times with breakpoints.
+
+If you pass `do_prune_layers=False` to your model debugger, ALL the layers will be output to `json`. Otherwise, only the first and last layers will be shown. Passing `do_prune_layers=False` is useful when some layers (typically cross-attention) appear only after N layers.

 [[autodoc]] model_addition_debugger_context
--- a/docs/source/en/internal/modeling_utils.md
+++ b/docs/source/en/internal/modeling_utils.md
@ -20,6 +20,10 @@ This page lists all the custom layers used by the library, as well as the utilit

 Most of those are only useful if you are studying the code of the models in the library.

+## Layers
+
+[[autodoc]] GradientCheckpointingLayer
+
 ## Attention Functions

 [[autodoc]] AttentionInterface
@ -33,23 +37,6 @@ Most of those are only useful if you are studying the code of the models in the

 [[autodoc]] pytorch_utils.Conv1D

-[[autodoc]] modeling_utils.PoolerStartLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerEndLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerAnswerClass
-    - forward
-
-[[autodoc]] modeling_utils.SquadHeadOutput
-
-[[autodoc]] modeling_utils.SQuADHead
-    - forward
-
-[[autodoc]] modeling_utils.SequenceSummary
-    - forward
-
 ## PyTorch Helper Functions

 [[autodoc]] pytorch_utils.apply_chunking_to_forward
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 The key-value (KV) vectors are used to calculate attention scores. For autoregressive models, KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time.

-A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation.md) doc for a more detailed explanation about how a cache works.
+A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation) doc for a more detailed explanation about how a cache works.

 Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case.

--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -92,3 +92,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 ## QuarkConfig

 [[autodoc]] QuarkConfig
+
+## AutoRoundConfig
+
+[[autodoc]] AutoRoundConfig
--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@ -0,0 +1,350 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    </div>
+</div>
+
+# InternVL
+
+The InternVL3 family of Visual Language Models was introduced in [InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models](https://huggingface.co/papers/2504.10479).
+
+The abstract from the paper is the following:
+
+*We introduce InternVL3, a significant advancement in the InternVL series featuring a native multimodal pre-training paradigm. Rather than adapting a text-only large language model (LLM) into a multimodal large language model (MLLM) that supports visual inputs, InternVL3 jointly acquires multimodal and linguistic capabilities from both diverse multimodal data and pure-text corpora during a single pre-training stage. This unified training paradigm effectively addresses the complexities and alignment challenges commonly encountered in conventional post-hoc training pipelines for MLLMs. To further improve performance and scalability, InternVL3 incorporates variable visual position encoding (V2PE) to support extended multimodal contexts, employs advanced post-training techniques such as supervised fine-tuning (SFT) and mixed preference optimization (MPO), and adopts test-time scaling strategies alongside an optimized training infrastructure. Extensive empirical evaluations demonstrate that InternVL3 delivers superior performance across a wide range of multi-modal tasks. In particular, InternVL3-78B achieves a score of 72.2 on the MMMU benchmark, setting a new state-of-the-art among open-source MLLMs. Its capabilities remain highly competitive with leading proprietary models, including ChatGPT-4o, Claude 3.5 Sonnet, and Gemini 2.5 Pro, while also maintaining strong pure-language proficiency. In pursuit of open-science principles, we will publicly release both the training data and model weights to foster further research and development in next-generation MLLMs.*
+
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/internvl_architecture.png" alt="drawing" width="600"/>
+
+<small> Overview of InternVL3 models architecture, which is the same as InternVL2.5. Taken from the <a href="https://huggingface.co/OpenGVLab/InternVL3-1B">original checkpoint.</a> </small>
+
+
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/internvl_overview_performance.png" alt="drawing" width="600"/>
+
+<small> Comparison of InternVL3 performance on OpenCompass against other SOTA VLLMs. Taken from the <a href="https://huggingface.co/OpenGVLab/InternVL3-1B">original checkpoint.</a> </small>
+
+
+
+This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan).
+The original code can be found [here](https://github.com/OpenGVLab/InternVL).
+
+## Usage example
+
+### Inference with Pipeline
+
+Here is how you can use the `image-text-to-text` pipeline to perform inference with the `InternVL3` models in just a few lines of code:
+
+```python
+>>> from transformers import pipeline
+
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {
+...                 "type": "image",
+...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+...             },
+...             {"type": "text", "text": "Describe this image."},
+...         ],
+...     },
+... ]
+
+>>> pipe = pipeline("image-text-to-text", model="OpenGVLab/InternVL3-1B-hf")
+>>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
+>>> outputs[0]["generated_text"]
+'The image showcases a vibrant scene of nature, featuring several flowers and a bee. \n\n1. **Foreground Flowers**: \n   - The primary focus is on a large, pink cosmos flower with a prominent yellow center. The petals are soft and slightly r'
+```
+### Inference on a single image
+
+This example demonstrates how to perform inference on a single image with the InternVL models using chat templates.
+
+> [!NOTE]
+> Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch
+
+>>> torch_device = "cuda"
+>>> model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+...             {"type": "text", "text": "Please describe the image explicitly."},
+...         ],
+...     }
+... ]
+
+>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+>>> generate_ids = model.generate(**inputs, max_new_tokens=50)
+>>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+>>> decoded_output
+'The image shows two cats lying on a pink blanket. The cat on the left is a tabby with a mix of brown, black, and white fur, and it appears to be sleeping with its head resting on the blanket. The cat on the'
+```
+
+### Text-only generation
+This example shows how to generate text using the InternVL model without providing any image input.
+
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch
+
+>>> torch_device = "cuda"
+>>> model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "text", "text": "Write a haiku"},
+...         ],
+...     }
+... ]
+
+>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+
+>>> generate_ids = model.generate(**inputs, max_new_tokens=50)
+>>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+>>> decoded_output
+"Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins."
+```
+
+### Batched image and text inputs
+InternVL models also support batched image and text inputs.
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch
+
+>>> torch_device = "cuda"
+>>> model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+...                 {"type": "text", "text": "Describe this image"},
+...             ],
+...         },
+...     ],
+... ]
+
+
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+>>> output = model.generate(**inputs, max_new_tokens=25)
+
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
+["user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace.",
+ 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of']
+```
+
+### Batched multi-image input
+This implementation of the InternVL models supports batched text-image inputs with a different number of images for each text.
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch
+
+>>> torch_device = "cuda"
+>>> model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+...             ],
+...         },
+...     ],
+... ]
+
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+>>> output = model.generate(**inputs, max_new_tokens=25)
+
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
+["user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace.",
+ 'user\n\n\nThese images depict two different landmarks. Can you identify them?\nassistant\nYes, these images depict the Statue of Liberty and the Golden Gate Bridge.']
+```
+
+### Video input
+InternVL models can also handle video inputs. Here is an example of how to perform inference on a video input using chat templates.
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+>>> import torch
+
+>>> model_checkpoint = "OpenGVLab/InternVL3-8B-hf"
+>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, quantization_config=quantization_config)
+
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {
+...                 "type": "video",
+...                 "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
+...             },
+...             {"type": "text", "text": "What type of shot is the man performing?"},
+...         ],
+...     }
+... ]
+>>> inputs = processor.apply_chat_template(
+...     messages,
+...     return_tensors="pt",
+...     add_generation_prompt=True,
+...     tokenize=True,
+...     return_dict=True,
+...     num_frames=8,
+... ).to(model.device, dtype=torch.float16)
+
+>>> output = model.generate(**inputs, max_new_tokens=25)
+
+>>> decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+>>> decoded_output
+'The man is performing a forehand shot.'
+```
+
+### Interleaved image and video inputs
+This example showcases how to handle a batch of chat conversations with interleaved image and video inputs using a chat template.
+
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+>>> import torch
+
+>>> torch_device = "cuda"
+>>> model_checkpoint = "OpenGVLab/InternVL3-1B-hf"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "video", "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"},
+...                 {"type": "text", "text": "What type of shot is the man performing?"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+... ]
+>>> inputs = processor.apply_chat_template(
+...     messages,
+...     padding=True,
+...     add_generation_prompt=True,
+...     tokenize=True,
+...     return_dict=True,
+...     return_tensors="pt",
+... ).to(model.device, dtype=torch.bfloat16)
+
+>>> outputs = model.generate(**inputs, max_new_tokens=25)
+
+>>> decoded_outputs = processor.batch_decode(outputs, skip_special_tokens=True)
+>>> decoded_outputs
+['user\n\n\nThese images depict two different landmarks. Can you identify them?\nassistant\nThe images depict the Statue of Liberty and the Golden Gate Bridge.',
+ 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
+ "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."]
+```
+
+## InternVLVisionConfig
+
+[[autodoc]] InternVLVisionConfig
+
+## InternVLConfig
+
+[[autodoc]] InternVLConfig
+
+## InternVLVisionModel
+
+[[autodoc]] InternVLVisionModel
+    - forward
+
+## InternVLForConditionalGeneration
+
+[[autodoc]] InternVLForConditionalGeneration
+    - forward
+
+## InternVLProcessor
+
+[[autodoc]] InternVLProcessor
--- a/docs/source/en/model_doc/longformer.md
+++ b/docs/source/en/model_doc/longformer.md
@ -1,5 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

@ -9,93 +8,95 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    </div>
+</div>
+
 # Longformer

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-</div>
+[Longformer](https://huggingface.co/papers/2004.05150) is a transformer model designed for processing long documents. The self-attention operation usually scales quadratically with sequence length, preventing transformers from processing longer sequences. The Longformer attention mechanism overcomes this by scaling linearly with sequence length. It combines local windowed attention with task-specific global attention, enabling efficient processing of documents with thousands of tokens.

-## Overview
+You can find all the original Longformer checkpoints under the [Ai2](https://huggingface.co/allenai?search_models=longformer) organization.

-The Longformer model was presented in [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+> [!TIP]
+> Click on the Longformer models in the right sidebar for more examples of how to apply Longformer to different language tasks.

-The abstract from the paper is the following:
+The example below demonstrates how to fill the `<mask>` token with [`Pipeline`], [`AutoModel`] and from the command line.

-*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
-quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
-mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
-longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
-windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
-evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
-contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
-pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
-WikiHop and TriviaQA.*
-
-This model was contributed by [beltagy](https://huggingface.co/beltagy). The Authors' code can be found [here](https://github.com/allenai/longformer).
-
-## Usage tips
-
- Since the Longformer is based on RoBERTa, it doesn't have `token_type_ids`. You don't need to indicate which
-  token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or
-  `</s>`).
- A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g., what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the local attention section for more information.
-
-## Longformer Self Attention
-
-Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only
-attend "locally" to each other meaning that each token attends to its \\(\frac{1}{2} w\\) previous tokens and
-\\(\frac{1}{2} w\\) succeeding tokens with \\(w\\) being the window length as defined in
-`config.attention_window`. Note that `config.attention_window` can be of type `List` to define a
-different \\(w\\) for each layer. A selected few tokens attend "globally" to all other tokens, as it is
-conventionally done for all tokens in `BertSelfAttention`.
-
-Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices. Also note
-that every "locally" attending token not only attends to tokens within its window \\(w\\), but also to all "globally"
-attending tokens so that global attention is *symmetric*.
-
-The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor
-`global_attention_mask` at run-time appropriately. All Longformer models employ the following logic for
-`global_attention_mask`:
-
- 0: the token attends "locally",
- 1: the token attends "globally".
-
-For more information please also refer to [`~LongformerModel.forward`] method.
-
-Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually
-represents the memory and time bottleneck, can be reduced from \\(\mathcal{O}(n_s \times n_s)\\) to
-\\(\mathcal{O}(n_s \times w)\\), with \\(n_s\\) being the sequence length and \\(w\\) being the average window
-size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of
-"locally" attending tokens.
-
-For more information, please refer to the official [paper](https://arxiv.org/pdf/2004.05150.pdf).
-
-
-## Training
-
-[`LongformerForMaskedLM`] is trained the exact same way [`RobertaForMaskedLM`] is
-trained and should be used as follows:
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
-input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
-mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
+import torch
+from transformers import pipeline

-loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
+pipeline = pipeline(
+    task="fill-mask",
+    model="allenai/longformer-base-4096",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("""San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee.
+Spencer, a fifth-year pro, will be placed on injured reserve soon after undergoing surgery Wednesday to repair the ligament. He injured his knee late in the 49ers’ road victory at Seattle on Sept. 14, and missed last week’s victory over Detroit.
+Tarell Brown and Donald Strickland will compete to replace Spencer with the 49ers, who kept 12 defensive backs on their 53-man roster to start the season. Brown, a second-year pro, got his first career interception last weekend while filling in for Strickland, who also sat out with a knee injury.""")
 ```

-## Resources
+</hfoption>
+<hfoption id="AutoModel">

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+```python
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
+model = AutoModelForMaskedLM.from_pretrained("allenai/longformer-base-4096")
+
+text = (
+"""
+San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee.
+Spencer, a fifth-year pro, will be placed on injured reserve soon after undergoing surgery Wednesday to repair the ligament. He injured his knee late in the 49ers’ road victory at Seattle on Sept. 14, and missed last week’s victory over Detroit.
+Tarell Brown and Donald Strickland will compete to replace Spencer with the 49ers, who kept 12 defensive backs on their 53-man roster to start the season. Brown, a second-year pro, got his first career interception last weekend while filling in for Strickland, who also sat out with a knee injury.
+"""
+)
+
+input_ids = tokenizer([text], return_tensors="pt")["input_ids"]
+logits = model(input_ids).logits
+
+masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+probs = logits[0, masked_index].softmax(dim=0)
+values, predictions = probs.topk(5)
+tokenizer.decode(predictions).split()
+```
+
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+
+## Notes
+
+- Longformer is based on [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta) and doesn't have `token_type_ids`. You don't need to indicate which token belongs to which segment. You only need to separate the segments with the separation token `</s>` or `tokenizer.sep_token`.
+- You can set which tokens can attend locally and which tokens attend globally with the `global_attention_mask` at inference (see this [example](https://huggingface.co/docs/transformers/en/model_doc/longformer#transformers.LongformerModel.forward.example) for more details). A value of `0` means a token attends locally and a value of `1` means a token attends globally.
+- [`LongformerForMaskedLM`] is trained like [`RobertaForMaskedLM`] and should be used as shown below.
+
+    ```py
+    input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
+    mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
+    loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
+    ```

 ## LongformerConfig

@ -139,9 +140,6 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]

 [[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput

-<frameworkcontent>
-<pt>
-
 ## LongformerModel

 [[autodoc]] LongformerModel
@ -149,45 +147,42 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]

 ## LongformerForMaskedLM

-[[autodoc]] LongformerForMaskedLM
+[[autodoc]] LongformerForMaskedLM 
    - forward

 ## LongformerForSequenceClassification

-[[autodoc]] LongformerForSequenceClassification
+[[autodoc]] LongformerForSequenceClassification 
    - forward

 ## LongformerForMultipleChoice

-[[autodoc]] LongformerForMultipleChoice
+[[autodoc]] LongformerForMultipleChoice 
    - forward

 ## LongformerForTokenClassification

-[[autodoc]] LongformerForTokenClassification
+[[autodoc]] LongformerForTokenClassification 
    - forward

 ## LongformerForQuestionAnswering

-[[autodoc]] LongformerForQuestionAnswering
+[[autodoc]] LongformerForQuestionAnswering 
    - forward

-</pt>
-<tf>
-
 ## TFLongformerModel

-[[autodoc]] TFLongformerModel
+[[autodoc]] TFLongformerModel    
    - call

 ## TFLongformerForMaskedLM

-[[autodoc]] TFLongformerForMaskedLM
+[[autodoc]] TFLongformerForMaskedLM 
    - call

 ## TFLongformerForQuestionAnswering

-[[autodoc]] TFLongformerForQuestionAnswering
+[[autodoc]] TFLongformerForQuestionAnswering 
    - call

 ## TFLongformerForSequenceClassification
@ -197,13 +192,10 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]

 ## TFLongformerForTokenClassification

-[[autodoc]] TFLongformerForTokenClassification
+[[autodoc]] TFLongformerForTokenClassification 
    - call

 ## TFLongformerForMultipleChoice

-[[autodoc]] TFLongformerForMultipleChoice
+[[autodoc]] TFLongformerForMultipleChoice 
    - call
-
-</tf>
-</frameworkcontent>
--- a/docs/source/en/model_doc/mbart.md
+++ b/docs/source/en/model_doc/mbart.md
@ -14,154 +14,105 @@ rendered properly in your Markdown viewer.

 -->

-# MBart and MBart-50
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat">
+    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
 </div>

+# mBART

-## Overview of MBart
+[mBART](https://huggingface.co/papers/2001.08210) is a multilingual machine translation model that pretrains the entire translation model (encoder-decoder) unlike previous methods that only focused on parts of the model. The model is trained on a denoising objective which reconstructs the corrupted text. This allows mBART to handle the source language and the target text to translate to.

-The MBart model was presented in [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan
-Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+[mBART-50](https://huggingface.co/papers/2008.00401) is pretrained on an additional 25 languages.

-According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
-corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
-sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
-on the encoder, decoder, or reconstructing parts of the text.
+You can find all the original mBART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=mbart) organization.

-This model was contributed by [valhalla](https://huggingface.co/valhalla). The Authors' code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/mbart)
+> [!TIP]
+> Click on the mBART models in the right sidebar for more examples of applying mBART to different language tasks.

-### Training of MBart
+The example below demonstrates how to translate text with [`Pipeline`] or the [`AutoModel`] class.

-MBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for translation task. As the
-model is multilingual it expects the sequences in a different format. A special language id token is added in both the
-source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
-target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text`
-keyword, and target text format passed with the `text_label` keyword argument.
+```py
+import torch
+from transformers import pipeline

- Supervised training
-
-```python
->>> from transformers import MBartForConditionalGeneration, MBartTokenizer
-
->>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
->>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
->>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-
->>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
-
->>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
->>> # forward pass
->>> model(**inputs)
+pipeline = pipeline(
+    task="translation",
+    model="facebook/mbart-large-50-many-to-many-mmt",
+    device=0,
+    torch_dtype=torch.float16,
+    src_lang="en_XX",
+    tgt_lang="fr_XX",
+)
+print(pipeline("UN Chief Says There Is No Military Solution in Syria"))
 ```

- Generation
+</hfoption>
+<hfoption id="AutoModel">

-  While generating the target text set the `decoder_start_token_id` to the target language id. The following
-  example shows how to translate English to Romanian using the *facebook/mbart-large-en-ro* model.
+```py
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

-```python
->>> from transformers import MBartForConditionalGeneration, MBartTokenizer
+article_en = "UN Chief Says There Is No Military Solution in Syria"

->>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
->>> article = "UN Chief Says There Is No Military Solution in Syria"
->>> inputs = tokenizer(article, return_tensors="pt")
->>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
->>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-"Şeful ONU declară că nu există o soluţie militară în Siria"
+model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+
+tokenizer.src_lang = "en_XX"
+encoded_hi = tokenizer(article_en, return_tensors="pt").to("cuda")
+generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"], cache_implementation="static")
+print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
 ```

-## Overview of MBart-50
+</hfoption>
+</hfoptions>

-MBart-50 was introduced in the [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
-Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original *mbart-large-cc25* checkpoint by extending
-its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
-languages.
+## Notes

-According to the abstract
+- You can check the full list of language codes via `tokenizer.lang_code_to_id.keys()`.
+- mBART requires a special language id token in the source and target text during training. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The target text format is `[tgt_lang_code] X [eos]`. The `bos` token is never used. The [`~PreTrainedTokenizerBase.__call__`] encodes the source text format passed as the first argument or with the `text` keyword. The target text format is passed with the `text_target` keyword.
+- Set the `decoder_start_token_id` to the target language id for mBART.

-*Multilingual translation models can be created through multilingual finetuning. Instead of finetuning on one
-direction, a pretrained model is finetuned on many directions at the same time. It demonstrates that pretrained models
-can be extended to incorporate additional languages without loss of performance. Multilingual finetuning improves on
-average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while
-improving 9.3 BLEU on average over bilingual baselines from scratch.*
+    ```py
+    import torch
+    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

+    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-en-ro", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")

-### Training of MBart-50
+    article = "UN Chief Says There Is No Military Solution in Syria"
+    inputs = tokenizer(article, return_tensors="pt")

-The text format for MBart-50 is slightly different from mBART. For MBart-50 the language id token is used as a prefix
-for both source and target text i.e the text format is `[lang_code] X [eos]`, where `lang_code` is source
-language id for source text and target language id for target text, with `X` being the source or target text
-respectively.
+    translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
+    tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    ```

+- mBART-50 has a different text format. The language id token is used as the prefix for the source and target text. The text format is `[lang_code] X [eos]` where `lang_code` is the source language id for the source text and target language id for the target text. `X` is the source or target text respectively.
+- Set the `eos_token_id` as the `decoder_start_token_id` for mBART-50. The target language id is used as the first generated token by passing `forced_bos_token_id` to [`~GenerationMixin.generate`].

-MBart-50 has its own tokenizer [`MBart50Tokenizer`].
+    ```py
+    import torch
+    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

-  Supervised training
+    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

-```python
-from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+    article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
+    tokenizer.src_lang = "ar_AR"

-model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
-tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-
-src_text = " UN Chief Says There Is No Military Solution in Syria"
-tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
-
-model(**model_inputs)  # forward pass
-```
-
- Generation
-
-  To generate using the mBART-50 multilingual translation models, `eos_token_id` is used as the
-  `decoder_start_token_id` and the target language id is forced as the first generated token. To force the
-  target language id as the first generated token, pass the *forced_bos_token_id* parameter to the *generate* method.
-  The following example shows how to translate between Hindi to French and Arabic to English using the
-  *facebook/mbart-50-large-many-to-many* checkpoint.
-
-```python
-from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
-article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
-
-model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-
-# translate Hindi to French
-tokenizer.src_lang = "hi_IN"
-encoded_hi = tokenizer(article_hi, return_tensors="pt")
-generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
-tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
-
-# translate Arabic to English
-tokenizer.src_lang = "ar_AR"
-encoded_ar = tokenizer(article_ar, return_tensors="pt")
-generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
-tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-# => "The Secretary-General of the United Nations says there is no military solution in Syria."
-```
-
-## Documentation resources
-
- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+    encoded_ar = tokenizer(article_ar, return_tensors="pt")
+    generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
+    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    ```

 ## MBartConfig

@ -253,4 +204,4 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    - decode

 </jax>
-</frameworkcontent>
+</frameworkcontent>
--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -13,166 +13,117 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # Phi

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[Phi](https://huggingface.co/papers/2306.11644) is a 1.3B parameter transformer model optimized for Python code generation. It focuses on "textbook-quality" training data of code examples, exercises and synthetic Python problems rather than scaling the model size or compute.

-## Overview
+You can find all the original Phi checkpoints under the [Phi-1](https://huggingface.co/collections/microsoft/phi-1-6626e29134744e94e222d572) collection.

-The Phi-1 model was proposed in [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li.
+> [!TIP]
+> Click on the Phi models in the right sidebar for more examples of how to apply Phi to different language tasks.

-The Phi-1.5 model was proposed in [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`] and from the command line.

-### Summary
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-In Phi-1 and Phi-1.5 papers, the authors showed how important the quality of the data is in training relative to the model size.
-They selected high quality "textbook" data alongside with synthetically generated data for training their small sized Transformer
-based model Phi-1 with 1.3B parameters. Despite this small scale, phi-1 attains pass@1 accuracy 50.6% on HumanEval and 55.5% on MBPP.
-They follow the same strategy for Phi-1.5 and created another 1.3B parameter model with performance on natural language tasks comparable
-to models 5x larger, and surpassing most non-frontier LLMs. Phi-1.5 exhibits many of the traits of much larger LLMs such as the ability
-to “think step by step” or perform some rudimentary in-context learning.
-With these two experiments the authors successfully showed the huge impact of quality of training data when training machine learning models.
+```py
+import torch
+from transformers import pipeline

-The abstract from the Phi-1 paper is the following:
+pipeline = pipeline(task="text-generation", model="microsoft/phi-1_5", device=0, torch_dtype=torch.bfloat16)
+pipeline('''def print_prime(n): """ Print all primes between 1 and n"""''')

-*We introduce phi-1, a new large language model for code, with significantly smaller size than
-competing models: phi-1 is a Transformer-based model with 1.3B parameters, trained for 4 days on
-8 A100s, using a selection of “textbook quality” data from the web (6B tokens) and synthetically
-generated textbooks and exercises with GPT-3.5 (1B tokens). Despite this small scale, phi-1 attains
-pass@1 accuracy 50.6% on HumanEval and 55.5% on MBPP. It also displays surprising emergent
-properties compared to phi-1-base, our model before our finetuning stage on a dataset of coding
-exercises, and phi-1-small, a smaller model with 350M parameters trained with the same pipeline as
-phi-1 that still achieves 45% on HumanEval.*
-
-The abstract from the Phi-1.5 paper is the following:
-
-*We continue the investigation into the power of smaller Transformer-based language models as
-initiated by TinyStories – a 10 million parameter model that can produce coherent English – and
-the follow-up work on phi-1, a 1.3 billion parameter model with Python coding performance close
-to the state-of-the-art. The latter work proposed to use existing Large Language Models (LLMs) to
-generate “textbook quality” data as a way to enhance the learning process compared to traditional
-web data. We follow the “Textbooks Are All You Need” approach, focusing this time on common
-sense reasoning in natural language, and create a new 1.3 billion parameter model named phi-1.5,
-with performance on natural language tasks comparable to models 5x larger, and surpassing most
-non-frontier LLMs on more complex reasoning tasks such as grade-school mathematics and basic
-coding. More generally, phi-1.5 exhibits many of the traits of much larger LLMs, both good –such
-as the ability to “think step by step” or perform some rudimentary in-context learning– and bad,
-including hallucinations and the potential for toxic and biased generations –encouragingly though, we
-are seeing improvement on that front thanks to the absence of web data. We open-source phi-1.5 to
-promote further research on these urgent topics.*
-
-This model was contributed by [Susnato Dhar](https://huggingface.co/susnato).
-
-The original code for Phi-1, Phi-1.5 and Phi-2 can be found [here](https://huggingface.co/microsoft/phi-1), [here](https://huggingface.co/microsoft/phi-1_5) and [here](https://huggingface.co/microsoft/phi-2), respectively.
-
-## Usage tips
-
- This model is quite similar to `Llama` with the main difference in [`PhiDecoderLayer`], where they used [`PhiAttention`] and [`PhiMLP`] layers in parallel configuration.
- The tokenizer used for this model is identical to the [`CodeGenTokenizer`].
-
-## How to use Phi-2
-
-<Tip warning={true}>
-
-Phi-2 has been integrated in the development version (4.37.0.dev) of `transformers`. Until the official version is released through `pip`, ensure that you are doing one of the following:
-
-* When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
-
-* Update your local `transformers` to the development version: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`. The previous command is an alternative to cloning and installing from the source.
-
-</Tip>
-
-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
-
->>> model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
-
->>> inputs = tokenizer('Can you help me write a formal email to a potential business partner proposing a joint venture?', return_tensors="pt", return_attention_mask=False)
-
->>> outputs = model.generate(**inputs, max_length=30)
->>> text = tokenizer.batch_decode(outputs)[0]
->>> print(text)
-Can you help me write a formal email to a potential business partner proposing a joint venture?
-Input: Company A: ABC Inc.
-Company B
 ```

-### Example :
+</hfoption>

-```python
->>> from transformers import PhiForCausalLM, AutoTokenizer
+<hfoption id="AutoModel">

->>> # define the model and tokenizer.
->>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5")
->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM

->>> # feel free to change the prompt to your liking.
->>> prompt = "If I were an AI that had just achieved"
+tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
+model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")

->>> # apply the tokenizer.
->>> tokens = tokenizer(prompt, return_tensors="pt")
+input_ids = tokenizer('''def print_prime(n):
+   """
+   Print all primes between 1 and n
+   """''', return_tensors="pt").to("cuda")

->>> # use the model to generate new tokens.
->>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
-
->>> tokenizer.batch_decode(generated_output)[0]
-'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

-## Combining Phi and Flash Attention 2
-
-First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+</hfoption>
+<hfoption id="transformers-cli">

 ```bash
-pip install -U flash-attn --no-build-isolation
+echo -e "'''def print_prime(n): \"\"\" Print all primes between 1 and n\"\"\"'''" | transformers-cli run --task text-generation --model microsoft/phi-1_5 --device 0
 ```

-Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+</hfoption>
+</hfoptions>

-To load and run a model using Flash Attention 2, refer to the snippet below:
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-```python
->>> import torch
->>> from transformers import PhiForCausalLM, AutoTokenizer
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bit.

->>> # define the model and tokenizer and push the model and tokens to the GPU.
->>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")  # doctest: +SKIP
->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

->>> # feel free to change the prompt to your liking.
->>> prompt = "If I were an AI that had just achieved"
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
+tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
+model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa", quantization_config=bnb_config)

->>> # apply the tokenizer.
->>> tokens = tokenizer(prompt, return_tensors="pt").to("cuda")
+input_ids = tokenizer('''def print_prime(n):
+   """
+   Print all primes between 1 and n
+   """''', return_tensors="pt").to("cuda")

->>> # use the model to generate new tokens.
->>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)  # doctest: +SKIP
-
->>> tokenizer.batch_decode(generated_output)[0]  # doctest: +SKIP
-'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

-### Expected speedups
+## Notes

-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `microsoft/phi-1` checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048.
+- If you're using Transformers < 4.37.0.dev, set `trust_remote_code=True` in [`~AutoModel.from_pretrained`]. Otherwise, make sure you update Transformers to the latest stable version.

-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/phi_1_speedup_plot.jpg">
-</div>
+    ```py
+    import torch
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    
+    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
+    model = AutoModelForCausalLM.from_pretrained(
+        "microsoft/phi-1",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True,
+        attn_implementation="sdpa")
+    
+    input_ids = tokenizer('''def print_prime(n):
+       """
+       Print all primes between 1 and n
+       """''', return_tensors="pt").to("cuda")
+    
+    output = model.generate(**input_ids, cache_implementation="static")
+    print(tokenizer.decode(output[0], skip_special_tokens=True))
+    ```

 ## PhiConfig

 [[autodoc]] PhiConfig

-<frameworkcontent>
-<pt>
-
 ## PhiModel

 [[autodoc]] PhiModel
@ -193,6 +144,3 @@ Below is an expected speedup diagram that compares pure inference time between t

 [[autodoc]] PhiForTokenClassification
    - forward
-
-</pt>
-</frameworkcontent>
--- a/docs/source/en/model_doc/phi4_multimodal.md
+++ b/docs/source/en/model_doc/phi4_multimodal.md
@ -64,7 +64,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-).to(device, torch.float16)
+).to(device)

 # Generate response
 generate_ids = model.generate(
@ -98,8 +98,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    sample_rate=sample_rate,
-).to(device, torch.float16)
+).to(device)

 generate_ids = model.generate(
    **inputs,
--- a/docs/source/en/model_doc/siglip.md
+++ b/docs/source/en/model_doc/siglip.md
@ -14,184 +14,116 @@ rendered properly in your Markdown viewer.

 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+            <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+            <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+            <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
 # SigLIP

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[SigLIP](https://huggingface.co/papers/2303.15343) is a multimodal image-text model similar to [CLIP](clip). It uses separate image and text encoders to generate representations for both modalities.

-## Overview
+Unlike CLIP, SigLIP employs a pairwise sigmoid loss on image-text pairs during training. This training loss eliminates the need for a global view of all pairwise similarities between images and texts within a batch. Consequently, it enables more efficient scaling to larger batch sizes while also delivering superior performance with smaller batch sizes.

-The SigLIP model was proposed in [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer. SigLIP proposes to replace the loss function used in [CLIP](clip) by a simple pairwise sigmoid loss. This results in better performance in terms of zero-shot classification accuracy on ImageNet.
+You can find all the original SigLIP checkpoints under the [SigLIP](https://huggingface.co/collections/google/siglip-659d5e62f0ae1a57ae0e83ba) collection.

-The abstract from the paper is the following:

-*We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size from the loss further allows us to study the impact of examples vs pairs and negative to positive ratio. Finally, we push the batch size to the extreme, up to one million, and find that the benefits of growing batch size quickly diminish, with a more reasonable batch size of 32k being sufficient.*
+> [!TIP]
+> Click on the SigLIP models in the right sidebar for more examples of how to apply SigLIP to different image and text tasks.

-## Usage tips
+The example below demonstrates how to generate similarity scores between texts and image(s) with [`Pipeline`] or the [`AutoModel`] class.

- Usage of SigLIP is similar to [CLIP](clip). The main difference is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
- Training is supported but does not use `torch.distributed` utilities which may limit the scalability of batch size. However, DDP and FDSP works on single-node multi-gpu setup.
- When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` as that's how the model was trained.
- To get the same results as the pipeline, a prompt template of "This is a photo of {label}." should be used.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
-alt="drawing" width="600"/>
+```py
+import torch
+from transformers import pipeline

-<small> SigLIP evaluation results compared to CLIP. Taken from the <a href="https://arxiv.org/abs/2303.15343">original paper</a>.</small>
+image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]

-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/google-research/big_vision/tree/main).
-
-## Usage example
-
-There are 2 main ways to use SigLIP: either using the pipeline API, which abstracts away all the complexity for you, or by using the `SiglipModel` class yourself.
-
-### Pipeline API
-
-The pipeline allows to use the model in a few lines of code:
-
-```python
->>> from transformers import pipeline
->>> from PIL import Image
->>> import requests
-
->>> # load pipe
->>> image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")
-
->>> # load image
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
->>> image = Image.open(requests.get(url, stream=True).raw)
-
->>> # inference
->>> candidate_labels = ["2 cats", "a plane", "a remote"]
->>> outputs = image_classifier(image, candidate_labels=candidate_labels)
->>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
->>> print(outputs)
-[{'score': 0.1979, 'label': '2 cats'}, {'score': 0.0, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
+pipeline = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224", device=0, torch_dtype=torch.bfloat16)
+pipeline(image, candidate_labels=candidate_labels)
 ```

-### Using the model yourself
+</hfoption>
+<hfoption id="AutoModel">

-If you want to do the pre- and postprocessing yourself, here's how to do that:
+```py
+import torch
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModel

-```python
->>> from PIL import Image
->>> import requests
->>> from transformers import AutoProcessor, AutoModel
->>> import torch
+model = AutoModel.from_pretrained("google/siglip-base-patch16-224", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
+processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

->>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
->>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
+texts = [f'This is a photo of {label}.' for label in candidate_labels]
+inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to("cuda")

->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
->>> image = Image.open(requests.get(url, stream=True).raw)
+with torch.no_grad():
+    outputs = model(**inputs)

->>> candidate_labels = ["2 cats", "2 dogs"]
-# follows the pipeline prompt template to get same results
->>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
-# important: we pass `padding=max_length` since the model was trained with this
->>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
-
->>> with torch.no_grad():
-...     outputs = model(**inputs)
-
->>> logits_per_image = outputs.logits_per_image
->>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
->>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
-19.8% that image 0 is '2 cats'
+logits_per_image = outputs.logits_per_image
+probs = torch.sigmoid(logits_per_image)
+print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
 ```

-## Resources
+</hfoption>
+</hfoptions>

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP.
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification)
- Demo notebooks for SigLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP). 🌎
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.

-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+```py
+import torch
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig

+bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+model = AutoModel.from_pretrained("google/siglip-base-patch16-224", quantization_config=bnb_config, device_map="auto", attn_implementation="sdpa")
+processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

-## Combining SigLIP and Flash Attention 2
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
+texts = [f'This is a photo of {label}.' for label in candidate_labels]
+inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to("cuda")

-First, make sure to install the latest version of Flash Attention 2.
+with torch.no_grad():
+    outputs = model(**inputs)

-```bash
-pip install -U flash-attn --no-build-isolation
+logits_per_image = outputs.logits_per_image
+probs = torch.sigmoid(logits_per_image)
+print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
 ```
+## Notes

-Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+- Training is supported for DDP and FSDP on single-node multi-GPU setups. However, it does not use [torch.distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) utilities which may limit the scalability of batch size.
+- When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` because that is how the model was trained.
+- To get the same results as the [`Pipeline`], a prompt template of `"This is a photo of {label}."` should be passed to the processor.
+- Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention.
+    ```py
+    # pip install -U flash-attn --no-build-isolation

-To load and run a model using Flash Attention 2, refer to the snippet below:
+    import torch
+    from transformers import SiglipModel

-```python
->>> import torch
->>> import requests
->>> from PIL import Image
->>> from transformers import SiglipProcessor, SiglipModel
->>> device = "cuda" # the device to load the model onto
-
->>> model = SiglipModel.from_pretrained(
-...     "google/siglip-so400m-patch14-384",
-...     attn_implementation="flash_attention_2",
-...     torch_dtype=torch.float16,
-...     device_map=device,
-... )
->>> processor = SiglipProcessor.from_pretrained("google/siglip-so400m-patch14-384")
-
->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
->>> image = Image.open(requests.get(url, stream=True).raw)
-
->>> candidate_labels = ["2 cats", "2 dogs"]
-# follows the pipeline prompt template to get same results
->>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
-# important: we pass `padding=max_length` since the model was trained with this
->>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
-
->>> with torch.no_grad():
-...     with torch.autocast(device):
-...         outputs = model(**inputs)
-
->>> logits_per_image = outputs.logits_per_image
->>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
->>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
-19.8% that image 0 is '2 cats'
-```
-
-
-## Using Scaled Dot Product Attention (SDPA)
-
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
-
-You may set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. Make sure you have `torch>=2.1.1`.
-
-```python
->>> from transformers import SiglipModel
-
->>> model = SiglipModel.from_pretrained(
-...     "google/siglip-so400m-patch14-384",
-...     attn_implementation="sdpa",
-...     torch_dtype=torch.float16,
-...     device_map=device,
-... )
-```
-
-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
-
-
-## Expected speedups
-
-Below is an expected speedup diagram that compares inference time between the native implementation in transformers using `google/siglip-so400m-patch14-384` checkpoint in `float16` precision and the Flash Attention 2 / SDPA version of the model using different batch sizes.
-
-<div style="text-align: center">
-<img src="https://i.imgur.com/cWm4rsn.png">
-</div>
+    model = SiglipModel.from_pretrained(
+        "google/siglip-so400m-patch14-384",
+        attn_implementation="flash_attention_2",
+        torch_dtype=torch.float16,
+        device_map="auto",
+    )
+    ```


 ## SiglipConfig
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -244,7 +244,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m

 ### Benchmarks

-FlashAttention2 speeds up inference considerably especially for inputs with long sequences. However, since FlashAttention2 doesn't support computing attention scores with padding tokens, you must manually pad and unpad the attention scores for batched inference if a sequence contains padding tokens. The downside is batched generation is slower with padding tokens. 
+FlashAttention2 speeds up inference considerably especially for inputs with long sequences. However, since FlashAttention2 doesn't support computing attention scores with padding tokens, you must manually pad and unpad the attention scores for batched inference if a sequence contains padding tokens. The downside is batched generation is slower with padding tokens.

 <hfoptions id="padded">
 <hfoption id="short sequence length">
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@ -111,7 +111,7 @@ This approach optimizes parallel data processing by reducing idle GPU utilizatio

 Data, pipeline and model parallelism combine to form [3D parallelism](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) to optimize memory and compute efficiency.

-Memory effiiciency is achieved by splitting the model across GPUs and also dividing it into stages to create a pipeline. This allows GPUs to work in parallel on micro-batches of data, reducing the memory usage of the model, optimizer, and activations.
+Memory efficiency is achieved by splitting the model across GPUs and also dividing it into stages to create a pipeline. This allows GPUs to work in parallel on micro-batches of data, reducing the memory usage of the model, optimizer, and activations.

 Compute efficiency is enabled by ZeRO data parallelism where each GPU only stores a slice of the model, optimizer, and activations. This allows higher communication bandwidth between data parallel nodes because communication can occur independently or in parallel with the other pipeline stages.

--- a/docs/source/en/quantization/auto_round.md
+++ b/docs/source/en/quantization/auto_round.md
@ -0,0 +1,286 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# AutoRound
+
+[AutoRound](https://github.com/intel/auto-round) is an advanced quantization algorithm that delivers strong accuracy, even at 2-bit precision. 
+It leverages sign gradient descent to fine-tune both rounding values and min-max clipping thresholds in just 200 steps. Designed for broad compatibility, it seamlessly supports a wide range of LLMs and is actively expanding to cover more VLMs as well. 
+It also supports quantization and inference across multiple hardware platforms, including CPU, XPU, and CUDA.
+
+AutoRound also offers a variety of useful features, including mixed-bit tuning and inference, lm-head quantization, support for exporting to formats like GPTQ/AWQ/GGUF, and flexible tuning recipes. 
+For a comprehensive overview and the latest updates, check out the AutoRound [README](https://github.com/intel/auto-round).
+
+AutoRound was originally developed as part of the [Intel Neural Compressor](https://github.com/intel/neural-compressor), a general-purpose model compression library for deep learning.
+It has since evolved into a standalone library focused specifically on low-precision optimization for large language models (LLMs). 
+AutoRound remains fully integrated with the Intel Neural Compressor, and you can explore the repository for more details.
+
+
+## Installation
+
+```bash
+pip install auto-round
+```
+
+## Supported Quantization Configurations
+
+AutoRound supports several quantization configurations:
+
+- **Int8 Weight Only**
+- **Int4 Weight Only**
+- **Int3 Weight Only**
+- **Int2 Weight Only**
+- **Mixed bits Weight only**
+
+## Hardware Compatibility
+
+CPU, XPU, and CUDA for both quantization and inference.
+
+## Quantization and Serialization (offline)
+
+Currently, only offline mode is supported to generate quantized models.
+
+<hfoptions id="quantization">
+<hfoption id="quantization cmd">
+
+### Command Line Usage
+```bash
+auto-round \
+    --model facebook/opt-125m \
+    --bits 4 \
+    --group_size 128 \
+    --output_dir ./tmp_autoround
+```
+
+AutoRound also offers two other recipes, `auto-round-best` and `auto-round-light`, designed for optimal accuracy and improved speed, respectively.
+For 2 bits, we recommend using `auto-round-best` or `auto-round`.
+</hfoption>
+
+<hfoption id="quantization auto-round api">
+
+### AutoRound API Usage
+This setting offers a better trade-off between accuracy and tuning cost, and is recommended in all scenarios.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round import AutoRound
+
+model_name = "facebook/opt-125m"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+bits, group_size, sym = 4, 128, True
+# mixed bits config
+# layer_config = {"model.decoder.layers.6.self_attn.out_proj": {"bits": 2, "group_size": 32}}
+autoround = AutoRound(
+    model,
+    tokenizer,
+    bits=bits,
+    group_size=group_size,
+    sym=sym,
+    # enable_torch_compile=True,
+    # layer_config=layer_config,
+)
+
+output_dir = "./tmp_autoround"
+# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
+autoround.quantize_and_save(output_dir, format='auto_round') 
+```
+
+</hfoption>
+
+<hfoption id="quantization auto-round-best">
+
+### AutoRoundBest recipe
+This setting provides the best accuracy in most scenarios but is 4–5× slower than the standard AutoRound recipe. It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available.
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round import AutoRound
+
+model_name = "facebook/opt-125m"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+bits, group_size, sym = 4, 128, True
+autoround = AutoRound(
+    model,
+    tokenizer,
+    bits=bits,
+    group_size=group_size,
+    sym=sym,
+    nsamples=512,
+    iters=1000,
+    low_gpu_mem_usage=True
+)
+
+output_dir = "./tmp_autoround"
+autoround.quantize_and_save(output_dir, format='auto_round') 
+```
+</hfoption>
+
+<hfoption id="quantization auto-round-light">
+
+### AutoRoundLight recipe
+This setting offers the best speed (2 - 3X faster than AutoRound), but it may cause a significant accuracy drop for small models and 2-bit quantization. It is recommended for 4-bit settings and models larger than 3B.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round import AutoRound
+
+model_name = "facebook/opt-125m"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+bits, group_size, sym = 4, 128, True
+autoround = AutoRound(
+    model,
+    tokenizer,
+    bits=bits,
+    group_size=group_size,
+    sym=sym,
+    iters=50,
+    lr=5e-3,
+)
+
+output_dir = "./tmp_autoround"
+autoround.quantize_and_save(output_dir, format='auto_round') 
+```
+
+</hfoption>
+
+</hfoptions>
+
+W4G128 average accuracy across 13 tasks (mmlu-pro, if_eval, gsm8k, etc.) and time cost results (testing was conducted on an NVIDIA A100 80G using PyTorch 2.6.0 with `enable_torch_compile`):
+
+| Model   | Qwen2.5-0.5B-Instruct | Falcon3-3B    | Qwen2.5-7B-Instruct | Meta-Llama-3.1-8B-Instruct | Falcon3-10B   | Qwen2.5-72B-Instruct |
+|---------|--------------------|---------------|------------------|----------------------------|---------------|-------------------|
+| 16bits  | 0.4192             | 0.5203        | 0.6470           | 0.6212                     | 0.6151        | 0.7229            |
+| Best    | **0.4137**(7m)     | **0.5142**(23m) | 0.6426(58m)      | **0.6116**(65m)            | **0.6092**(81m) | 0.7242(575m)      |
+| Default | 0.4129(2m)         | 0.5133(6m)    | 0.6441(13m)      | 0.6106(13m)                | 0.6080(18m)   | **0.7252**(118m)  |
+| Light   | 0.4052(2m)         | 0.5108(3m)    | **0.6453**(5m)   | 0.6104(6m)                 | 0.6063(6m)    | 0.7243(37m)       |
+
+## Inference
+
+AutoRound automatically selects the best available backend based on the installed libraries and prompts the user to install additional libraries when a better backend is found.
+<hfoptions id="inference">
+<hfoption id="inference cpu">
+
+### CPU
+
+Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX) for 4 bits inference.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
+```
+
+</hfoption>
+
+<hfoption id="inference xpu">
+
+### XPU
+
+Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for inference.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
+```
+
+</hfoption>
+
+<hfoption id="inference cuda">
+
+### CUDA
+
+Supports 2, 3, 4, and 8 bits. We recommend using GPTQModel for 4 and 8 bits inference.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
+```
+
+</hfoption>
+
+<hfoption id="inference backend">
+
+### Specify Inference Backend
+
+AutoRound automatically selects the backend for each layer based on compatibility. In general, the priority order is Marlin > ExLLaMAV2 > Triton, but the final choice depends on factors such as group size, bit width, packing format, hardware device, and other implementation details. For more details, please refer to [backends](https://github.com/intel/auto-round?tab=readme-ov-file#specify-backend).
+
+The backend may not always be the most suitable for certain devices. 
+You can specify your preferred backend such as "ipex" for CPU and XPU, "marlin/exllamav2/triton" for CUDA, according to your needs or hardware compatibility. Please note that additional corresponding libraries may be required.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
+
+model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
+quantization_config = AutoRoundConfig(backend="ipex")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
+```
+
+</hfoption>
+
+
+<hfoption id="format convert">
+
+### Convert GPTQ/AWQ to AutoRound
+
+Most GPTQ/AWQ models can be converted to the AutoRound format for better compatibility and support with Intel devices. Please note that the quantization config will be changed if the model is serialized.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
+
+model_name = "ybelkada/opt-125m-gptq-4bit"
+quantization_config = AutoRoundConfig()
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
+```
+
+</hfoption>
+
+</hfoptions>
+
+## Issues
+
+If you encounter any issues with the transformers integration, please open an issue on
+the [transformers](https://github.com/huggingface/transformers/issues) repository.  
+If you encounter any issues with auto-round, please open an issue on
+the [AutoRound](https://github.com/intel/auto-round/issues) repository.
+
+
+## Acknowledgement
+Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
+
+## Contribution
+Contributions to [AutoRound](https://github.com/intel/auto-round/pulls) are welcome and greatly appreciated!
+Whether it's fixing bugs, improving documentation, adding new features, or suggesting improvements, your help is always valued.
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@ -14,13 +14,21 @@ rendered properly in your Markdown viewer.

 -->

-# bitsandbytes
+# Bitsandbytes

-[bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) features the LLM.int8 and QLoRA quantization to enable accessible large language model inference and training.
+The [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) library provides quantization tools for LLMs through a lightweight Python wrapper around CUDA functions. It enables working with large models using limited computational resources by reducing their memory footprint.

-[LLM.int8()](https://hf.co/papers/2208.07339) is a quantization method that aims to make large language model inference more accessible without significant degradation. Unlike naive 8-bit quantization, which can result in loss of critical information and accuracy, LLM.int8() dynamically adapts to ensure sensitive components of the computation retain higher precision when needed.
+At its core, bitsandbytes provides:

-QLoRA, or 4-bit quantization, compresses a model even further to 4-bits and inserts a small set of trainable low-rank adaptation (LoRA) weights to allowing training. 
+- **Quantized Linear Layers**: `Linear8bitLt` and `Linear4bit` layers that replace standard PyTorch linear layers with memory-efficient quantized alternatives
+- **Optimized Optimizers**: 8-bit versions of common optimizers through its `optim` module, enabling training of large models with reduced memory requirements
+- **Matrix Multiplication**: Optimized matrix multiplication operations that leverage the quantized format
+
+bitsandbytes offers two main quantization features:
+
+1. **LLM.int8()** - An 8-bit quantization method that makes inference more accessible without significant performance degradation. Unlike naive quantization, [LLM.int8()](https://hf.co/papers/2208.07339) dynamically preserves higher precision for critical computations, preventing information loss in sensitive parts of the model.
+
+2. **QLoRA** - A 4-bit quantization technique that compresses models even further while maintaining trainability by inserting a small set of trainable low-rank adaptation (LoRA) weights.

 > **Note:** For a user-friendly quantization experience, you can use the `bitsandbytes` [community space](https://huggingface.co/spaces/bnb-community/bnb-my-repo).

@ -30,12 +38,38 @@ Run the command below to install bitsandbytes.
 ```bash
 pip install --upgrade transformers accelerate bitsandbytes
 ```
+To compile from source, follow the instructions in the [bitsandbytes installation guide](https://huggingface.co/docs/bitsandbytes/main/en/installation).
+
+## Hardware Compatibility
+bitsandbytes is currently only supported on CUDA GPUs for CUDA versions 11.0 - 12.8. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out the [bitsandbytes repository](https://github.com/bitsandbytes-foundation/bitsandbytes) for more information.
+
+### CUDA
+
+| Feature | Minimum Hardware Requirement |
+|---------|-------------------------------|
+| 8-bit optimizers | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
+| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs |
+| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
+
+### Multi-backend
+
+| Backend | Supported Versions | Python versions | Architecture Support | Status |
+|---------|-------------------|----------------|---------------------|---------|
+| AMD ROCm | 6.1+ | 3.10+ | minimum CDNA - gfx90a, RDNA - gfx1100 | Alpha |
+| Apple Silicon (MPS) | WIP | 3.10+ | M1/M2 chips | Planned |
+| Intel CPU | v2.4.0+ (ipex) | 3.10+ | Intel CPU | Alpha |
+| Intel GPU | v2.4.0+ (ipex) | 3.10+ | Intel GPU | Experimental |
+| Ascend NPU | 2.1.0+ (torch_npu) | 3.10+ | Ascend NPU | Experimental |
+
+> **Note:** Bitsandbytes is moving away from the multi-backend approach towards using [PyTorch Custom Operators](https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html) as the main mechanism for supporting new hardware and dispatching to the correct backend.
+
+## Quantization Examples

 Quantize a model by passing a [`BitsAndBytesConfig`] to [`~PreTrainedModel.from_pretrained`]. This works for any model in any modality, as long as it supports [Accelerate](https://huggingface.co/docs/accelerate/index) and contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers.

 <hfoptions id="bnb">
 <hfoption id="8-bit">
-
+<div class="bnb-container" style="border: 1px solid #ddd; border-radius: 8px; padding: 20px; margin: 20px 0">
 Quantizing a model in 8-bit halves the memory-usage, and for large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs.

 ```py
@ -45,6 +79,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)

 model_8bit = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b7", 
+    device_map="auto",
    quantization_config=quantization_config
 )
 ```
@ -59,6 +94,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)

 model_8bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", 
+    device_map="auto",
    quantization_config=quantization_config, 
    torch_dtype="auto"
 )
@ -74,16 +110,16 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)

 model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m", 
+    device_map="auto",
    quantization_config=quantization_config
 )
-tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

 model.push_to_hub("bloom-560m-8bit")
 ```
-
+</div>
 </hfoption>
 <hfoption id="4-bit">
-
+<div class="bnb-container" style="border: 1px solid #ddd; border-radius: 8px; padding: 20px; margin: 20px 0">
 Quantizing a model in 4-bit reduces your memory-usage by 4x, and for large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs.

 ```py
@ -93,6 +129,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)

 model_4bit = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b7",
+    device_map="auto",
    quantization_config=quantization_config
 )
 ```
@ -107,6 +144,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)

 model_4bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
+    device_map="auto",
    quantization_config=quantization_config, 
    torch_dtype="auto"
 )
@ -115,6 +153,20 @@ model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype

 Make sure you have the latest bitsandbytes version so you can serialize 4-bit models and push them to the Hub with [`~PreTrainedModel.push_to_hub`]. Use [`~PreTrainedModel.save_pretrained`] to save the 4-bit model locally.  

+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "bigscience/bloom-560m", 
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+model.push_to_hub("bloom-560m-4bit")
+```
+</div>
 </hfoption>
 </hfoptions>

--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@ -22,25 +22,26 @@ Transformers supports many quantization methods, each with their pros and cons,

 Use the Space below to help you pick a quantization method depending on your hardware and number of bits to quantize to.

-| Quantization Method                           | On the fly quantization | CPU             | CUDA GPU | ROCm GPU  | Metal (Apple Silicon)              | Intel GPU       | Torch compile() | Bits          | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support  | Link to library                             |
-|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
-| [AQLM](./aqlm)                             | 🔴                   | 🟢              |     🟢     | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/2         | 🟢               | 🟢                          | 🟢                      | https://github.com/Vahe1994/AQLM            |
-| [AWQ](./awq)                               | 🔴                   | 🟢              | 🟢        | 🟢        | 🔴                                 | 🟢              | ?               | 4             | 🟢               | 🟢                          | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
-| [bitsandbytes](./bitsandbytes)             | 🟢                   | 🟡 |     🟢     | 🟡 | 🔴                    | 🟡 | 🔴 | 4/8         | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
-| [compressed-tensors](./compressed_tensors) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8         | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
-| [EETQ](./eetq)                             | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8             | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
-| [GGUF / GGML (llama.cpp)](../gguf)         | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8         | 🔴               | [See Notes](../gguf)     | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp      |
-| [GPTQModel](./gptq)                        | 🔴                   | 🟢 | 🟢        | 🟢        | 🟢                                 | 🟢 | 🔴              | 2/3/4/8 | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
-| [AutoGPTQ](./gptq)                         | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8 | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
-| [HIGGS](./higgs)                           | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 2/4         | 🔴               | 🟢                          | 🟢                      | https://github.com/HanGuo97/flute           |       
-| [HQQ](./hqq)                               | 🟢                   | 🟢              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/8         | 🟢               | 🔴                          | 🟢                      | https://github.com/mobiusml/hqq/            |
-| [optimum-quanto](./quanto)                 | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🟢              | 2/4/8     | 🔴               | 🔴                          | 🟢                      | https://github.com/huggingface/optimum-quanto       |
-| [FBGEMM_FP8](./fbgemm_fp8)                 | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🔴              | 8             | 🔴               | 🟢                          | 🟢                      | https://github.com/pytorch/FBGEMM       |
-| [torchao](./torchao)                       | 🟢                   | 🟢               | 🟢        | 🔴        | 🟡 | 🔴              |                 | 4/8         |                  | 🟢🔴                        | 🟢                      | https://github.com/pytorch/ao       |
-| [VPTQ](./vptq)                             | 🔴                   | 🔴              |     🟢     | 🟡        | 🔴                                 | 🔴              | 🟢              | 1/8         | 🔴               | 🟢                          | 🟢                      | https://github.com/microsoft/VPTQ            |
-| [FINEGRAINED_FP8](./finegrained_fp8)                 | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🔴              | 8             | 🔴               | 🟢                          | 🟢                      |        |
-| [SpQR](./spqr)                          | 🔴                       |  🔴   | 🟢        | 🔴              |    🔴    | 🔴         |         🟢              | 3              |              🔴                     | 🟢           | 🟢                      | https://github.com/Vahe1994/SpQR/       |
-| [Quark](./quark)                           | 🔴                       | 🟢 | 🟢      | 🟢      | 🟢                   | 🟢       | ?               | 2/4/6/8/9/16 | 🔴                | 🔴                               | 🟢                       | https://quark.docs.amd.com/latest/                      |
+| Quantization Method                       | On the fly quantization | CPU             | CUDA GPU | ROCm GPU  | Metal (Apple Silicon)              | Intel GPU       | Torch compile() | Bits         | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support  | Link to library                             |
+|-------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|--------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
+| [AQLM](./aqlm)                            | 🔴                   | 🟢              |     🟢     | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/2          | 🟢               | 🟢                          | 🟢                      | https://github.com/Vahe1994/AQLM            |
+| [AutoRound](./auto_round)                 | 🔴                   | 🟢               | 🟢          |   🔴        |   🔴                                |   🟢              |   🔴               | 2/3/4/8      |    🔴              |       🟢                      |    🟢                       |      https://github.com/intel/auto-round                                       |
+| [AWQ](./awq)                              | 🔴                   | 🟢              | 🟢        | 🟢        | 🔴                                 | 🟢              | ?               | 4            | 🟢               | 🟢                          | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
+| [bitsandbytes](./bitsandbytes)            | 🟢                   | 🟡 |     🟢     | 🟡 | 🔴                    | 🟡 | 🔴 | 4/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
+| [compressed-tensors](./compressed_tensors) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
+| [EETQ](./eetq)                            | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8            | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
+| [GGUF / GGML (llama.cpp)](../gguf)        | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8          | 🔴               | [See Notes](../gguf)     | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp      |
+| [GPTQModel](./gptq)                       | 🔴                   | 🟢 | 🟢        | 🟢        | 🟢                                 | 🟢 | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
+| [AutoGPTQ](./gptq)                        | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
+| [HIGGS](./higgs)                          | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 2/4          | 🔴               | 🟢                          | 🟢                      | https://github.com/HanGuo97/flute           |       
+| [HQQ](./hqq)                              | 🟢                   | 🟢              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/8          | 🟢               | 🔴                          | 🟢                      | https://github.com/mobiusml/hqq/            |
+| [optimum-quanto](./quanto)                | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🟢              | 2/4/8        | 🔴               | 🔴                          | 🟢                      | https://github.com/huggingface/optimum-quanto       |
+| [FBGEMM_FP8](./fbgemm_fp8)                | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🔴              | 8            | 🔴               | 🟢                          | 🟢                      | https://github.com/pytorch/FBGEMM       |
+| [torchao](./torchao)                      | 🟢                   | 🟢               | 🟢        | 🔴        | 🟡 | 🔴              |                 | 4/8          |                  | 🟢🔴                        | 🟢                      | https://github.com/pytorch/ao       |
+| [VPTQ](./vptq)                            | 🔴                   | 🔴              |     🟢     | 🟡        | 🔴                                 | 🔴              | 🟢              | 1/8          | 🔴               | 🟢                          | 🟢                      | https://github.com/microsoft/VPTQ            |
+| [FINEGRAINED_FP8](./finegrained_fp8)      | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🔴              | 8            | 🔴               | 🟢                          | 🟢                      |        |
+| [SpQR](./spqr)                            | 🔴                     |  🔴   | 🟢        | 🔴              |    🔴    | 🔴         |         🟢              | 3            |              🔴                     | 🟢           | 🟢                      | https://github.com/Vahe1994/SpQR/       |
+| [Quark](./quark)                          | 🔴                     | 🟢 | 🟢      | 🟢      | 🟢                   | 🟢       | ?               | 2/4/6/8/9/16 | 🔴                | 🔴                               | 🟢                       | https://quark.docs.amd.com/latest/                      |

 ## Resources

--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -33,10 +33,11 @@ See the table below for additional torchao features.

 torchao supports the [quantization techniques](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md) below.

- A16W8 Int8 WeightOnly Quantization
- A16W4 WeightOnly Quantization
- A8W8 Int8 Dynamic Quantization
+- A8W8 Float8 Dynamic Quantization
 - A16W8 Float8 WeightOnly Quantization
+- A8W8 Int8 Dynamic Quantization
+- A16W8 Int8 Weight Only Quantization
+- A16W4 Int4 Weight Only Quantization
 - Autoquantization


@ -44,7 +45,7 @@ Check the table below to see if your hardware is compatible.

 | Component | Compatibility |
 |----------|----------------|
-| CUDA Versions | ✅ cu118, cu124, cu126, cu128 |
+| CUDA Versions | ✅ cu118, cu126, cu128 |
 | CPU | ✅ change `device_map="cpu"` (see examples below) |


@ -56,14 +57,14 @@ Install torchao from PyPi or the PyTorch index with the following commands.

 ```bash
 # Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation
-# Stable release from Pypi which will default to CUDA 12.4
+# Stable release from Pypi which will default to CUDA 12.6
 pip install --upgrade torchao transformers
 ```
 </hfoption> 
 <hfoption id="PyTorch Index">
 Stable Release from the PyTorch index
 ```bash
-pip install torchao --extra-index-url https://download.pytorch.org/whl/cu124 # options are cpu/cu118/cu124/cu126
+pip install torchao --index-url https://download.pytorch.org/whl/cu126 # options are cpu/cu118/cu126/cu128
 ```
 </hfoption>
 </hfoptions>
@ -80,15 +81,79 @@ You can manually choose the quantization types and settings or automatically sel

 Create a [`TorchAoConfig`] and specify the quantization type and `group_size` of the weights to quantize (for int8 weight only and int4 weight only). Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method.

-<hfoptions id="examples">
-<hfoption id="int8-weight-only cuda">
+We'll show examples of the recommended quantization methods for different hardware, e.g. A100 GPU, H100 GPU, and CPU.

+### H100 GPU
+<hfoptions id="examples-H100-GPU">
+<hfoption id="float8-dynamic-and-weight-only">
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
+
+quant_config = Float8DynamicActivationFloat8WeightConfig()
+# or float8 weight only quantization
+# quant_config = Float8WeightOnlyConfig()
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+<hfoption id="int4-weight-only">
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import GemliteUIntXWeightOnlyConfig
+
+# We integrated with gemlite, which optimizes for batch size N on A100 and H100
+quant_config = GemliteUIntXWeightOnlyConfig(group_size=128)
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+</hfoptions>
+
+### A100 GPU
+<hfoptions id="examples-A100-GPU">
+<hfoption id="int8-dynamic-and-weight-only">
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
 from torchao.quantization import Int8WeightOnlyConfig

-quant_config = Int8WeightOnlyConfig(group_size=128)
+quant_config = Int8DynamicActivationInt8WeightConfig()
+# or int8 weight only quantization
+# quant_config = Int8WeightOnlyConfig()
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -109,14 +174,52 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 </hfoption>

-<hfoption id="int8-weight-only cpu">
+<hfoption id="int4-weight-only">

+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import GemliteUIntXWeightOnlyConfig, Int4WeightOnlyConfig
+
+# For batch size N, we recommend gemlite, which may require autotuning
+# default is 4 bit, 8 bit is also supported by passing `bit_width=8`
+quant_config = GemliteUIntXWeightOnlyConfig(group_size=128)
+
+# For batch size 1, we also have a custom tinygemm kernel that is optimized only for this case
+# We can set `use_hqq` to `True` for better accuracy
+# quant_config = Int4WeightOnlyConfig(group_size=128, use_hqq=True)
+
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+</hfoptions>
+
+### CPU
+<hfoptions id="examples-CPU">
+<hfoption id="int8-dynamic-and-weight-only">
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
 from torchao.quantization import Int8WeightOnlyConfig

-quant_config = Int8WeightOnlyConfig(group_size=128)
+quant_config = Int8DynamicActivationInt8WeightConfig()
+# quant_config = Int8WeightOnlyConfig()
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -136,35 +239,7 @@ output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implemen
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 </hfoption>
-<hfoption id="int4-weight-only cuda">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int4WeightOnlyConfig
-
-quant_config = Int4WeightOnlyConfig(group_size=128)
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-
-<hfoption id="int4-weight-only cpu">
+<hfoption id="int4-weight-only">

 > [!TIP]
 > Run the quantized model on a CPU by changing `device_map` to `"cpu"` and `layout` to `Int4CPULayout()`.
@ -195,116 +270,6 @@ output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implemen
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 </hfoption>
-<hfoption id="int8-dynamic-quantization cuda">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig
-
-quant_config = Int8DynamicActivationInt8WeightConfig()
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-<hfoption id="int8-dynamic-quantization cpu">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig
-
-quant_config = Int8DynamicActivationInt8WeightConfig()
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="cpu",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-<hfoption id="float8-weight-only cuda">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Float8WeightOnlyConfig
-
-quant_config = Float8WeightOnlyConfig()
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-
-```
-</hfoption>
-<hfoption id="float8-weight-only cpu">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Float8WeightOnlyConfig
-
-quant_config = Float8WeightOnlyConfig()
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="cpu",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-
 </hfoptions>

 ### Autoquant
@ -313,6 +278,8 @@ If you want to automatically choose a quantization type for quantizable layers (

 The `autoquant` API automatically chooses a quantization type by micro-benchmarking on input type and shape and compiling a single linear layer.

+Note: autoquant is for GPU only right now.
+
 Create a [`TorchAoConfig`] and set to `"autoquant"`. Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method. Finally, call `finalize_autoquant` on the quantized model to finalize the quantization and log the input shapes.


@ -346,11 +313,25 @@ torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/not

 To avoid arbitrary user code execution, torchao sets `weights_only=True` in [torch.load](https://pytorch.org/docs/stable/generated/torch.load.html) to ensure only tensors are loaded. Any known user functions can be whitelisted with [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals).

+<hfoptions id="serialization-examples">
+<hfoption id="save-locally">
 ```py
 # don't serialize model with Safetensors
 output_dir = "llama3-8b-int4wo-128"
 quantized_model.save_pretrained("llama3-8b-int4wo-128", safe_serialization=False)
 ```
+</hfoption>
+<hfoption id="push-to-huggingface-hub">
+```py
+# don't serialize model with Safetensors
+USER_ID = "your_huggingface_user_id"
+REPO_ID = "llama3-8b-int4wo-128"
+quantized_model.push_to_hub(f"{USER_ID}/{REPO_ID}", safe_serialization=False)
+tokenizer.push_to_hub(f"{USER_ID}/{REPO_ID}")
+```
+</hfoption>
+</hfoptions>
+

 ## Loading quantized models

@ -486,4 +467,4 @@ Refer to [Other Available Quantization Techniques](https://github.com/pytorch/ao

 ## Issues

-If you encounter any issues with the Transformers integration, please open an issue on the [Transformers](https://github.com/huggingface/transformers/issues) repository. For issues directly related to torchao, please open an issue on the [torchao](https://github.com/pytorch/ao/issues) repository.
+If you encounter any issues with the Transformers integration, please open an issue on the [Transformers](https://github.com/huggingface/transformers/issues) repository. For issues directly related to torchao, please open an issue on the [torchao](https://github.com/pytorch/ao/issues) repository.
--- a/docs/source/ja/internal/modeling_utils.md
+++ b/docs/source/ja/internal/modeling_utils.md
@ -25,23 +25,6 @@ rendered properly in your Markdown viewer.

 [[autodoc]] pytorch_utils.Conv1D

-[[autodoc]] modeling_utils.PoolerStartLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerEndLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerAnswerClass
-    - forward
-
-[[autodoc]] modeling_utils.SquadHeadOutput
-
-[[autodoc]] modeling_utils.SQuADHead
-    - forward
-
-[[autodoc]] modeling_utils.SequenceSummary
-    - forward
-
 ## PyTorch Helper Functions

 [[autodoc]] pytorch_utils.apply_chunking_to_forward
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -720,6 +720,8 @@
        title: Qwen2VL
      - local: in_translation
        title: (번역중) Segment Anything
+      - local: model_doc/siglip
+        title: SigLIP
      - local: in_translation
        title: (번역중) Speech Encoder Decoder Models
      - local: in_translation
--- a/docs/source/ko/internal/modeling_utils.md
+++ b/docs/source/ko/internal/modeling_utils.md
@ -25,23 +25,6 @@ rendered properly in your Markdown viewer.

 [[autodoc]] pytorch_utils.Conv1D

-[[autodoc]] modeling_utils.PoolerStartLogits
-   - forward
-
-[[autodoc]] modeling_utils.PoolerEndLogits
-   - forward
-
-[[autodoc]] modeling_utils.PoolerAnswerClass
-   - forward
-
-[[autodoc]] modeling_utils.SquadHeadOutput
-
-[[autodoc]] modeling_utils.SQuADHead
-   - forward
-
-[[autodoc]] modeling_utils.SequenceSummary
-   - forward
-
 ## PyTorch 헬퍼(helper) 함수 [[transformers.apply_chunking_to_forward]]

 [[autodoc]] pytorch_utils.apply_chunking_to_forward
--- a/docs/source/ko/model_doc/siglip.md
+++ b/docs/source/ko/model_doc/siglip.md
@ -0,0 +1,253 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# SigLIP[[siglip]]
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## 개요[[overview]]
+
+SigLIP 모델은 Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer의 [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) 논문에서 제안되었습니다. SigLIP은 [CLIP](clip)에서 사용된 손실 함수를 간단한 쌍별 시그모이드 손실(pairwise sigmoid loss)로 대체할 것을 제안합니다. 이는 ImageNet에서 제로샷 분류 정확도 측면에서 더 나은 성능을 보입니다.
+
+논문의 초록은 다음과 같습니다:
+
+*우리는 언어-이미지 사전 학습(Language-Image Pre-training, SigLIP)을 위한 간단한 쌍별 시그모이드 손실을 제안합니다. 소프트맥스 정규화를 사용하는 표준 대조 학습과 달리, 시그모이드 손실은 이미지-텍스트 쌍에만 작용하며 정규화를 위해 쌍별 유사성의 전역적 관점을 필요로 하지 않습니다. 시그모이드 손실은 배치 크기를 더욱 확장할 수 있게 하는 동시에 작은 배치 크기에서도 더 나은 성능을 보입니다. Locked-image Tuning과 결합하여, 단 4개의 TPUv4 칩만으로 이틀 만에 84.5%의 ImageNet 제로샷 정확도를 달성하는 SigLiT 모델을 학습했습니다. 손실 함수에서 배치 크기를 분리함으로써 예제 대 쌍의 영향과 Negative 대 Positive 비율을 연구할 수 있게 되었습니다. 마지막으로, 우리는 배치 크기를 100만 개까지 극단적으로 늘려보았고, 배치 크기 증가의 이점이 빠르게 감소하며 32k의 더 합리적인 배치 크기로도 충분하다는 것을 발견했습니다.*
+
+## 사용 팁[[usage-tips]]
+
+- SigLIP의 사용법은 [CLIP](clip)과 유사합니다. 주요 차이점은 학습 손실 함수로, 배치 내 모든 이미지와 텍스트 간의 쌍별 유사성에 대한 전역적 관점이 필요하지 않습니다. 소프트맥스 대신 로짓에 시그모이드 활성화 함수를 적용해야 합니다.
+- 학습은 지원되지만 `torch.distributed` 유틸리티를 사용하지 않아 배치 크기의 확장성이 제한될 수 있습니다. 그러나 단일 노드 다중 GPU 설정에서는 DDP와 FSDP가 작동합니다.
+- 독립형 [`SiglipTokenizer`] 또는 [`SiglipProcessor`]를 사용할 때는 모델이 그렇게 학습되었으므로 `padding="max_length"`를 전달해야 합니다.
+- 파이프라인과 동일한 결과를 얻으려면 "This is a photo of {label}."의 프롬프트 템플릿을 사용해야 합니다.
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
+alt="drawing" width="600"/>
+
+<small> CLIP과 비교한 SigLIP 평가 결과. <a href="https://arxiv.org/abs/2303.15343">원본 논문</a>에서 발췌.</small>
+
+이 모델은 [nielsr](https://huggingface.co/nielsr)가 기여했습니다.
+원본 코드는 [여기](https://github.com/google-research/big_vision/tree/main)에서 찾을 수 있습니다.
+
+## 사용 예시[[usage-example]]
+
+SigLIP을 사용하는 방법에는 두 가지 주요 방법이 있습니다: 모든 복잡성을 추상화하는 파이프라인 API를 사용하거나, 직접 `SiglipModel` 클래스를 사용하는 방법입니다.
+
+### 파이프라인 API[[pipeline-API]]
+
+파이프라인을 사용하면 몇 줄의 코드로 모델을 사용할 수 있습니다:
+
+```python
+>>> from transformers import pipeline
+>>> from PIL import Image
+>>> import requests
+
+>>> # 파이프라인 로드
+>>> image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")
+
+>>> # 이미지 로드
+>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> # 추론
+>>> candidate_labels = ["2 cats", "a plane", "a remote"]
+>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
+>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
+>>> print(outputs)
+[{'score': 0.1979, 'label': '2 cats'}, {'score': 0.0, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
+```
+
+### 직접 모델 사용하기[[using-the-model-yourself]]
+
+전처리와 후처리를 직접 수행하려면 다음과 같이 하면 됩니다:
+
+```python
+>>> from PIL import Image
+>>> import requests
+>>> from transformers import AutoProcessor, AutoModel
+>>> import torch
+
+>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
+>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> candidate_labels = ["2 cats", "2 dogs"]
+>>> # 파이프라인 프롬프트 템플릿을 따라 동일한 결과를 얻습니다
+>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
+>>> # 중요: 모델이 이렇게 학습되었으므로 `padding=max_length`를 전달합니다
+>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
+
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+
+>>> logits_per_image = outputs.logits_per_image
+>>> probs = torch.sigmoid(logits_per_image) # 시그모이드 활성화 함수를 적용한 확률입니다
+>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
+19.8% that image 0 is '2 cats'
+```
+
+## 리소스[[resources]]
+
+SigLIP을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티(🌎로 표시) 리소스 목록입니다.
+
+- [제로샷 이미지 분류 작업 가이드](../tasks/zero_shot_image_classification)
+- SigLIP에 대한 데모 노트북은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP)에서 찾을 수 있습니다. 🌎
+
+여기에 포함될 리소스를 제출하는 데 관심이 있으시면 Pull Request를 열어주시면 검토하겠습니다! 리소스는 이상적으로 기존 리소스를 복제하는 대신 새로운 것을 보여주어야 합니다.
+
+
+## SigLIP과 Flash Attention 2 결합하기[[combining-siglip-with-flash-attention-2]]
+
+먼저 Flash Attention 2의 최신 버전을 설치해야 합니다.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+또한 Flash-Attention 2와 호환되는 하드웨어가 있는지 확인하세요. flash-attn 저장소의 공식 문서에서 자세히 알아보세요. 또한 모델을 반정밀도(예: `torch.float16`)로 로드해야 합니다.
+
+Flash Attention 2를 사용하여 모델을 로드하고 실행하려면 아래 코드를 참조하세요:
+
+```python
+>>> import torch
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import SiglipProcessor, SiglipModel
+>>> device = "cuda" # 모델을 로드할 장치
+
+>>> model = SiglipModel.from_pretrained(
+...     "google/siglip-so400m-patch14-384",
+...     attn_implementation="flash_attention_2",
+...     torch_dtype=torch.float16,
+...     device_map=device,
+... )
+>>> processor = SiglipProcessor.from_pretrained("google/siglip-so400m-patch14-384")
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> candidate_labels = ["2 cats", "2 dogs"]
+>>> # 파이프라인 프롬프트 템플릿을 따라 동일한 결과를 얻습니다
+>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
+>>> # 중요: 모델이 이렇게 학습되었으므로 `padding=max_length`를 전달합니다
+>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
+
+>>> with torch.no_grad():
+...     with torch.autocast(device):
+...         outputs = model(**inputs)
+
+>>> logits_per_image = outputs.logits_per_image
+>>> probs = torch.sigmoid(logits_per_image) # 시그모이드 활성화 함수를 적용한 확률입니다
+>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
+19.8% that image 0 is '2 cats'
+```
+
+
+## Scaled Dot Product Attention(SDPA) 사용하기[[using-scaled-dot-product-attention(SDPA)]]
+
+PyTorch는 `torch.nn.functional`의 일부로 스케일된 점곱 어텐션(SDPA) 연산자를 포함합니다. 이 함수는 
+입력과 사용 중인 하드웨어에 따라 적용할 수 있는 여러 구현을 포함합니다. 자세한 내용은 
+[공식 문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+또는 [GPU 추론](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) 
+페이지를 참조하세요.
+
+`from_pretrained()`에서 `attn_implementation="sdpa"`를 설정하여 SDPA를 명시적으로 요청할 수 있습니다. `torch>=2.1.1`이 설치되어 있는지 확인하세요.
+
+```python
+>>> from transformers import SiglipModel
+
+>>> model = SiglipModel.from_pretrained(
+...     "google/siglip-so400m-patch14-384",
+...     attn_implementation="sdpa",
+...     torch_dtype=torch.float16,
+...     device_map=device,
+... )
+```
+
+최상의 속도 향상을 위해 모델을 반정밀도(예: `torch.float16` 또는 `torch.bfloat16`)로 로드하는 것이 좋습니다.
+
+
+## 예상 속도 향상[[expected-speedups]]
+
+아래는 `google/siglip-so400m-patch14-384` 체크포인트를 `float16` 정밀도로 사용하는 transformers의 네이티브 구현과 Flash Attention 2 / SDPA 버전의 모델을 다양한 배치 크기로 비교한 추론 시간의 예상 속도 향상 다이어그램입니다.
+
+<div style="text-align: center">
+<img src="https://i.imgur.com/cWm4rsn.png">
+</div>
+
+
+## SiglipConfig
+
+[[autodoc]] SiglipConfig
+    - from_text_vision_configs
+
+## SiglipTextConfig
+
+[[autodoc]] SiglipTextConfig
+
+## SiglipVisionConfig
+
+[[autodoc]] SiglipVisionConfig
+
+## SiglipTokenizer
+
+[[autodoc]] SiglipTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## SiglipImageProcessor
+
+[[autodoc]] SiglipImageProcessor
+    - preprocess
+
+## SiglipImageProcessorFast
+
+[[autodoc]] SiglipImageProcessorFast
+    - preprocess
+
+## SiglipProcessor
+
+[[autodoc]] SiglipProcessor
+
+## SiglipModel
+
+[[autodoc]] SiglipModel
+    - forward
+    - get_text_features
+    - get_image_features
+
+## SiglipTextModel
+
+[[autodoc]] SiglipTextModel
+    - forward
+
+## SiglipVisionModel
+
+[[autodoc]] SiglipVisionModel
+    - forward
+
+
+## SiglipForImageClassification
+
+[[autodoc]] SiglipForImageClassification
+    - forward 
--- a/docs/source/zh/internal/modeling_utils.md
+++ b/docs/source/zh/internal/modeling_utils.md
@ -25,23 +25,6 @@ rendered properly in your Markdown viewer.

 [[autodoc]] pytorch_utils.Conv1D

-[[autodoc]] modeling_utils.PoolerStartLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerEndLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerAnswerClass
-    - forward
-
-[[autodoc]] modeling_utils.SquadHeadOutput
-
-[[autodoc]] modeling_utils.SQuADHead
-    - forward
-
-[[autodoc]] modeling_utils.SequenceSummary
-    - forward
-
 ## PyTorch帮助函数

 [[autodoc]] pytorch_utils.apply_chunking_to_forward
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@ -617,9 +617,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -640,6 +637,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump(all_results, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@ -778,9 +778,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -798,6 +795,9 @@ def main():
                    token=args.hub_token,
                )

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@ -714,9 +714,6 @@ def main():

    logger.info(f"Test metrics: {metrics}")

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -739,6 +736,9 @@ def main():
                    ignore_patterns=["epoch_*"],
                )

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -697,9 +697,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -719,6 +716,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump({"perplexity": perplexity}, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -891,9 +891,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -908,6 +905,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump({"perplexity": perplexity}, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -735,9 +735,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -757,6 +754,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump({"perplexity": perplexity}, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -622,9 +622,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -645,6 +642,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump(all_results, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@ -759,9 +759,6 @@ def main():

    logger.info(f"Test metrics: {metrics}")

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -784,6 +781,9 @@ def main():
                    ignore_patterns=["epoch_*"],
                )

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@ -602,9 +602,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -628,6 +625,9 @@ def main():
            with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
                json.dump(all_results, f, indent=2)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@ -19,6 +19,7 @@ import logging
 import os
 import random
 import sys
+from collections import Counter
 from dataclasses import dataclass, field
 from typing import Optional

@ -467,6 +468,14 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
+
+    def print_class_distribution(dataset, split_name):
+        label_counts = Counter(dataset["label"])
+        total = sum(label_counts.values())
+        logger.info(f"Class distribution in {split_name} set:")
+        for label, count in label_counts.items():
+            logger.info(f"  Label {label}: {count} ({count / total:.2%})")
+
    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
@ -474,6 +483,7 @@ def main():
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
+        print_class_distribution(train_dataset, "train")

    if training_args.do_eval:
        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
@ -482,6 +492,7 @@ def main():
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
+        print_class_distribution(eval_dataset, "validation")

    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
@ -490,6 +501,7 @@ def main():
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))
+        print_class_distribution(predict_dataset, "test")

    # Log a few random samples from the training set:
    if training_args.do_train:
@ -508,8 +520,12 @@ def main():
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+        labels = p.label_ids
+        if not training_args.eval_do_concat_batches:
+            preds = np.concatenate(preds, axis=0)
+            labels = np.concatenate(p.label_ids, axis=0)
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
-        result = metric.compute(predictions=preds, references=p.label_ids)
+        result = metric.compute(predictions=preds, references=labels)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -634,9 +634,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -679,6 +676,9 @@ def main():
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump(all_results, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/text-generation/run_generation.py
+++ b/examples/pytorch/text-generation/run_generation.py
@ -38,8 +38,6 @@ from transformers import (
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    OPTForCausalLM,
-    TransfoXLLMHeadModel,
-    TransfoXLTokenizer,
    XLMTokenizer,
    XLMWithLMHeadModel,
    XLNetLMHeadModel,
@ -62,7 +60,6 @@ MODEL_CLASSES = {
    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
-    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
    "gptj": (GPTJForCausalLM, AutoTokenizer),
    "bloom": (BloomForCausalLM, BloomTokenizerFast),
@ -368,10 +365,7 @@ def main():
        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
        preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)

-        if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
-            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
-        else:
-            tokenizer_kwargs = {}
+        tokenizer_kwargs = {}

        encoded_prompt = tokenizer.encode(
            preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@ -529,6 +529,9 @@ def main():

    def compute_metrics(p):
        predictions, labels = p
+        if not training_args.eval_do_concat_batches:
+            predictions = np.hstack(predictions)
+            labels = np.hstack(labels)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@ -794,9 +794,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -826,6 +823,9 @@ def main():
                        all_results[key] = int(value)
                json.dump(all_results, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@ -762,9 +762,6 @@ def main():
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

-    if args.with_tracking:
-        accelerator.end_training()
-
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
@ -784,6 +781,9 @@ def main():
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"eval_bleu": eval_metric["score"]}, f)

+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+

 if __name__ == "__main__":
    main()
--- a/model_cards/README.md
+++ b/model_cards/README.md
@ -1,22 +0,0 @@
-## 🔥 Model cards now live inside each huggingface.co model repo 🔥
-
-
-For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub.
-
-### How to update a model card
-
-You can directly update a model card inside any model repo you have **write access** to, i.e.:
- a model under your username namespace
- a model under any organization you are a part of.
-
-You can either:
- update it, commit and push using your usual git workflow (command line, GUI, etc.)
- or edit it directly from the website's UI.
-
-**What if you want to create or update a model card for a model you don't have write access to?**
-
-In that case, you can open a [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)! Check out the [announcement](https://huggingface.co/blog/community-update) of this feature for more details 🤗.
-
-### What happened to the model cards here?
-
-We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub.
--- a/scripts/benchmark/trainer-benchmark.py
+++ b/scripts/benchmark/trainer-benchmark.py
@ -1,448 +0,0 @@
-#!/usr/bin/env python
-
-# HF Trainer benchmarking tool
-#
-# This tool can be used to run and compare multiple dimensions of the HF Trainers args.
-#
-# It then prints a report once in github format with all the information that needs to be shared
-# with others and second time in a console-friendly format, so it's easier to use for tuning things up.
-#
-# The main idea is:
-#
-#     ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
-#     --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
-#     --target-metric-key train_samples_per_second
-#
-# The variations can be any command line argument that you want to compare and not just dtype as in
-# the example.
-#
-# --variations allows you to compare variations in multiple dimensions.
-#
-# as the first dimension has 2 options and the second 3 in our example, this will run the trainer 6
-# times adding one of:
-#
-#    1. --tf32 0 --fp16 0
-#    2. --tf32 0 --fp16 1
-#    3. --tf32 0 --bf16 1
-#    4. --tf32 1 --fp16 0
-#    5. --tf32 1 --fp16 1
-#    6. --tf32 1 --bf16 1
-#
-# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
-#
-# If you want to rely on defaults, this:
-#    --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
-# is identical to this:
-#    --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
-#
-# the leading empty variation in the 2nd dimension is a valid variation.
-#
-# So here we get the following 6 variations:
-#
-#    1. --tf32 0
-#    2. --tf32 0 --fp16
-#    3. --tf32 0 --bf16
-#    4. --tf32 1
-#    5. --tf32 1 --fp16
-#    6. --tf32 1 --bf16
-#
-# In this particular case we don't know what the default tf32 setting is as it's normally
-# pytorch-version dependent). That's why it's best to do an explicit setting of each variation:
-#    `--tf32 0|--tf32 1`
-#
-# Here is a full example of a train:
-#
-# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
-# --base-cmd \
-# ' examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small \
-# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
-# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
-# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
-# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
-# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
-# --max_train_samples 20000 --dataloader_num_workers 2 ' \
-# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
-# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
-# --repeat-times 1 --base-variation '--tf32 0'
-#
-# and here is a possible output:
-#
-#
-# | Variation       |     Train |   Diff |   Train |
-# |                 |   samples |      % |    loss |
-# |                 |       per |        |         |
-# |                 |    second |        |         |
-# |:----------------|----------:|-------:|--------:|
-# | --tf32 0        |    285.11 |      0 |    2.51 |
-# | --tf32 1        |    342.09 |     20 |    2.51 |
-# | --fp16 --tf32 0 |    423.49 |     49 |    2.51 |
-# | --fp16 --tf32 1 |    423.13 |     48 |    2.51 |
-# | --bf16 --tf32 0 |    416.80 |     46 |    2.52 |
-# | --bf16 --tf32 1 |    415.87 |     46 |    2.52 |
-#
-#
-# So you can quickly compare the different outcomes.
-#
-# Typically running each experiment once is enough, but if the environment is unstable you can
-# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results.
-#
-# By default it'll use the lowest result as the base line to use as 100% and then compare the rest to
-# it as can be seen from the table above, but you can also specify which combination is the one to use as
-# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0'
-#
-# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are
-# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use:
-#    --target-metric-key eval_samples_per_second
-# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
-# well (as currently it doesn't)
-#
-
-import argparse
-import datetime
-import io
-import itertools
-import json
-import math
-import os
-import platform
-import re
-import shlex
-import subprocess
-import sys
-from pathlib import Path
-from statistics import fmean
-
-import pandas as pd
-import torch
-from tqdm import tqdm
-
-import transformers
-
-
-nan = float("nan")
-
-
-class Tee:
-    """
-    A helper class to tee print's output into a file.
-    Usage:
-    sys.stdout = Tee(filename)
-    """
-
-    def __init__(self, filename):
-        self.stdout = sys.stdout
-        self.file = open(filename, "a")
-
-    def __getattr__(self, attr):
-        return getattr(self.stdout, attr)
-
-    def write(self, msg):
-        self.stdout.write(msg)
-        # strip tqdm codes
-        self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M))
-
-
-def get_original_command(max_width=80, full_python_path=False):
-    """
-    Return the original command line string that can be replayed nicely and wrapped for 80 char width.
-
-    Args:
-        max_width (`int`, *optional*, defaults to 80):
-            The width to wrap for.
-        full_python_path (`bool`, `optional`, defaults to `False`):
-             Whether to replicate the full path or just the last segment (i.e. `python`).
-    """
-
-    cmd = []
-
-    # deal with critical env vars
-    env_keys = ["CUDA_VISIBLE_DEVICES"]
-    for key in env_keys:
-        val = os.environ.get(key, None)
-        if val is not None:
-            cmd.append(f"{key}={val}")
-
-    # python executable (not always needed if the script is executable)
-    python = sys.executable if full_python_path else sys.executable.split("/")[-1]
-    cmd.append(python)
-
-    # now the normal args
-    cmd += list(map(shlex.quote, sys.argv))
-
-    # split up into up to MAX_WIDTH lines with shell multi-line escapes
-    lines = []
-    current_line = ""
-    while len(cmd) > 0:
-        current_line += f"{cmd.pop(0)} "
-        if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
-            lines.append(current_line)
-            current_line = ""
-    return "\\\n".join(lines)
-
-
-def get_base_command(args, output_dir):
-
-    # unwrap multi-line input
-    args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)
-
-    # remove --output_dir if any and set our own
-    args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd)
-    args.base_cmd += f" --output_dir {output_dir}"
-
-    # ensure we have --overwrite_output_dir
-    args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd)
-    args.base_cmd += " --overwrite_output_dir"
-
-    return [sys.executable] + shlex.split(args.base_cmd)
-
-
-def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
-
-    # Enable to debug everything but the run itself, to do it fast and see the progress.
-    # This is useful for debugging the output formatting quickly - we can remove it later once
-    # everybody is happy with the output
-    if 0:
-        import random
-        from time import sleep
-
-        sleep(0)
-        return dict(
-            {k: random.uniform(0, 100) for k in metric_keys},
-            **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
-        )
-
-    result = subprocess.run(cmd, capture_output=True, text=True)
-
-    if verbose:
-        print("STDOUT", result.stdout)
-        print("STDERR", result.stderr)
-
-    # save the streams
-    prefix = variation.replace(" ", "-")
-    with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
-        f.write(result.stdout)
-    with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
-        f.write(result.stderr)
-
-    if result.returncode != 0:
-        if verbose:
-            print("failed")
-        return {target_metric_key: nan}
-
-    with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
-        metrics = json.load(f)
-
-    # filter out just the keys we want
-    return {k: v for k, v in metrics.items() if k in metric_keys}
-
-
-def process_run(
-    id,
-    cmd,
-    variation_key,
-    variation,
-    longest_variation_len,
-    target_metric_key,
-    report_metric_keys,
-    repeat_times,
-    output_dir,
-    verbose,
-):
-    results = []
-    metrics = []
-    preamble = f"{id}: {variation:<{longest_variation_len}}"
-    outcome = f"{preamble}: "
-    metric_keys = set(report_metric_keys + [target_metric_key])
-    for i in tqdm(range(repeat_times), desc=preamble, leave=False):
-        single_run_metrics = process_run_single(
-            id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
-        )
-        result = single_run_metrics[target_metric_key]
-        if not math.isnan(result):
-            metrics.append(single_run_metrics)
-            results.append(result)
-            outcome += "✓"
-        else:
-            outcome += "✘"
-    outcome = f"\33[2K\r{outcome}"
-    if len(metrics) > 0:
-        mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
-        mean_target = round(mean_metrics[target_metric_key], 2)
-        results_str = f"{outcome} {mean_target}"
-        if len(metrics) > 1:
-            results_str += f" {tuple(round(x, 2) for x in results)}"
-        print(results_str)
-        mean_metrics[variation_key] = variation
-        return mean_metrics
-    else:
-        print(outcome)
-        return {variation_key: variation, target_metric_key: nan}
-
-
-def get_versions():
-    properties = torch.cuda.get_device_properties(torch.device("cuda"))
-    return f"""
-Datetime    : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-
-Software:
-transformers: {transformers.__version__}
-torch       : {torch.__version__}
-cuda        : {torch.version.cuda}
-python      : {platform.python_version()}
-
-Hardware:
-{torch.cuda.device_count()} GPUs      : {properties.name}, {properties.total_memory/2**30:0.2f}GB
-"""
-
-
-def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
-
-    df = pd.DataFrame(results)
-    variation_key = "variation"
-    diff_key = "diff_%"
-
-    sentinel_value = nan
-    if base_variation is not None and len(df[df[variation_key] == base_variation]):
-        # this may still return nan
-        sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
-    if math.isnan(sentinel_value):
-        # as a fallback, use the minimal value as the sentinel
-        sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min()
-
-    # create diff column if possible
-    if not math.isnan(sentinel_value):
-        df[diff_key] = df.apply(
-            lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
-            if not math.isnan(r[target_metric_key])
-            else 0,
-            axis="columns",
-        )
-
-    # re-order columns
-    cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
-    df = df.reindex(cols, axis="columns")  # reorder cols
-
-    # capitalize
-    df = df.rename(str.capitalize, axis="columns")
-
-    # make the cols as narrow as possible
-    df_github = df.rename(lambda c: c.replace("_", "<br>"), axis="columns")
-    df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns")
-
-    report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"]
-    report += ["----------8<-----------------8<--------"]
-    report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")]
-    report += ["```"]
-    report += ["*** Setup:", get_versions()]
-    report += ["*** The benchmark command line was:", get_original_command()]
-    report += ["```"]
-    report += ["----------8<-----------------8<--------"]
-    report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")]
-
-    print("\n\n".join(report))
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--base-cmd",
-        default=None,
-        type=str,
-        required=True,
-        help="Base cmd",
-    )
-    parser.add_argument(
-        "--variations",
-        default=None,
-        type=str,
-        nargs="+",
-        required=True,
-        help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'",
-    )
-    parser.add_argument(
-        "--base-variation",
-        default=None,
-        type=str,
-        help="Baseline variation to compare to. if None the minimal target value will be used to compare against",
-    )
-    parser.add_argument(
-        "--target-metric-key",
-        default=None,
-        type=str,
-        required=True,
-        help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second",
-    )
-    parser.add_argument(
-        "--report-metric-keys",
-        default="",
-        type=str,
-        help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples",
-    )
-    parser.add_argument(
-        "--repeat-times",
-        default=1,
-        type=int,
-        help="How many times to re-run each variation - an average will be reported",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default="output_benchmark",
-        type=str,
-        help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked",
-    )
-    parser.add_argument(
-        "--verbose",
-        default=False,
-        action="store_true",
-        help="Whether to show the outputs of each run or just the benchmark progress",
-    )
-    args = parser.parse_args()
-
-    output_dir = args.output_dir
-    Path(output_dir).mkdir(exist_ok=True)
-    base_cmd = get_base_command(args, output_dir)
-
-    # split each dimension into its --foo variations
-    dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations]
-    # build a cartesian product of dimensions and convert those back into cmd-line arg strings,
-    # while stripping white space for inputs that were empty
-    variations = list(map(str.strip, map(" ".join, itertools.product(*dims))))
-    longest_variation_len = max(len(x) for x in variations)
-
-    # split wanted keys
-    report_metric_keys = args.report_metric_keys.split()
-
-    # capture prints into a log file for convenience
-    report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
-    print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt")
-    print(f"and this script's output is also piped into {report_fn}")
-
-    sys.stdout = Tee(report_fn)
-
-    print(f"\n*** Running {len(variations)} benchmarks:")
-    print(f"Base command: {' '.join(base_cmd)}")
-
-    variation_key = "variation"
-    results = []
-    for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
-        cmd = base_cmd + variation.split()
-        results.append(
-            process_run(
-                id + 1,
-                cmd,
-                variation_key,
-                variation,
-                longest_variation_len,
-                args.target_metric_key,
-                report_metric_keys,
-                args.repeat_times,
-                output_dir,
-                args.verbose,
-            )
-        )
-
-    process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/deberta_scrtipt.py
+++ b/scripts/deberta_scrtipt.py
@ -1,85 +0,0 @@
-import time
-
-import torch
-
-from transformers import AutoModel, AutoTokenizer, pipeline
-
-
-test_sentence = 'Do you [MASK] the muffin man?'
-
-# for comparison
-bert = pipeline('fill-mask', model = 'bert-base-uncased')
-print('\n'.join([d['sequence'] for d in bert(test_sentence)]))
-
-
-deberta = pipeline('fill-mask', model = 'microsoft/deberta-v3-base', model_kwargs={"legacy": False})
-print('\n'.join([d['sequence'] for d in deberta(test_sentence)]))
-
-
-tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
-
-tokenized_dict = tokenizer(
-    ["Is this working",], ["Not yet",],
-    return_tensors="pt"
-)
-
-deberta.model.forward = torch.compile(deberta.model.forward)
-start=time.time()
-deberta.model(**tokenized_dict)
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-deberta.model(**tokenized_dict)
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-deberta.model(**tokenized_dict)
-end=time.time()
-print(end-start)
-
-
-model = AutoModel.from_pretrained('microsoft/deberta-base')
-model.config.return_dict = False
-model.config.output_hidden_states=False
-input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
-
-
-start=time.time()
-traced_model = torch.jit.trace(model, input_tuple)
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
-end=time.time()
-print(end-start)
-
-
-start=time.time()
-traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
-end=time.time()
-print(end-start)
-
-
-torch.jit.save(traced_model, "compiled_deberta.pt")
-
-
-
-# my_script_module = torch.jit.script(model)
--- a/scripts/fsmt/convert-allenai-wmt16.sh
+++ b/scripts/fsmt/convert-allenai-wmt16.sh
@ -1,71 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script acquires data and converts it to fsmt model
-# it covers:
-# - allenai/wmt16-en-de-dist-12-1
-# - allenai/wmt16-en-de-dist-6-1
-# - allenai/wmt16-en-de-12-1
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-mkdir data
-
-# get data (run once)
-
-cd data
-gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU'
-gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU'
-gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r'
-tar -xvzf trans_ende_12-1_0.2.tar.gz
-tar -xvzf trans_ende-dist_12-1_0.2.tar.gz
-tar -xvzf trans_ende-dist_6-1_0.2.tar.gz
-gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9'
-gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj'
-tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz
-tar -xvzf wmt16.en-de.deep-shallow.tar.gz
-cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2
-cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2
-cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2
-cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2
-cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2
-cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2
-cd -
-
-# run conversions and uploads
-
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1
-
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1
-
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-12-1
-
-
-# upload
-cd data
-transformers-cli upload -y wmt16-en-de-dist-12-1
-transformers-cli upload -y wmt16-en-de-dist-6-1
-transformers-cli upload -y wmt16-en-de-12-1
-cd -
-
-
-# if updating just small files and not the large models, here is a script to generate the right commands:
-perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
-# add/remove files as needed
-
--- a/scripts/fsmt/convert-allenai-wmt19.sh
+++ b/scripts/fsmt/convert-allenai-wmt19.sh
@ -1,59 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script acquires data and converts it to fsmt model
-# it covers:
-# - allenai/wmt19-de-en-6-6-base
-# - allenai/wmt19-de-en-6-6-big
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-mkdir data
-
-# get data (run once)
-
-cd data
-gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T'
-gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5'
-gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE'
-tar -xvzf wmt19.de-en.tar.gz
-tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz
-tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz
-cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1
-cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2
-cd -
-
-# run conversions and uploads
-
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-base
-
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-big
-
-
-# upload
-cd data
-transformers-cli upload -y wmt19-de-en-6-6-base
-transformers-cli upload -y wmt19-de-en-6-6-big
-cd -
-
-
-# if updating just small files and not the large models, here is a script to generate the right commands:
-perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
-# add/remove files as needed
-
--- a/scripts/fsmt/convert-facebook-wmt19.sh
+++ b/scripts/fsmt/convert-facebook-wmt19.sh
@ -1,70 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script acquires data and converts it to fsmt model
-# it covers:
-# - facebook/wmt19-ru-en
-# - facebook/wmt19-en-ru
-# - facebook/wmt19-de-en
-# - facebook/wmt19-en-de
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-mkdir data
-
-# get data (run once)
-
-cd data
-wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz
-wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz
-wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz
-wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz
-tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz
-tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz
-tar -xvzf wmt19.en-ru.ensemble.tar.gz
-tar -xvzf wmt19.ru-en.ensemble.tar.gz
-cd -
-
-# run conversions and uploads
-
-export PAIR=ru-en
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
-
-export PAIR=en-ru
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
-
-export PAIR=de-en
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
-
-export PAIR=en-de
-PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
-
-
-# upload
-cd data
-transformers-cli upload -y wmt19-ru-en
-transformers-cli upload -y wmt19-en-ru
-transformers-cli upload -y wmt19-de-en
-transformers-cli upload -y wmt19-en-de
-cd -
-
-# if updating just small files and not the large models, here is a script to generate the right commands:
-perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
-# add/remove files as needed
-
--- a/scripts/fsmt/eval-allenai-wmt16.sh
+++ b/scripts/fsmt/eval-allenai-wmt16.sh
@ -1,79 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script evals the following fsmt models
-# it covers:
-# - allenai/wmt16-en-de-dist-12-1
-# - allenai/wmt16-en-de-dist-6-1
-# - allenai/wmt16-en-de-12-1
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
-
-### Normal eval ###
-
-export PAIR=en-de
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=64
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-
-MODEL_PATH=allenai/wmt16-en-de-dist-12-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-MODEL_PATH=allenai/wmt16-en-de-dist-6-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-MODEL_PATH=allenai/wmt16-en-de-12-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-
-
-### Searching hparams eval ###
-
-
-export PAIR=en-de
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=32
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-
-MODEL_PATH=allenai/wmt16-en-de-dist-12-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
-
-
-MODEL_PATH=allenai/wmt16-en-de-dist-6-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
-
-
-MODEL_PATH=allenai/wmt16-en-de-12-1
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
--- a/scripts/fsmt/eval-allenai-wmt19.sh
+++ b/scripts/fsmt/eval-allenai-wmt19.sh
@ -1,67 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script evals the following fsmt models
-# it covers:
-# - allenai/wmt19-de-en-6-6-base
-# - allenai/wmt19-de-en-6-6-big
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
-
-### Normal eval ###
-
-export PAIR=de-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=64
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-
-MODEL_PATH=allenai/wmt19-de-en-6-6-base
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-MODEL_PATH=allenai/wmt19-de-en-6-6-big
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-
-
-### Searching hparams eval ###
-
-export PAIR=de-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=16
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-
-MODEL_PATH=allenai/wmt19-de-en-6-6-base
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
-
-MODEL_PATH=allenai/wmt19-de-en-6-6-big
-echo $PAIR $MODEL_PATH
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
--- a/scripts/fsmt/eval-facebook-wmt19.sh
+++ b/scripts/fsmt/eval-facebook-wmt19.sh
@ -1,161 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script evals the following fsmt models
-# it covers:
-# - facebook/wmt19-ru-en
-# - facebook/wmt19-en-ru
-# - facebook/wmt19-de-en
-# - facebook/wmt19-en-de
-
-
-# this script needs to be run from the top level of the transformers repo
-if [ ! -d "src/transformers" ]; then
-    echo "Error: This script needs to be run from the top of the transformers repo"
-    exit 1
-fi
-
-
-# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
-
-### a short estimate version for quick testing ###
-
-export PAIR=en-ru
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=8
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src | head -10 > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref | head -10 > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-
-
-### Normal eval ###
-
-# ru-en
-
-export PAIR=ru-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=50
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-
-# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)
-
-
-# en-ru
-
-export PAIR=en-ru
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=50
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)
-
-
-
-# en-de
-
-export PAIR=en-de
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
-
-
-# de-en
-
-export PAIR=de-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=50
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-
-# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)
-
-
-### Searching hparams eval ###
-
-# en-ru
-
-export PAIR=ru-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=32
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
-
-
-# en-ru
-
-export PAIR=en-ru
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=16
-mkdir -p $DATA_DIR
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
-
-# en-de
-
-export PAIR=en-de
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=16
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
-
-# de-en
-
-export PAIR=de-en
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=16
-mkdir -p $DATA_DIR
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
--- a/scripts/fsmt/fsmt-make-super-tiny-model.py
+++ b/scripts/fsmt/fsmt-make-super-tiny-model.py
@ -1,88 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script creates a super tiny model that is useful inside tests, when we just want to test that
-# the machinery works, without needing to the check the quality of the outcomes.
-#
-# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
-# all files ~60KB. As compared to taking a full-size model, reducing to the minimum its layers and
-# emb dimensions, but keeping the full vocab + merges files, leading to ~3MB in total for all files.
-# The latter is done by `fsmt-make-super-tiny-model.py`.
-#
-# It will be used then as "stas/tiny-wmt19-en-ru"
-
-import json
-import tempfile
-from pathlib import Path
-
-from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
-from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
-
-
-mname_tiny = "tiny-wmt19-en-ru"
-
-# Build
-
-# borrowed from a test
-vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
-vocab_tokens = dict(zip(vocab, range(len(vocab))))
-merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
-
-with tempfile.TemporaryDirectory() as tmpdirname:
-    build_dir = Path(tmpdirname)
-    src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
-    tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
-    merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
-    with open(src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
-    with open(tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
-    with open(merges_file, "w") as fp   : fp.write("\n".join(merges))
-
-    tokenizer = FSMTTokenizer(
-        langs=["en", "ru"],
-        src_vocab_size = len(vocab),
-        tgt_vocab_size = len(vocab),
-        src_vocab_file=src_vocab_file,
-        tgt_vocab_file=tgt_vocab_file,
-        merges_file=merges_file,
-    )
-
-config = FSMTConfig(
-    langs=['ru', 'en'],
-    src_vocab_size=1000, tgt_vocab_size=1000,
-    d_model=4,
-    encoder_layers=1, decoder_layers=1,
-    encoder_ffn_dim=4, decoder_ffn_dim=4,
-    encoder_attention_heads=1, decoder_attention_heads=1,
-)
-
-tiny_model = FSMTForConditionalGeneration(config)
-print(f"num of params {tiny_model.num_parameters()}")
-
-# Test
-batch = tokenizer(["Making tiny model"], return_tensors="pt")
-outputs = tiny_model(**batch)
-
-print("test output:", len(outputs.logits[0]))
-
-# Save
-tiny_model.half() # makes it smaller
-tiny_model.save_pretrained(mname_tiny)
-tokenizer.save_pretrained(mname_tiny)
-
-print(f"Generated {mname_tiny}")
-
-# Upload
-# transformers-cli upload tiny-wmt19-en-ru
--- a/scripts/fsmt/fsmt-make-tiny-model.py
+++ b/scripts/fsmt/fsmt-make-tiny-model.py
@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script creates a super tiny model that is useful inside tests, when we just want to test that
-# the machinery works, without needing to the check the quality of the outcomes.
-#
-# This version creates a tiny model through reduction of a normal pre-trained model, but keeping the
-# full vocab, merges file, and thus also resulting in a larger model due to a large vocab size.
-# This gives ~3MB in total for all files.
-#
-# If you want a 50 times smaller than this see `fsmt-make-super-tiny-model.py`, which is slightly more complicated
-#
-#
-# It will be used then as "stas/tiny-wmt19-en-de"
-
-# Build
-from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
-
-
-mname = "facebook/wmt19-en-de"
-tokenizer = FSMTTokenizer.from_pretrained(mname)
-# get the correct vocab sizes, etc. from the master model
-config = FSMTConfig.from_pretrained(mname)
-config.update({
-    "d_model": 4,
-    "encoder_layers": 1, "decoder_layers": 1,
-    "encoder_ffn_dim": 4, "decoder_ffn_dim": 4,
-    "encoder_attention_heads": 1, "decoder_attention_heads": 1})
-
-tiny_model = FSMTForConditionalGeneration(config)
-print(f"num of params {tiny_model.num_parameters()}")
-
-# Test
-batch = tokenizer(["Making tiny model"], return_tensors="pt")
-outputs = tiny_model(**batch)
-
-print("test output:", len(outputs.logits[0]))
-
-# Save
-mname_tiny = "tiny-wmt19-en-de"
-tiny_model.half() # makes it smaller
-tiny_model.save_pretrained(mname_tiny)
-tokenizer.save_pretrained(mname_tiny)
-
-print(f"Generated {mname_tiny}")
-
-# Upload
-# transformers-cli upload tiny-wmt19-en-de
--- a/scripts/fsmt/gen-card-allenai-wmt16.py
+++ b/scripts/fsmt/gen-card-allenai-wmt16.py
@ -1,156 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Usage:
-# ./gen-card-allenai-wmt16.py
-
-import os
-from pathlib import Path
-
-
-def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
-
-    texts = {
-        "en": "Machine learning is great, isn't it?",
-        "ru": "Машинное обучение - это здорово, не так ли?",
-        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
-    }
-
-    # BLUE scores as follows:
-    # "pair": [fairseq, transformers]
-    scores = {
-        "wmt16-en-de-dist-12-1": [28.3, 27.52],
-        "wmt16-en-de-dist-6-1": [27.4, 27.11],
-        "wmt16-en-de-12-1": [26.9, 25.75],
-    }
-    pair = f"{src_lang}-{tgt_lang}"
-
-    readme = f"""
---
-language:
- {src_lang}
- {tgt_lang}
-thumbnail:
-tags:
- translation
- wmt16
- allenai
-license: apache-2.0
-datasets:
- wmt16
-metrics:
- bleu
---
-
-# FSMT
-
-## Model description
-
-This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
-
-For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
-
-All 3 models are available:
-
-* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1)
-* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1)
-* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1)
-
-
-## Intended uses & limitations
-
-#### How to use
-
-```python
-from transformers import FSMTForConditionalGeneration, FSMTTokenizer
-mname = "allenai/{model_name}"
-tokenizer = FSMTTokenizer.from_pretrained(mname)
-model = FSMTForConditionalGeneration.from_pretrained(mname)
-
-input = "{texts[src_lang]}"
-input_ids = tokenizer.encode(input, return_tensors="pt")
-outputs = model.generate(input_ids)
-decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(decoded) # {texts[tgt_lang]}
-
-```
-
-#### Limitations and bias
-
-
-## Training data
-
-Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
-
-## Eval results
-
-Here are the BLEU scores:
-
-model   | fairseq | transformers
-------|---------|----------
-{model_name}  | {scores[model_name][0]} | {scores[model_name][1]}
-
-The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. `transformers` score was measured using `sacrebleu` on detokenized outputs.
-
-The score was calculated using this code:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-export PAIR={pair}
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-```
-
-## Data Sources
-
- [training, etc.](http://www.statmt.org/wmt16/)
- [test set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372)
-
-
-### BibTeX entry and citation info
-
-```
-@misc{{kasai2020deep,
-    title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
-    author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
-    year={{2020}},
-    eprint={{2006.10369}},
-    archivePrefix={{arXiv}},
-    primaryClass={{cs.CL}}
-}}
-```
-
-"""
-    model_card_dir.mkdir(parents=True, exist_ok=True)
-    path = os.path.join(model_card_dir, "README.md")
-    print(f"Generating {path}")
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(readme)
-
-# make sure we are under the root of the project
-repo_dir = Path(__file__).resolve().parent.parent.parent
-model_cards_dir = repo_dir / "model_cards"
-
-for model_name in ["wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"]:
-    model_card_dir = model_cards_dir / "allenai" / model_name
-    write_model_card(model_card_dir, src_lang="en", tgt_lang="de", model_name=model_name)
--- a/scripts/fsmt/gen-card-allenai-wmt19.py
+++ b/scripts/fsmt/gen-card-allenai-wmt19.py
@ -1,153 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Usage:
-# ./gen-card-allenai-wmt19.py
-
-import os
-from pathlib import Path
-
-
-def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
-
-    texts = {
-        "en": "Machine learning is great, isn't it?",
-        "ru": "Машинное обучение - это здорово, не так ли?",
-        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
-    }
-
-    # BLUE scores as follows:
-    # "pair": [fairseq, transformers]
-    scores = {
-        "wmt19-de-en-6-6-base": [0, 38.37],
-        "wmt19-de-en-6-6-big": [0, 39.90],
-    }
-    pair = f"{src_lang}-{tgt_lang}"
-
-    readme = f"""
---
-
-language:
- {src_lang}
- {tgt_lang}
-thumbnail:
-tags:
- translation
- wmt19
- allenai
-license: apache-2.0
-datasets:
- wmt19
-metrics:
- bleu
---
-
-# FSMT
-
-## Model description
-
-This is a ported version of fairseq-based [wmt19 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
-
-For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
-
-2 models are available:
-
-* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big)
-* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base)
-
-
-## Intended uses & limitations
-
-#### How to use
-
-```python
-from transformers import FSMTForConditionalGeneration, FSMTTokenizer
-mname = "allenai/{model_name}"
-tokenizer = FSMTTokenizer.from_pretrained(mname)
-model = FSMTForConditionalGeneration.from_pretrained(mname)
-
-input = "{texts[src_lang]}"
-input_ids = tokenizer.encode(input, return_tensors="pt")
-outputs = model.generate(input_ids)
-decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(decoded) # {texts[tgt_lang]}
-
-```
-
-#### Limitations and bias
-
-
-## Training data
-
-Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
-
-## Eval results
-
-Here are the BLEU scores:
-
-model   |  transformers
-------|---------
-{model_name}  |  {scores[model_name][1]}
-
-The score was calculated using this code:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-export PAIR={pair}
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=5
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-```
-
-## Data Sources
-
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
-
-
-### BibTeX entry and citation info
-
-```
-@misc{{kasai2020deep,
-    title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
-    author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
-    year={{2020}},
-    eprint={{2006.10369}},
-    archivePrefix={{arXiv}},
-    primaryClass={{cs.CL}}
-}}
-```
-
-"""
-    model_card_dir.mkdir(parents=True, exist_ok=True)
-    path = os.path.join(model_card_dir, "README.md")
-    print(f"Generating {path}")
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(readme)
-
-# make sure we are under the root of the project
-repo_dir = Path(__file__).resolve().parent.parent.parent
-model_cards_dir = repo_dir / "model_cards"
-
-for model_name in ["wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"]:
-    model_card_dir = model_cards_dir / "allenai" / model_name
-    write_model_card(model_card_dir, src_lang="de", tgt_lang="en", model_name=model_name)
--- a/scripts/fsmt/gen-card-facebook-wmt19.py
+++ b/scripts/fsmt/gen-card-facebook-wmt19.py
@ -1,165 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Usage:
-# ./gen-card-facebook-wmt19.py
-
-import os
-from pathlib import Path
-
-
-def write_model_card(model_card_dir, src_lang, tgt_lang):
-
-    texts = {
-        "en": "Machine learning is great, isn't it?",
-        "ru": "Машинное обучение - это здорово, не так ли?",
-        "de": "Maschinelles Lernen ist großartig, oder?",
-    }
-
-    # BLEU scores as follows:
-    # "pair": [fairseq, transformers]
-    scores = {
-        "ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"],
-        "en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"],
-        "en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"],
-        "de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"],
-    }
-    pair = f"{src_lang}-{tgt_lang}"
-
-    readme = f"""
---
-language:
- {src_lang}
- {tgt_lang}
-thumbnail:
-tags:
- translation
- wmt19
- facebook
-license: apache-2.0
-datasets:
- wmt19
-metrics:
- bleu
---
-
-# FSMT
-
-## Model description
-
-This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}.
-
-For more details, please see, [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).
-
-The abbreviation FSMT stands for FairSeqMachineTranslation
-
-All four models are available:
-
-* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
-* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
-* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
-* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)
-
-## Intended uses & limitations
-
-#### How to use
-
-```python
-from transformers import FSMTForConditionalGeneration, FSMTTokenizer
-mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
-tokenizer = FSMTTokenizer.from_pretrained(mname)
-model = FSMTForConditionalGeneration.from_pretrained(mname)
-
-input = "{texts[src_lang]}"
-input_ids = tokenizer.encode(input, return_tensors="pt")
-outputs = model.generate(input_ids)
-decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(decoded) # {texts[tgt_lang]}
-
-```
-
-#### Limitations and bias
-
- The original (and this ported model) doesn't seem to handle well inputs with repeated sub-phrases, [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981)
-
-## Training data
-
-Pretrained weights were left identical to the original model released by fairseq. For more details, please, see the [paper](https://arxiv.org/abs/1907.06616).
-
-## Eval results
-
-pair   | fairseq | transformers
-------|---------|----------
-{pair}  | {scores[pair][0]} | {scores[pair][1]}
-
-The score is slightly below the score reported by `fairseq`, since `transformers`` currently doesn't support:
- model ensemble, therefore the best performing checkpoint was ported (``model4.pt``).
- re-ranking
-
-The score was calculated using this code:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-export PAIR={pair}
-export DATA_DIR=data/$PAIR
-export SAVE_DIR=data/$PAIR
-export BS=8
-export NUM_BEAMS=15
-mkdir -p $DATA_DIR
-sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
-sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
-echo $PAIR
-PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
-```
-note: fairseq reports using a beam of 50, so you should get a slightly higher score if re-run with `--num_beams 50`.
-
-## Data Sources
-
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
-
-
-### BibTeX entry and citation info
-
-```bibtex
-@inproceedings{{...,
-  year={{2020}},
-  title={{Facebook FAIR's WMT19 News Translation Task Submission}},
-  author={{Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}},
-  booktitle={{Proc. of WMT}},
-}}
-```
-
-
-## TODO
-
- port model ensemble (fairseq uses 4 model checkpoints)
-
-"""
-    os.makedirs(model_card_dir, exist_ok=True)
-    path = os.path.join(model_card_dir, "README.md")
-    print(f"Generating {path}")
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(readme)
-
-# make sure we are under the root of the project
-repo_dir = Path(__file__).resolve().parent.parent.parent
-model_cards_dir = repo_dir / "model_cards"
-
-for model_name in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
-    base, src_lang, tgt_lang = model_name.split("-")
-    model_card_dir = model_cards_dir / "facebook" / model_name
-    write_model_card(model_card_dir, src_lang=src_lang, tgt_lang=tgt_lang)
--- a/scripts/fsmt/s3-move.sh
+++ b/scripts/fsmt/s3-move.sh
@ -1,116 +0,0 @@
-
-# this is the process of uploading the updated models to s3. As I can't upload them directly to the correct orgs, this script shows how this is done
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-1. upload updated models to my account
-
-transformers-cli upload -y wmt19-ru-en
-transformers-cli upload -y wmt19-en-ru
-transformers-cli upload -y wmt19-de-en
-transformers-cli upload -y wmt19-en-de
-transformers-cli upload -y wmt19-de-en-6-6-base
-transformers-cli upload -y wmt19-de-en-6-6-big
-transformers-cli upload -y wmt16-en-de-dist-12-1
-transformers-cli upload -y wmt16-en-de-dist-6-1
-transformers-cli upload -y wmt16-en-de-12-1
-
-
-2. ask someone to move them to:
-
-* to facebook: "wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"
-* to allenai: "wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1", "wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"
-
-export b="s3://models.huggingface.co/bert"
-stas_to_fb () {
-	src=$1
-	shift
-	aws s3 sync $b/stas/$src $b/facebook/$src $@
-}
-
-stas_to_allenai () {
-	src=$1
-	shift
-	aws s3 sync $b/stas/$src $b/allenai/$src $@
-}
-
-stas_to_fb wmt19-en-ru
-stas_to_fb wmt19-ru-en
-stas_to_fb wmt19-en-de
-stas_to_fb wmt19-de-en
-
-stas_to_allenai wmt16-en-de-dist-12-1
-stas_to_allenai wmt16-en-de-dist-6-1
-stas_to_allenai wmt16-en-de-6-1
-stas_to_allenai wmt16-en-de-12-1
-stas_to_allenai wmt19-de-en-6-6-base
-stas_to_allenai wmt19-de-en-6-6-big
-
-
-3. and then remove all these model files from my account
-
-transformers-cli s3 rm wmt16-en-de-12-1/config.json
-transformers-cli s3 rm wmt16-en-de-12-1/merges.txt
-transformers-cli s3 rm wmt16-en-de-12-1/pytorch_model.bin
-transformers-cli s3 rm wmt16-en-de-12-1/tokenizer_config.json
-transformers-cli s3 rm wmt16-en-de-12-1/vocab-src.json
-transformers-cli s3 rm wmt16-en-de-12-1/vocab-tgt.json
-transformers-cli s3 rm wmt16-en-de-dist-12-1/config.json
-transformers-cli s3 rm wmt16-en-de-dist-12-1/merges.txt
-transformers-cli s3 rm wmt16-en-de-dist-12-1/pytorch_model.bin
-transformers-cli s3 rm wmt16-en-de-dist-12-1/tokenizer_config.json
-transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-src.json
-transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-tgt.json
-transformers-cli s3 rm wmt16-en-de-dist-6-1/config.json
-transformers-cli s3 rm wmt16-en-de-dist-6-1/merges.txt
-transformers-cli s3 rm wmt16-en-de-dist-6-1/pytorch_model.bin
-transformers-cli s3 rm wmt16-en-de-dist-6-1/tokenizer_config.json
-transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-src.json
-transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-tgt.json
-transformers-cli s3 rm wmt19-de-en-6-6-base/config.json
-transformers-cli s3 rm wmt19-de-en-6-6-base/merges.txt
-transformers-cli s3 rm wmt19-de-en-6-6-base/pytorch_model.bin
-transformers-cli s3 rm wmt19-de-en-6-6-base/tokenizer_config.json
-transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-src.json
-transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-tgt.json
-transformers-cli s3 rm wmt19-de-en-6-6-big/config.json
-transformers-cli s3 rm wmt19-de-en-6-6-big/merges.txt
-transformers-cli s3 rm wmt19-de-en-6-6-big/pytorch_model.bin
-transformers-cli s3 rm wmt19-de-en-6-6-big/tokenizer_config.json
-transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-src.json
-transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-tgt.json
-transformers-cli s3 rm wmt19-de-en/config.json
-transformers-cli s3 rm wmt19-de-en/merges.txt
-transformers-cli s3 rm wmt19-de-en/pytorch_model.bin
-transformers-cli s3 rm wmt19-de-en/tokenizer_config.json
-transformers-cli s3 rm wmt19-de-en/vocab-src.json
-transformers-cli s3 rm wmt19-de-en/vocab-tgt.json
-transformers-cli s3 rm wmt19-en-de/config.json
-transformers-cli s3 rm wmt19-en-de/merges.txt
-transformers-cli s3 rm wmt19-en-de/pytorch_model.bin
-transformers-cli s3 rm wmt19-en-de/tokenizer_config.json
-transformers-cli s3 rm wmt19-en-de/vocab-src.json
-transformers-cli s3 rm wmt19-en-de/vocab-tgt.json
-transformers-cli s3 rm wmt19-en-ru/config.json
-transformers-cli s3 rm wmt19-en-ru/merges.txt
-transformers-cli s3 rm wmt19-en-ru/pytorch_model.bin
-transformers-cli s3 rm wmt19-en-ru/tokenizer_config.json
-transformers-cli s3 rm wmt19-en-ru/vocab-src.json
-transformers-cli s3 rm wmt19-en-ru/vocab-tgt.json
-transformers-cli s3 rm wmt19-ru-en/config.json
-transformers-cli s3 rm wmt19-ru-en/merges.txt
-transformers-cli s3 rm wmt19-ru-en/pytorch_model.bin
-transformers-cli s3 rm wmt19-ru-en/tokenizer_config.json
-transformers-cli s3 rm wmt19-ru-en/vocab-src.json
-transformers-cli s3 rm wmt19-ru-en/vocab-tgt.json
--- a/scripts/pegasus/build_test_sample_spm_no_bos.py
+++ b/scripts/pegasus/build_test_sample_spm_no_bos.py
@ -1,34 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus
-
-# 1. pip install sentencepiece
-#
-# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
-
-# 3. build
-import sentencepiece as spm
-
-
-# pegasus:
-# 1. no bos
-# 2. eos_id is 1
-# 3. unk_id is 2
-# build a sample spm file accordingly
-spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2  --eos_id=1  --vocab_size=1000')
-
-# 4. now update the fixture
-# mv test_sentencepiece_no_bos.model ../../tests/fixtures/
--- a/scripts/tatoeba/README.md
+++ b/scripts/tatoeba/README.md
@ -1,72 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-Set up transformers following the instructions in README.md (I would fork first).
-```bash
-git clone git@github.com:huggingface/transformers.git
-cd transformers
-pip install -e .
-pip install pandas GitPython wget
-```
-
-Get required metadata
-```bash
-curl https://cdn-datasets.huggingface.co/language_codes/language-codes-3b2.csv  > language-codes-3b2.csv
-curl https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv > iso-639-3.csv
-```
-
-Install Tatoeba-Challenge repo inside transformers
-```bash
-git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git
-```
-
-To convert a few models, call the conversion script from command line:
-```bash
-python src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py --models heb-eng eng-heb --save_dir converted
-```
-
-To convert lots of models you can pass your list of Tatoeba model names to `resolver.convert_models` in a python client or script.
-
-```python
-from transformers.convert_marian_tatoeba_to_pytorch import TatoebaConverter
-resolver = TatoebaConverter(save_dir='converted')
-resolver.convert_models(['heb-eng', 'eng-heb'])
-```
-
-
-### Upload converted models
-Since version v3.5.0, the model sharing workflow has switched to a git-based system. Refer to [model sharing doc](https://huggingface.co/transformers/main/model_sharing.html#model-sharing-and-uploading) for more details.
-
-To upload all converted models, 
-
-1. Install [git-lfs](https://git-lfs.github.com/).
-
-2. Login to `huggingface-cli`
-
-```bash
-huggingface-cli login
-```
-
-3. Run the `upload_models` script
-
-```bash
-./scripts/tatoeba/upload_models.sh
-```
-
-
-### Modifications
- To change naming logic, change the code near `os.rename`. The model card creation code may also need to change.
- To change model card content, you must modify `TatoebaCodeResolver.write_model_card`
--- a/scripts/tatoeba/upload_models.sh
+++ b/scripts/tatoeba/upload_models.sh
@ -1,12 +0,0 @@
-#!/bin/bash
-
-for FILE in converted/*; do 
-  model_name=`basename $FILE`
-  huggingface-cli repo create $model_name -y
-  git clone https://huggingface.co/Helsinki-NLP/$model_name
-  mv $FILE/* $model_name/
-  cd $model_name
-  git add . && git commit -m "initial commit" 
-  git push
-  cd ..
-done
--- a/setup.py
+++ b/setup.py
@ -163,6 +163,9 @@ _deps = [
    "rjieba",
    "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
    "ruff==0.11.2",
+    # `sacrebleu` is not used in `transformers` itself. However, it is needed in several tests, when a test calls
+    # `evaluate.load("sacrebleu")`. This metric is used in the examples that we use to test the `Trainer` with, in the
+    # `Trainer` tests (see references to `run_translation.py`).
    "sacrebleu>=1.4.12,<2.0.0",
    "sacremoses",
    "safetensors>=0.4.3",
@ -344,7 +347,6 @@ extras["testing"] = (
        "evaluate",
        "pytest-timeout",
        "ruff",
-        "sacrebleu",
        "rouge-score",
        "nltk",
        "GitPython",
@ -354,6 +356,7 @@ extras["testing"] = (
        "tensorboard",
        "pydantic",
        "sentencepiece",
+        "sacrebleu",  # needed in trainer tests, see references to `run_translation.py`
    )
    + extras["retrieval"]
    + extras["modelcreation"]
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -259,6 +259,7 @@ _import_structure = {
    ],
    "utils.quantization_config": [
        "AqlmConfig",
+        "AutoRoundConfig",
        "AwqConfig",
        "BitNetConfig",
        "BitsAndBytesConfig",
@ -344,7 +345,6 @@ except OptionalDependencyNotAvailable:
    _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
 else:
    _import_structure["model_debugging_utils"] = [
-        "model_addition_debugger",
        "model_addition_debugger_context",
    ]
    _import_structure["activations"] = []
@ -439,6 +439,7 @@ else:
    ]

    _import_structure["modeling_flash_attention_utils"] = []
+    _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
    _import_structure["modeling_outputs"] = []
    _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"]
    _import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
@ -754,6 +755,7 @@ if TYPE_CHECKING:
    # bitsandbytes config
    from .utils.quantization_config import (
        AqlmConfig,
+        AutoRoundConfig,
        AwqConfig,
        BitNetConfig,
        BitsAndBytesConfig,
@ -910,9 +912,9 @@ if TYPE_CHECKING:
            convert_and_export_with_cache,
        )
        from .model_debugging_utils import (
-            model_addition_debugger,
            model_addition_debugger_context,
        )
+        from .modeling_layers import GradientCheckpointingLayer
        from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
        from .modeling_utils import AttentionInterface, PreTrainedModel

--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@ -1654,9 +1654,7 @@ class HybridCache(Cache):
        ```
    """

-    # TODO (joao): dive deeper into gemma2 and paligemma -- there are reports of speed loss with compilation. Revert
-    # ALL changes from the PR that commented the line below when reactivating it.
-    # is_compileable = True
+    is_compileable = True

    def __init__(
        self,
@ -1858,8 +1856,6 @@ class HybridChunkedCache(Cache):
        ```
    """

-    # TODO (joao): dive deeper into gemma2 and paligemma -- there are reports of speed loss with compilation. Revert
-    # ALL changes from the PR that commented the line below when reactivating it.
    is_compileable = True

    def __init__(
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@ -292,7 +292,8 @@ class TemperatureLogitsWarper(LogitsProcessor):
 class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    [`LogitsProcessor`] that prevents the repetition of previous tokens through a penalty. This penalty is applied at
-    most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt.
+    most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt
+    by default.

    In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a penalty of around
    1.2 to achieve a good balance between truthful generation and lack of repetition. To penalize and reduce
@ -303,11 +304,13 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
        penalty (`float`):
            The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated
            tokens. Between 0.0 and 1.0 rewards previously generated tokens.
+        prompt_ignore_length (`int`, *optional*):
+            The length of the original prompt, in tokens. If provided, the first `prompt_ignore_length` input ids
+            are excluded from the penalty calculation.

    Examples:

    ```py
-    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, RepetitionPenaltyLogitsProcessor

    >>> # Initializing the model and tokenizer for it
    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
@ -323,17 +326,36 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    >>> penalized_ids = model.generate(**inputs, repetition_penalty=1.1)
    >>> print(tokenizer.batch_decode(penalized_ids, skip_special_tokens=True)[0])
    I'm not going to be able to do that. I'll just have to go out and play
+
+    >>> # We can also exclude the input prompt by creating an instance of this class
+    >>> # with a `prompt_ignore_length` and passing it as a custom logit processor
+    >>> rep_pen_processor = RepetitionPenaltyLogitsProcessor(
+    ...     penalty=1.1,
+    ...     prompt_ignore_length=inputs["input_ids"].shape[-1]
+    ... )
+    >>> penalized_ids = model.generate(**inputs, logits_processor=[rep_pen_processor])
+    >>> print(tokenizer.batch_decode(penalized_ids, skip_special_tokens=True)[0])
+    I'm not going to be able to do that. I'm going to have to go through a lot of things, and
    ```
    """

-    def __init__(self, penalty: float):
+    def __init__(self, penalty: float, prompt_ignore_length: Optional[int] = None):
        if not isinstance(penalty, float) or not (penalty > 0):
            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")

+        if prompt_ignore_length is not None and (
+            not isinstance(prompt_ignore_length, int) or prompt_ignore_length < 0
+        ):
+            raise ValueError(f"`prompt_ignore_length` has to be a positive integer, but is {prompt_ignore_length}")
+
        self.penalty = penalty
+        self.prompt_ignore_length = prompt_ignore_length

    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if self.prompt_ignore_length:
+            input_ids = input_ids[:, self.prompt_ignore_length :]
+
        score = torch.gather(scores, 1, input_ids)

        # if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@ -563,17 +563,17 @@ class GenerationMixin:
                device = model_inputs[input_ids_key].device

            # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
-            # the 4D causal mask exists, it should be present in the base model (XXXModel class).
-            base_model = getattr(self, self.base_model_prefix, None)
-            if base_model is None:
+            # the 4D causal mask exists, it should be present in the base model (XXXModel class) or in its decoder.
+            base_model = getattr(self, self.base_model_prefix, self)
+            decoder = base_model.get_decoder() if hasattr(base_model, "get_decoder") else None
+            causal_mask_creation_function = getattr(
+                base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
+            )
+            if causal_mask_creation_function is None and decoder is not None:  # it may be in the decoder
                causal_mask_creation_function = getattr(
-                    self, "_prepare_4d_causal_attention_mask_with_cache_position", None
+                    decoder, "_prepare_4d_causal_attention_mask_with_cache_position", None
                )
-            else:
-                causal_mask_creation_function = getattr(
-                    base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
-                )
-            if causal_mask_creation_function is None:
+            if causal_mask_creation_function is None:  # can't be found
                logger.warning_once(
                    f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
                    "defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@ -18,7 +18,7 @@ from collections.abc import Iterable
 from contextlib import redirect_stdout
 from dataclasses import dataclass
 from io import BytesIO
-from typing import TYPE_CHECKING, Callable, Optional, Union
+from typing import Callable, Optional, Union
 from urllib.parse import urlparse

 import numpy as np
@ -77,9 +77,8 @@ if is_vision_available():
        pil_torch_interpolation_mapping = {}


-if TYPE_CHECKING:
-    if is_torch_available():
-        import torch
+if is_torch_available():
+    import torch


 logger = logging.get_logger(__name__)
@ -162,6 +161,15 @@ def is_valid_list_of_images(images: list):
    return images and all(is_valid_image(image) for image in images)


+def concatenate_list(input_list):
+    if isinstance(input_list[0], list):
+        return [item for sublist in input_list for item in sublist]
+    elif isinstance(input_list[0], np.ndarray):
+        return np.concatenate(input_list, axis=0)
+    elif isinstance(input_list[0], torch.Tensor):
+        return torch.cat(input_list, dim=0)
+
+
 def valid_images(imgs):
    # If we have an list of images, make sure every image is valid
    if isinstance(imgs, (list, tuple)):
--- a/src/transformers/integrations/hub_kernels.py
+++ b/src/transformers/integrations/hub_kernels.py
@ -13,6 +13,8 @@
 # limitations under the License.
 from typing import Dict, Union

+from ..utils import is_torchdynamo_compiling
+

 try:
    from kernels import (
@ -20,7 +22,9 @@ try:
        LayerRepository,
        register_kernel_mapping,
        replace_kernel_forward_from_hub,
-        use_kernel_forward_from_hub,
+    )
+    from kernels import (
+        use_kernel_forward_from_hub as original_use_kernel_forward_from_hub,
    )

    _hub_kernels_available = True
@ -56,6 +60,40 @@ try:

    register_kernel_mapping(_KERNEL_MAPPING)

+    def use_kernel_forward_from_hub(*args, **kwargs):
+        """
+        Expands `kernels`' `use_kernel_forward_from_hub` to NOT use a kernel at compile time. This should be removed
+        when `kernels` supports `torch.compile`.
+
+        If the layer has a `config` attribute, we can also set `config.disable_custom_kernels = True` to disable the
+        kernel.
+        """
+
+        def decorator_with_compile_path(cls):
+            # Keeps a reference to the original forward method
+            original_forward = cls.forward
+
+            # Applies the original decorator
+            decorator = original_use_kernel_forward_from_hub(*args, **kwargs)
+            cls = decorator(cls)
+
+            # Replaces the kernel forward with a compile-friendly version
+            kernel_forward = cls.forward
+
+            def forward_with_compile_path(*forward_args, **forward_kwargs):
+                disable_custom_kernels = hasattr(cls, "config") and getattr(cls.config, "disable_custom_kernels", None)
+                if is_torchdynamo_compiling() or disable_custom_kernels:
+                    return original_forward(*forward_args, **forward_kwargs)
+                else:
+                    return kernel_forward(*forward_args, **forward_kwargs)
+
+            cls.forward = forward_with_compile_path
+
+            return cls
+
+        return decorator_with_compile_path
+
+
 except ImportError:
    # Stub to make decorators in transformers work when `kernels`
    # is not installed.
--- a/src/transformers/integrations/npu_flash_attention.py
+++ b/src/transformers/integrations/npu_flash_attention.py
@ -19,6 +19,8 @@ from ..utils.import_utils import is_torch_npu_available


 if is_torch_npu_available():
+    import math
+
    import torch_npu
    from einops import rearrange, repeat

@ -162,6 +164,9 @@ def npu_flash_attn_func(
 ):
    keep_prob = 1.0 - dropout_p

+    if softmax_scale is None:
+        softmax_scale = 1.0 / math.sqrt(q.shape[-1])
+
    if not causal:
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(q, k, v, head_num, "BSND", keep_prob=keep_prob, scale=softmax_scale)[0]
@ -189,6 +194,8 @@ def npu_flash_attn_varlen_func(
    v,
    cu_seqlens_q,
    cu_seqlens_k,
+    max_seqlen_q=None,  # defined for aligning params order with corresponding function in `flash-attn`
+    max_seqlen_k=None,  # defined for aligning params order with corresponding function in `flash-attn`
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
@ -196,6 +203,9 @@ def npu_flash_attn_varlen_func(
 ):
    keep_prob = 1.0 - dropout_p

+    if softmax_scale is None:
+        softmax_scale = 1.0 / math.sqrt(q.shape[-1])
+
    if not causal:
        head_num = q.shape[1]
        output = torch_npu.npu_fusion_attention(
--- a/src/transformers/model_debugging_utils.py
+++ b/src/transformers/model_debugging_utils.py
@ -17,7 +17,8 @@ import functools
 import json
 import os
 import re
-from contextlib import contextmanager
+from contextlib import contextmanager, redirect_stdout
+from io import StringIO
 from typing import Optional

 from transformers.utils.import_utils import requires
@ -28,9 +29,7 @@ from .utils import is_torch_available
 if is_torch_available():
    import torch
    import torch.distributed.tensor
-    from torch import nn

-    from .modeling_utils import PreTrainedModel

 from .utils import logging

@ -87,21 +86,64 @@ def _serialize_io(value):

    if hasattr(value, "_local_tensor"):
        # DTensor-like handling, just use local tensor attribute
-        return {
+        torch.set_printoptions(sci_mode=True)
+        val_repr = _repr_to_list(value)
+        out = {
            "shape": repr(value._local_tensor.shape),
            "dtype": repr(value._local_tensor.dtype),
-            "value": _sanitize_repr_for_diff(repr(value)),
+            "value": val_repr,
        }
+        if value._local_tensor.dtype in {torch.float16, torch.float32, torch.bfloat16}:
+            value = value._local_tensor.clone()
+            out.update(
+                {
+                    "mean": _sanitize_repr_for_diff(repr(value.mean())),
+                    "std": _sanitize_repr_for_diff(repr(value.std())),
+                    "min": _sanitize_repr_for_diff(repr(value.min())),
+                    "max": _sanitize_repr_for_diff(repr(value.max())),
+                }
+            )
+        return out

    if isinstance(value, torch.Tensor):
-        # standard PyTorch Tensor
-        # return also the shape of such
-        return {"shape": repr(value.shape), "dtype": repr(value.dtype), "value": _sanitize_repr_for_diff(repr(value))}
+        torch.set_printoptions(sci_mode=True)
+        val_repr = _repr_to_list(value)
+        out = {
+            "shape": repr(value.shape),
+            "dtype": repr(value.dtype),
+            "value": val_repr,
+        }
+        if value.dtype in {torch.float16, torch.float32, torch.bfloat16}:
+            out.update(
+                {
+                    "mean": _sanitize_repr_for_diff(repr(value.mean())),
+                    "std": _sanitize_repr_for_diff(repr(value.std())),
+                    "min": _sanitize_repr_for_diff(repr(value.min())),
+                    "max": _sanitize_repr_for_diff(repr(value.max())),
+                }
+            )
+        return out

-    # fallback for everything else (bool, int, float, None, or custom class)
    return _sanitize_repr_for_diff(repr(value))


+def _repr_to_list(value: torch.Tensor):
+    """
+    Converts a tensor into its sanitized printed representation, split into a list of lines.
+
+    Args:
+        value (`torch.Tensor`): The tensor to represent.
+
+    Returns:
+        `List[str]`: List of string lines representing the tensor.
+    """
+    torch.set_printoptions(sci_mode=True, linewidth=120)
+    with StringIO() as buf, redirect_stdout(buf):
+        print(value)  # to redirected stdout to avoid line splits
+        raw = buf.getvalue()
+    return _sanitize_repr_for_diff(raw).splitlines()
+
+
 def prune_outputs_if_children(node):
    # if there are children, remove this node's "outputs"
    # so we only see outputs at the leaf level
@ -111,22 +153,106 @@ def prune_outputs_if_children(node):
            prune_outputs_if_children(child)


+LAYER_SUFFIX_RE = re.compile(r"(.*)\.(\d+)$")  # should be generic enough, ends with a number
+
+
+def is_layer_block(node):
+    """
+    Checks whether a node represents a layer block with submodules.
+
+    Args:
+        node (`dict`): A node from the call tree.
+
+    Returns:
+        `bool`: Whether the node is a layer block.
+    """
+    match = LAYER_SUFFIX_RE.match(node.get("module_path", ""))
+    if not match or not node.get("children"):
+        return False
+    number = match.group(2)
+    return any(f".{number}." in child.get("module_path", "") for child in node["children"])
+
+
+def prune_intermediate_layers(node):
+    """
+    Recursively removes intermediate layers from the tree to improve readability.
+    Keeps at least the first and last layers if many consecutive layers are present.
+
+    Args:
+        node (`dict`): The root or subnode to prune recursively.
+    """
+    if not node.get("children"):
+        return
+    layer_blocks = [(i, child) for i, child in enumerate(node["children"]) if is_layer_block(child)]
+
+    if len(layer_blocks) > 2:
+        to_remove = [i for i, _ in layer_blocks[1:-1]]
+        node["children"] = [child for i, child in enumerate(node["children"]) if i not in to_remove]
+
+    for child in node["children"]:
+        prune_intermediate_layers(child)
+
+
 def log_model_debug_trace(debug_path, model):
    if debug_path:
        try:
-            os.makedirs(debug_path, exist_ok=False)
-            output_path = os.path.join(debug_path, model._debugger_module_dump_name + "_debug_tree.json")
+            os.makedirs(debug_path, exist_ok=True)
+            base = os.path.join(debug_path, model._debugger_module_dump_name + "_debug_tree")
        except Exception as e:
            raise ValueError(f"Unexpected or existing debug_path={debug_path}. {e}")
    else:
-        output_path = model._debugger_module_dump_name + "_debug_tree.json"
-    logger.info(f"Writing model trace at {output_path}")
-    with open(output_path, "w") as outfile:
-        prune_outputs_if_children(model._call_tree)
-        json.dump(model._call_tree, outfile, indent=2)
+        base = model._debugger_module_dump_name + "_debug_tree"
+
+    logger.info(f"Writing model trace at {base}.json")
+    full_path = base + "_FULL_TENSORS.json"
+    summary_path = base + "_SUMMARY.json"
+
+    prune_outputs_if_children(model._call_tree)
+
+    with open(full_path, "w") as f:
+        json.dump(model._call_tree, f, indent=2)
+
+    # summary-only version for readability - traversing the tree again #TODO optimize?
+    def strip_values(node):
+        def clean(val):
+            if isinstance(val, dict):
+                val.pop("value", None)
+                for v in val.values():
+                    clean(v)
+            elif isinstance(val, list):
+                for item in val:
+                    clean(item)
+
+        clean(node.get("inputs", {}))
+        clean(node.get("outputs", {}))
+
+        for child in node.get("children", []):
+            strip_values(child)
+
+    tree_copy = json.loads(json.dumps(model._call_tree))  # deep copy
+    strip_values(tree_copy)
+
+    with open(summary_path, "w") as f:
+        json.dump(tree_copy, f, indent=2)


-def _attach_debugger_logic(model, class_name, debug_path: str):
+def _attach_debugger_logic(
+    model,
+    debug_path: Optional[str] = ".",
+    do_prune_layers: Optional[bool] = True,
+):
+    """
+    Attaches a debugging wrapper to every module in the model.
+
+    This records structured inputs and outputs during the forward pass into a call tree.
+
+    Args:
+        model (`PreTrainedModel`, `nn.Module`): Model to wrap.
+        debug_path (`str`, *optional*, defaults to `"."`): Directory in which to dump the debug JSON files.
+        do_prune_layers (`bool`, *optional*, defaults to `True`): Whether to prune intermediate layers.
+    """
+    class_name = model.__class__.__name__
+
    # Prepare data structures on the model object
    model._call_tree = {"module_path": class_name, "inputs": None, "outputs": None, "children": []}
    model._debugger_model_call_stack = []
@ -147,7 +273,7 @@ def _attach_debugger_logic(model, class_name, debug_path: str):
                    "children": [],
                }
                model._debugger_model_call_stack.append(node)
-            with torch.inference_mode():
+            with torch.no_grad():
                out = orig_forward(*inps, **kws)

            if _is_rank_zero():
@ -188,7 +314,6 @@ def _attach_debugger_logic(model, class_name, debug_path: str):
            model._debugger_model_call_stack.append(top_node)

        out = real_top_forward(*inps, **kws)
-
        if _is_rank_zero() and model._debugger_model_call_stack:
            top_node["outputs"] = _serialize_io(out)
            finished = model._debugger_model_call_stack.pop()
@ -198,98 +323,24 @@ def _attach_debugger_logic(model, class_name, debug_path: str):
            # prune empty stuff for visibility
            [model._call_tree.pop(k, None) for k in list(model._call_tree.keys()) if not model._call_tree[k]]

+            # prune layers that are not 0 or last
+            if do_prune_layers:
+                prune_intermediate_layers(model._call_tree)
+            # Write final JSON trace here
+            log_model_debug_trace(debug_path=debug_path, model=model)
        return out

    model.forward = top_wrapped_forward

-    # Final hook for writing JSON on forward-end
-    def final_hook(_, inputs, outputs):
-        if _is_rank_zero() and model._debugger_model_call_stack:
-            finished = model._debugger_model_call_stack.pop()
-            model._call_tree["inputs"] = finished["inputs"]
-            model._call_tree["outputs"] = finished["outputs"]
-            model._call_tree["children"] = finished["children"]
-
-        if _is_rank_zero():
-            log_model_debug_trace(debug_path=debug_path, model=model)
-
-    model.register_forward_hook(final_hook)
-    # Optionally also for a couple possible hooks that have specific names. It should be just one.
-    # This means modules that are not typically called "forward" within the model. But we should not need to recurse
-    # through them.
-    possible_model_calls = ["language_model", "model"]
-    for model_call in possible_model_calls:
-        this_model_call = getattr(model, model_call, None)
-        if this_model_call and isinstance(this_model_call, (nn.Module, PreTrainedModel)):
-            this_model_call.register_forward_hook(final_hook)
-            break  # exit the loop after finding one (unsure, but should be just one call.)
-
-
-@requires(backends=("torch",))
-def model_addition_debugger(cls):
-    """
-    # Model addition debugger - a model adder tracer
-    This decorator is a power user tool intended for model adders.
-    It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
-    To note, this decorator enforces `torch.inference_mode()`.
-    ## Usage
-
-    add decorator to your model class
-    ```python
-    from ...modeling_utils import model_addition_debugger
-
-    @model_addition_debugger
-    class MyModel(nn.Module) # Can inherit from PreTrainedModel too
-        # ... nothing else changes
-    ```
-    Then, in a separate script (example is for Llava)
-
-    ```python
-    import torch
-    from PIL import Image
-    import requests
-    from transformers import LlavaProcessor, LlavaForConditionalGeneration
-    torch.random.manual_seed(673)
-
-    # load pretrained model and processor
-    model_id = "llava-hf/llava-1.5-7b-hf"
-    processor = LlavaProcessor.from_pretrained(model_id)
-    model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
-
-    # create random image input
-    random_image = Image.fromarray(torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8).numpy())
-
-    # prompt
-    prompt = "<image>Describe this image."
-
-    # process inputs
-    inputs = processor(text=prompt, images=random_image, return_tensors="pt")
-
-    # call forward method (not .generate!)
-    with torch.no_grad():
-        output = model.forward(**inputs)
-    ```
-
-    """
-    orig_init = cls.__init__
-
-    @functools.wraps(cls.__init__)
-    def wrapped_init(self, *args, **kwargs):
-        orig_init(self, *args, **kwargs)
-        _attach_debugger_logic(self, cls.__name__)
-
-    cls.__init__ = wrapped_init
-    return cls
-

@requires(backends=("torch",))
@contextmanager
-def model_addition_debugger_context(model, debug_path: Optional[str] = None):
+def model_addition_debugger_context(model, debug_path: Optional[str] = None, do_prune_layers: Optional[bool] = True):
    """
    # Model addition debugger - context manager for model adders
    This context manager is a power user tool intended for model adders.
    It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
-    To note, this context manager enforces `torch.inference_mode()`.
+    To note, this context manager enforces `torch.no_grad()`.

    ## Usage

@ -300,6 +351,7 @@ def model_addition_debugger_context(model, debug_path: Optional[str] = None):
    from PIL import Image
    import requests
    from transformers import LlavaProcessor, LlavaForConditionalGeneration
+    from transformers.model_debugging_utils import model_addition_debugger_context
    torch.random.manual_seed(673)

    # load pretrained model and processor
@ -317,13 +369,16 @@ def model_addition_debugger_context(model, debug_path: Optional[str] = None):
    inputs = processor(text=prompt, images=random_image, return_tensors="pt")

    # call forward method (not .generate!)
-    with model_addition_debugger_context(model):
+    with model_addition_debugger_context(model, debug_path="Your_debug_path", do_prune_layers=False):
        output = model.forward(**inputs)
    ```

    """
-    _attach_debugger_logic(model, model.__class__.__name__, debug_path)
+    orig_forwards = {m: m.forward for _, m in model.named_modules()}
+    orig_forwards[model] = model.forward
+    _attach_debugger_logic(model, debug_path, do_prune_layers)
    try:
        yield model
    finally:
-        pass
+        for module_instance, forward_method in orig_forwards.items():
+            module_instance.forward = forward_method
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@ -258,6 +258,8 @@ TENSOR_PROCESSORS = {


 def read_field(reader, field):
+    if field not in reader.fields:
+        return []
    value = reader.fields[field]
    return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]

@ -369,6 +371,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
    parsed_parameters = {k: {} for k in GGUF_TO_TRANSFORMERS_MAPPING}

    architecture = read_field(reader, "general.architecture")[0]
+    # NOTE: Some GGUF checkpoints lack the `general.name` field in their metadata; `read_field` then returns `[]`
    model_name = read_field(reader, "general.name")

    updated_architecture = None
--- a/src/transformers/modeling_layers.py
+++ b/src/transformers/modeling_layers.py
@ -0,0 +1,48 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+
+import torch.nn as nn
+
+
+class GradientCheckpointingLayer(nn.Module):
+    """Base class for layers with gradient checkpointing.
+
+    This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is disabled
+    (`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient checkpointing is
+    enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to `_gradient_checkpointing_func`.
+
+    Important:
+
+        When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden states)
+        must be passed as positional arguments (`*args`) rather than keyword arguments to properly propagate gradients.
+
+        Example:
+
+            ```python
+            >>> # Correct - hidden_states passed as positional arg
+            >>> out = self.layer(hidden_states, attention_mask=attention_mask)
+
+            >>> # Incorrect - hidden_states passed as keyword arg
+            >>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask)
+            ```
+    """
+
+    gradient_checkpointing = False
+
+    def __call__(self, *args, **kwargs):
+        if self.gradient_checkpointing and self.training:
+            return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
+        return super().__call__(*args, **kwargs)
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@ -4167,15 +4167,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
            _adapter_model_path = None

        # Potentially detect context manager or global device, and use it (only if no device_map was provided)
-        if device_map is None:
+        if device_map is None and not is_deepspeed_zero3_enabled():
            device_in_context = get_torch_context_manager_or_global_device()
            if device_in_context == torch.device("meta"):
-                raise ValueError(
-                    (
-                        "`from_pretrained` is not compatible with a meta device context manager or `torch.set_default_device('meta')` "
-                        "as its purpose is to load weights. If you want to initialize a model on the meta device, use the context manager "
-                        "or global device with `from_config`, or `ModelClass(config)`"
-                    )
+                # TODO Cyril: raise an error instead of the warning in v4.53 (and change the test to check for raise instead of success)
+                logger.warning(
+                    "We detected that you are using `from_pretrained` with a meta device context manager or `torch.set_default_device('meta')`\n"
+                    "This is an anti-pattern and will raise an Error in version v4.53\nIf you want to initialize a model on the meta device, use "
+                    "the context manager or global device with `from_config`, or `ModelClass(config)`"
                )
            device_map = device_in_context

@ -4867,7 +4866,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
        # Warmup cuda to load the weights much faster on devices
        if device_map is not None and not is_hqq_or_quark:
            expanded_device_map = expand_device_map(device_map, expected_keys)
-            caching_allocator_warmup(model_to_load, expanded_device_map, factor=2 if hf_quantizer is None else 4)
+            caching_allocator_warmup(model_to_load, expanded_device_map, hf_quantizer)

        error_msgs = []
        # Iterate on all the shards to load the weights
@ -5385,6 +5384,10 @@ class PoolerStartLogits(nn.Module):
    def __init__(self, config: PretrainedConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, 1)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerStartLogits` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerStartLogits`."
+        )

    def forward(
        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
@ -5427,6 +5430,10 @@ class PoolerEndLogits(nn.Module):
        self.activation = nn.Tanh()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dense_1 = nn.Linear(config.hidden_size, 1)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerEndLogits` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerEndLogits`."
+        )

    def forward(
        self,
@ -5494,6 +5501,10 @@ class PoolerAnswerClass(nn.Module):
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerAnswerClass` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerAnswerClass`."
+        )

    def forward(
        self,
@ -5575,6 +5586,12 @@ class SquadHeadOutput(ModelOutput):
    end_top_index: Optional[torch.LongTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None

+    def __post_init__(self):
+        logger.warning_once(
+            "[DEPRECATION WARNING] `SquadHeadOutput` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSquadHeadOutput`."
+        )
+

 class SQuADHead(nn.Module):
    r"""
@ -5595,6 +5612,11 @@ class SQuADHead(nn.Module):
        self.end_logits = PoolerEndLogits(config)
        self.answer_class = PoolerAnswerClass(config)

+        logger.warning_once(
+            "[DEPRECATION WARNING] `SQuADHead` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSQuADHead`."
+        )
+
    @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
    def forward(
        self,
@ -5748,6 +5770,11 @@ class SequenceSummary(nn.Module):
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

+        logger.warning_once(
+            "[DEPRECATION WARNING] `SequenceSummary` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSequenceSummary`."
+        )
+
    def forward(
        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
    ) -> torch.FloatTensor:
@ -5834,7 +5861,17 @@ def expand_device_map(device_map, param_names):
    return new_device_map


-def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, factor=2):
+def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
+    """Check if the device is an accelerator. We need this function, as device_map can be "disk" as well, which is not
+    a proper `torch.device`.
+    """
+    if device == "disk":
+        return False
+    else:
+        return torch.device(device).type not in ["meta", "cpu"]
+
+
+def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]):
    """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
    device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
    the model, which is actually the loading speed botteneck.
@ -5853,9 +5890,11 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
    - Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices.
    However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end.
    """
-    # Remove disk and cpu devices, and cast to proper torch.device
+    factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor()
+
+    # Remove disk, cpu and meta devices, and cast to proper torch.device
    accelerator_device_map = {
-        param: torch.device(device) for param, device in expanded_device_map.items() if device not in ["cpu", "disk"]
+        param: torch.device(device) for param, device in expanded_device_map.items() if is_accelerator_device(device)
    }
    if not len(accelerator_device_map):
        return
@ -5889,7 +5928,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
            # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
            # Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much
            # if using e.g. 90% of device size, while a 140GiB device would allocate too little
-            byte_count = min(byte_count, int(device_memory - 1.2 * 1024**3))
+            byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3)))
        # Allocate memory
        _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False)

--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@ -144,6 +144,7 @@ if TYPE_CHECKING:
    from .informer import *
    from .instructblip import *
    from .instructblipvideo import *
+    from .internvl import *
    from .jamba import *
    from .janus import *
    from .jetmoe import *
--- a/src/transformers/models/aria/configuration_aria.py
+++ b/src/transformers/models/aria/configuration_aria.py
@ -258,6 +258,9 @@ class AriaConfig(PretrainedConfig):
    """

    model_type = "aria"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
    sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}

    def __init__(
--- a/src/transformers/models/aria/convert_aria_weights_to_hf.py
+++ b/src/transformers/models/aria/convert_aria_weights_to_hf.py
@ -106,7 +106,7 @@ def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, ol
    config.vision_config.hidden_size = 1152
    config.vision_config.attention_heads = 16
    config.pad_token_id = 2
-    config.image_token_index = 9
+    config.image_token_id = 9
    config.intermediate_size = config.moe_intermediate_size
    config.auto_map = {
        "AutoConfig": "modeling_aria.AriaConfig",
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@ -19,7 +19,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from functools import partial
 from typing import Callable, List, Optional, Tuple, Union

 from ...activations import ACT2FN
@ -28,6 +27,7 @@ from ...generation import GenerationMixin
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -590,7 +590,7 @@ class AriaTextAttention(nn.Module):
        return attn_output, attn_weights


-class AriaTextDecoderLayer(nn.Module):
+class AriaTextDecoderLayer(GradientCheckpointingLayer):
    """
    Aria Text Decoder Layer.

@ -940,30 +940,17 @@ class AriaTextModel(AriaTextPreTrainedModel):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    partial(decoder_layer.__call__, **flash_attn_kwargs),
-                    hidden_states,
-                    causal_mask,
-                    position_ids,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                    position_embeddings,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=causal_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    position_embeddings=position_embeddings,
-                    **flash_attn_kwargs,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **flash_attn_kwargs,
+            )

            hidden_states = layer_outputs[0]

@ -1507,11 +1494,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
        if pixel_values is not None and inputs_embeds.shape[1] != 1:
            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                    torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
+                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
            else:
-                image_embeds = input_ids == self.config.image_token_index
+                image_embeds = input_ids == self.config.image_token_id
                special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
            image_features = self.get_image_features(
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@ -266,6 +266,9 @@ class AriaConfig(PretrainedConfig):
    """

    model_type = "aria"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
    sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig}

    def __init__(
@ -1546,11 +1549,11 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
        if pixel_values is not None and inputs_embeds.shape[1] != 1:
            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                    torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
+                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
            else:
-                image_embeds = input_ids == self.config.image_token_index
+                image_embeds = input_ids == self.config.image_token_id
                special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
            image_features = self.get_image_features(
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@ -164,6 +164,8 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("informer", "InformerConfig"),
        ("instructblip", "InstructBlipConfig"),
        ("instructblipvideo", "InstructBlipVideoConfig"),
+        ("internvl", "InternVLConfig"),
+        ("internvl_vision", "InternVLVisionConfig"),
        ("jamba", "JambaConfig"),
        ("janus", "JanusConfig"),
        ("jetmoe", "JetMoeConfig"),
@ -523,6 +525,8 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("informer", "Informer"),
        ("instructblip", "InstructBLIP"),
        ("instructblipvideo", "InstructBlipVideo"),
+        ("internvl", "InternVL"),
+        ("internvl_vision", "InternVLVision"),
        ("jamba", "Jamba"),
        ("janus", "Janus"),
        ("jetmoe", "JetMoe"),
@ -802,6 +806,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
        ("chinese_clip_vision_model", "chinese_clip"),
        ("rt_detr_resnet", "rt_detr"),
        ("granitevision", "llava_next"),
+        ("internvl_vision", "internvl"),
        ("qwen2_5_vl_text", "qwen2_5_vl"),
        ("qwen2_vl_text", "qwen2_vl"),
        ("sam_vision_model", "sam"),
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@ -329,7 +329,7 @@ def _warning_fast_image_processor_available(fast_class):
    )


-@requires(backends=("vision", "torchvision"))
+@requires(backends=("vision",))
 class AutoImageProcessor:
    r"""
    This is a generic image processor class that will be instantiated as one of the image processor classes of the
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@ -153,6 +153,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("ijepa", "IJepaModel"),
        ("imagegpt", "ImageGPTModel"),
        ("informer", "InformerModel"),
+        ("internvl_vision", "InternVLVisionModel"),
        ("jamba", "JambaModel"),
        ("janus", "JanusModel"),
        ("jetmoe", "JetMoeModel"),
@ -865,6 +866,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
        ("idefics2", "Idefics2ForConditionalGeneration"),
        ("idefics3", "Idefics3ForConditionalGeneration"),
        ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("internvl", "InternVLForConditionalGeneration"),
        ("janus", "JanusForConditionalGeneration"),
        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
        ("llama4", "Llama4ForConditionalGeneration"),
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@ -77,6 +77,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("idefics3", "Idefics3Processor"),
        ("instructblip", "InstructBlipProcessor"),
        ("instructblipvideo", "InstructBlipVideoProcessor"),
+        ("internvl", "InternVLProcessor"),
        ("janus", "JanusProcessor"),
        ("kosmos-2", "Kosmos2Processor"),
        ("layoutlmv2", "LayoutLMv2Processor"),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@ -265,6 +265,7 @@ else:
            ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("internvl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            (
                "jamba",
                (
--- a/src/transformers/models/aya_vision/configuration_aya_vision.py
+++ b/src/transformers/models/aya_vision/configuration_aya_vision.py
@ -52,6 +52,9 @@ class AyaVisionConfig(PretrainedConfig):
    """

    model_type = "aya_vision"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
--- a/src/transformers/models/aya_vision/modeling_aya_vision.py
+++ b/src/transformers/models/aya_vision/modeling_aya_vision.py
@ -444,10 +444,10 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
                image_sizes=image_sizes,
            )

-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
-                n_image_tokens = (input_ids == self.config.image_token_index).sum()
+                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@ -15,6 +15,7 @@
 """PyTorch BARK model."""

 import math
+import warnings
 from typing import Dict, Optional, Tuple, Union

 import numpy as np
@ -36,6 +37,7 @@ from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_accelerate_available,
+    is_torch_accelerator_available,
    logging,
 )
 from ..auto import AutoModel
@ -1598,26 +1600,45 @@ class BarkModel(BarkPreTrainedModel):
            ):
                return torch.device(module._hf_hook.execution_device)

-    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
+    def enable_cpu_offload(
+        self,
+        accelerator_id: Optional[int] = 0,
+        **kwargs,
+    ):
        r"""
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
-        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
-        the next sub-model runs.
+        method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains on the accelerator until the next sub-model runs.

        Args:
-            gpu_id (`int`, *optional*, defaults to 0):
-                GPU id on which the sub-models will be loaded and offloaded.
+            accelerator_id (`int`, *optional*, defaults to 0):
+                accelerator id on which the sub-models will be loaded and offloaded.
+            kwargs (`dict`, *optional*):
+                additional keyword arguments:
+                    `gpu_id`: deprecated, use `accelerator_id` instead. A non-zero value overrides `accelerator_id`.
        """
        if is_accelerate_available():
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")

-        device = torch.device(f"cuda:{gpu_id}")
+        gpu_id = kwargs.get("gpu_id", 0)

+        if gpu_id != 0:
+            warnings.warn(
+                "The argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please use `accelerator_id` instead.",
+                FutureWarning,
+            )
+            accelerator_id = gpu_id
+
+        device_type = "cuda"
+        if is_torch_accelerator_available():
+            device_type = torch.accelerator.current_accelerator().type
+        device = torch.device(f"{device_type}:{accelerator_id}")
+
+        torch_accelerator_module = getattr(torch, device_type)
        if self.device.type != "cpu":
            self.to("cpu")
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+            torch_accelerator_module.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        # this layer is used outside the first foward pass of semantic so need to be loaded before semantic
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
--- a/src/transformers/models/blip_2/configuration_blip_2.py
+++ b/src/transformers/models/blip_2/configuration_blip_2.py
@ -273,6 +273,9 @@ class Blip2Config(PretrainedConfig):
    ```"""

    model_type = "blip-2"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
    sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig}

    def __init__(
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@ -2283,10 +2283,10 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
+        # if the model already has "image_token_id" then the input is expanded to account for image embeds
        # otherwise we expand manually by concating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        if getattr(self.config, "image_token_id", None) is not None:
+            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
            language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
        else:
@ -2406,8 +2406,8 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):

        if input_ids is None:
            start_tokens = [self.config.text_config.bos_token_id]
-            if getattr(self.config, "image_token_index", None) is not None:
-                start_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens
+            if getattr(self.config, "image_token_id", None) is not None:
+                start_tokens = [self.config.image_token_id] * self.config.num_query_tokens + start_tokens
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device)
            input_ids = input_ids.repeat(batch_size, 1)

@ -2415,10 +2415,10 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

-        # if the model already has "image_token_index" then the input is expanded to account for image embeds
+        # if the model already has "image_token_id" then the input is expanded to account for image embeds
        # otherwise we expand manually by concatenating
-        if getattr(self.config, "image_token_index", None) is not None:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+        if getattr(self.config, "image_token_id", None) is not None:
+            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
            inputs_embeds[special_image_mask] = language_model_inputs.flatten()
        else:
            logger.warning_once(
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@ -18,14 +18,14 @@
 import copy
 import math
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Callable, Dict, Optional, Tuple, Union

 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss

-from ...activations import ACT2FN
+from ...activations import ACT2FN, get_activation
 from ...generation import GenerationConfig, GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
@ -34,7 +34,7 @@ from ...modeling_outputs import (
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
 )
-from ...modeling_utils import PreTrainedModel, SequenceSummary
+from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import Conv1D, isin_mps_friendly
 from ...utils import (
    ModelOutput,
@ -499,6 +499,106 @@ class ClvpEncoderLayer(nn.Module):
        return outputs


+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Clvp
+class ClvpSequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence of hidden states.
+
+    Args:
+        config ([`ClvpConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented; constructing the module raises `NotImplementedError`
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True` and `config.num_labels > 0`, the projection outputs
+              to `config.num_labels` classes (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Name of the activation applied to the output, looked up
+              with `get_activation` (e.g. `"tanh"`); `None` or an empty string applies no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: ClvpConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = nn.Identity()
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
+
+        self.first_dropout = nn.Identity()
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence of hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Position of the classification token, used only if `summary_type == "cls_index"`. If `None`, the last token of the sequence is used.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
+            raise NotImplementedError
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
 # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->ClvpDecoderMLP
 class ClvpDecoderMLP(nn.Module):
    def __init__(self, intermediate_size, config):
@ -884,7 +984,7 @@ class ClvpEncoder(ClvpPreTrainedModel):
        self.rotary_pos_emb = ClvpRotaryPositionalEmbedding(config) if config.use_rotary_embedding else None
        self.layers = nn.ModuleList([ClvpEncoderLayer(config) for _ in range(config.num_hidden_layers)])

-        self.sequence_summary = SequenceSummary(config)
+        self.sequence_summary = ClvpSequenceSummary(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@ -27,7 +27,6 @@
 # This file is based on the LLama model definition file in transformers


-from functools import partial
 from typing import Callable, List, Optional, Tuple, Union

 import torch
@ -38,6 +37,7 @@ from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -301,7 +301,7 @@ class CohereAttention(nn.Module):
        return attn_output, attn_weights


-class CohereDecoderLayer(nn.Module):
+class CohereDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
@ -589,30 +589,17 @@ class CohereModel(CoherePreTrainedModel):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    partial(decoder_layer.__call__, **flash_attn_kwargs),
-                    hidden_states,
-                    causal_mask,
-                    position_ids,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                    position_embeddings,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=causal_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    position_embeddings=position_embeddings,
-                    **flash_attn_kwargs,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **flash_attn_kwargs,
+            )

            hidden_states = layer_outputs[0]

--- a/src/transformers/models/cohere/modular_cohere.py
+++ b/src/transformers/models/cohere/modular_cohere.py
@ -30,6 +30,7 @@ from torch import nn

 from ...cache_utils import Cache
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
@ -209,7 +210,7 @@ class CohereAttention(LlamaAttention):
        return attn_output, attn_weights


-class CohereDecoderLayer(nn.Module):
+class CohereDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
--- a/src/transformers/models/cohere2/modeling_cohere2.py
+++ b/src/transformers/models/cohere2/modeling_cohere2.py
@ -19,7 +19,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from functools import partial
 from typing import Callable, List, Optional, Tuple, Union

 import torch
@ -29,6 +28,7 @@ from ...activations import ACT2FN
 from ...cache_utils import Cache, HybridCache, StaticCache
 from ...generation import GenerationMixin
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -42,6 +42,7 @@ from ...utils import (
    logging,
    replace_return_docstrings,
 )
+from ...utils.deprecation import deprecate_kwarg
 from .configuration_cohere2 import Cohere2Config


@ -289,7 +290,7 @@ class Cohere2MLP(nn.Module):
        return down_proj


-class Cohere2DecoderLayer(nn.Module):
+class Cohere2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Cohere2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
@ -300,6 +301,7 @@ class Cohere2DecoderLayer(nn.Module):
        self.is_sliding = (layer_idx + 1) % self.config.sliding_window_pattern != 0
        self.sliding_window = config.sliding_window

+    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        hidden_states: torch.Tensor,
@ -309,7 +311,6 @@ class Cohere2DecoderLayer(nn.Module):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        last_cache_position: int = 0,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
@ -330,7 +331,6 @@ class Cohere2DecoderLayer(nn.Module):
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
-            last_cache_position (`int`): equivalent to `cache_position[-1]` but allow indexing without breaking dynamo tracing
        """

        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
@ -349,11 +349,16 @@ class Cohere2DecoderLayer(nn.Module):
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
-                # `last_cache_position` is equivalent to `cache_position[-1]` but without breaking dynamo
-                offset = last_cache_position - effective_seq_len
+                offset = cache_position[-1] - effective_seq_len + 1
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = max(0, offset)
-                attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]
+                # equivalent to: `attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]`,
+                # but without data-dependent slicing (i.e. torch.compile friendly)
+                mask_indexes = torch.arange(
+                    min(effective_seq_len, attention_mask.shape[-1]), device=attention_mask.device
+                )
+                mask_indexes += offset
+                attention_mask = attention_mask[:, :, :, mask_indexes]

        residual = hidden_states

@ -539,6 +544,7 @@ class Cohere2Model(Cohere2PreTrainedModel):

    @can_return_tuple
    @add_start_docstrings_to_model_forward(COHERE2_INPUTS_DOCSTRING)
+    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
@ -550,7 +556,6 @@ class Cohere2Model(Cohere2PreTrainedModel):
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        last_cache_position: Optional[int] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@ -590,16 +595,6 @@ class Cohere2Model(Cohere2PreTrainedModel):
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

-        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
-        # (retrieving the same value from `cache_position` later on would crash dynamo)
-        if last_cache_position is None:
-            last_cache_position = 0
-            if attention_mask is not None:
-                # In case a 4d mask is passed directly without using `generate`, we have to rely on cache_position
-                # It will break dynamo tracing but there are no way around it (and it should never happen in practice)
-                last_cache_position = (
-                    attention_mask.shape[-1] if attention_mask.dim() == 2 else cache_position[-1].item()
-                )
        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
@ -617,30 +612,16 @@ class Cohere2Model(Cohere2PreTrainedModel):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    partial(decoder_layer.__call__, **flash_attn_kwargs),
-                    hidden_states,
-                    position_embeddings,
-                    causal_mask,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                    last_cache_position,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    position_embeddings=position_embeddings,
-                    attention_mask=causal_mask,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    last_cache_position=last_cache_position,
-                    **flash_attn_kwargs,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **flash_attn_kwargs,
+            )

            hidden_states = layer_outputs[0]

@ -667,7 +648,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: HybridCache,
-        output_attentions: bool,
+        output_attentions: bool = False,
    ):
        # Flash Attention currently doesn't support static cache but Cohere2 work only with static cache.
        # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
@ -928,10 +909,6 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
            # The clone here is for the same reason as for `position_ids`.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

-        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
-        # (retrieving the same value from `cache_position` later on would crash dynamo)
-        model_inputs["last_cache_position"] = attention_mask.shape[-1] if attention_mask is not None else 0
-
        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
--- a/src/transformers/models/cohere2/modular_cohere2.py
+++ b/src/transformers/models/cohere2/modular_cohere2.py
@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from functools import partial
 from typing import Callable, Optional, Tuple

 import torch
@ -23,15 +22,12 @@ import torch.utils.checkpoint
 from ...cache_utils import Cache, HybridCache
 from ...configuration_utils import PretrainedConfig
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_outputs import (
-    BaseModelOutputWithPast,
-)
+from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import rope_config_validation
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
-from ...utils import (
-    logging,
-)
+from ...utils import add_start_docstrings_to_model_forward, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
 from ..cohere.modeling_cohere import (
    CohereAttention,
    CohereDecoderLayer,
@ -45,6 +41,9 @@ from ..cohere.modeling_cohere import (
 from ..gemma2.modeling_gemma2 import Gemma2Model


+COHERE2_INPUTS_DOCSTRING = None  # Will be picked up by modular
+
+
 logger = logging.get_logger(__name__)


@ -351,6 +350,7 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
        self.is_sliding = (layer_idx + 1) % self.config.sliding_window_pattern != 0
        self.sliding_window = config.sliding_window

+    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        hidden_states: torch.Tensor,
@ -360,7 +360,6 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        last_cache_position: int = 0,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
@ -381,7 +380,6 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
-            last_cache_position (`int`): equivalent to `cache_position[-1]` but allow indexing without breaking dynamo tracing
        """

        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
@ -400,11 +398,16 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
-                # `last_cache_position` is equivalent to `cache_position[-1]` but without breaking dynamo
-                offset = last_cache_position - effective_seq_len
+                offset = cache_position[-1] - effective_seq_len + 1
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = max(0, offset)
-                attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]
+                # equivalent to: `attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]`,
+                # but without data-dependent slicing (i.e. torch.compile friendly)
+                mask_indexes = torch.arange(
+                    min(effective_seq_len, attention_mask.shape[-1]), device=attention_mask.device
+                )
+                mask_indexes += offset
+                attention_mask = attention_mask[:, :, :, mask_indexes]

        residual = hidden_states

@ -452,6 +455,9 @@ class Cohere2Model(Gemma2Model):
        self.norm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
        self.rotary_emb = Cohere2RotaryEmbedding(config=config)

+    @can_return_tuple
+    @add_start_docstrings_to_model_forward(COHERE2_INPUTS_DOCSTRING)
+    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
@ -463,7 +469,6 @@ class Cohere2Model(Gemma2Model):
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        last_cache_position: Optional[int] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@ -503,16 +508,6 @@ class Cohere2Model(Gemma2Model):
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

-        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
-        # (retrieving the same value from `cache_position` later on would crash dynamo)
-        if last_cache_position is None:
-            last_cache_position = 0
-            if attention_mask is not None:
-                # In case a 4d mask is passed directly without using `generate`, we have to rely on cache_position
-                # It will break dynamo tracing but there are no way around it (and it should never happen in practice)
-                last_cache_position = (
-                    attention_mask.shape[-1] if attention_mask.dim() == 2 else cache_position[-1].item()
-                )
        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
@ -530,30 +525,16 @@ class Cohere2Model(Gemma2Model):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    partial(decoder_layer.__call__, **flash_attn_kwargs),
-                    hidden_states,
-                    position_embeddings,
-                    causal_mask,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                    last_cache_position,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    position_embeddings=position_embeddings,
-                    attention_mask=causal_mask,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    last_cache_position=last_cache_position,
-                    **flash_attn_kwargs,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **flash_attn_kwargs,
+            )

            hidden_states = layer_outputs[0]

@ -625,10 +606,6 @@ class Cohere2ForCausalLM(CohereForCausalLM):
            # The clone here is for the same reason as for `position_ids`.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

-        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
-        # (retrieving the same value from `cache_position` later on would crash dynamo)
-        model_inputs["last_cache_position"] = attention_mask.shape[-1] if attention_mask is not None else 0
-
        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
--- a/src/transformers/models/convbert/modeling_convbert.py
+++ b/src/transformers/models/convbert/modeling_convbert.py
@ -17,7 +17,7 @@
 import math
 import os
 from operator import attrgetter
-from typing import Optional, Tuple, Union
+from typing import Callable, Optional, Tuple, Union

 import torch
 import torch.utils.checkpoint
@ -33,7 +33,7 @@ from ...modeling_outputs import (
    SequenceClassifierOutput,
    TokenClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel, SequenceSummary
+from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_convbert import ConvBertConfig
@ -683,6 +683,106 @@ class ConvBertPredictionHeadTransform(nn.Module):
        return hidden_states


+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->ConvBert
+class ConvBertSequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`ConvBertConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Falls back to `"last"` when the
+              config does not define it. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Name of the activation applied after the projection (e.g.
+              `"tanh"`); `None` or an empty string adds no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+
+    Raises:
+        NotImplementedError: If `config.summary_type == "attn"`.
+    """
+
+    def __init__(self, config: ConvBertConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        # Each optional stage defaults to a no-op so that `forward` can apply all of them unconditionally.
+        self.summary = nn.Identity()
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
+
+        self.first_dropout = nn.Identity()
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Position of the classification token in the sequence. Only used if `summary_type == "cls_index"`.
+                If `None`, the last token of the sequence is used as the classification token.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+
+        Raises:
+            NotImplementedError: If `summary_type` is `"attn"`.
+            ValueError: If `summary_type` is not one of the supported values.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:
+                # Default to the index of the last position along the sequence dimension.
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:
+                # Append (1, hidden_size) trailing dims so the index can be used with `gather` on the sequence dim.
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
+            raise NotImplementedError
+        else:
+            # Without this branch an unrecognized `summary_type` leaves `output` unbound and surfaces as an opaque
+            # `UnboundLocalError` on the next statement.
+            # NOTE(review): this class is marked `# Copied from` XLMSequenceSummary; mirror this branch there so the
+            # copy-consistency check keeps passing.
+            raise ValueError(
+                f"Unsupported `summary_type`: {self.summary_type!r}. "
+                'Expected one of "last", "first", "mean" or "cls_index".'
+            )
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
 CONVBERT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
@ -1077,7 +1177,7 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
        super().__init__(config)

        self.convbert = ConvBertModel(config)
-        self.sequence_summary = SequenceSummary(config)
+        self.sequence_summary = ConvBertSequenceSummary(config)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
--- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py
+++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py
@ -5,7 +5,6 @@
 #                          modular_deepseek_v3.py file directly. One of our CI enforces this.
 #                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
-from functools import partial
 from typing import Callable, Optional, Tuple, Union

 import torch
@ -18,6 +17,7 @@ from ...generation import GenerationMixin
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -454,7 +454,7 @@ class DeepseekV3Attention(nn.Module):
        return attn_output, attn_weights


-class DeepseekV3DecoderLayer(nn.Module):
+class DeepseekV3DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
@ -734,30 +734,17 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    partial(decoder_layer.__call__, **flash_attn_kwargs),
-                    hidden_states,
-                    causal_mask,
-                    position_ids,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                    position_embeddings,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=causal_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                    position_embeddings=position_embeddings,
-                    **flash_attn_kwargs,
-                )
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **flash_attn_kwargs,
+            )

            hidden_states = layer_outputs[0]

--- a/Show More
+++ b/Show More