Merge branch 'main' into integrate_xlstm_clean

2025-07-31 02:02:21 +06:00 · 2025-07-01 15:01:30 +02:00 · 2025-07-01 15:01:30 +02:00 · 1bde78a312
commit 1bde78a312
parent 7a2983f2f6 1283877571
1781 changed files with 82240 additions and 43848 deletions
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,7 +41,7 @@ jobs:
  check_new_failures:
    name: " "
    runs-on:
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -12,8 +12,8 @@ on:
      slice_id:
        required: true
        type: number
-      runner:
-        required: true
+      runner_map:
+        required: false
        type: string
      docker:
        required: true
@ -45,7 +45,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
    runs-on:
-      group: '${{ inputs.machine_type }}'
+      group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -107,9 +107,9 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
--- a/.github/workflows/model_jobs_amd.yml
+++ b/.github/workflows/model_jobs_amd.yml
@ -1,128 +0,0 @@
-name: model jobs
-
-on:
-  workflow_call:
-    inputs:
-      folder_slices:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      slice_id:
-        required: true
-        type: number
-      runner:
-        required: true
-        type: string
-      docker:
-        required: true
-        type: string
-
-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes
-  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
-  # This token is created under the bot `hf-transformers-bot`.
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
-  TF_FORCE_GPU_ALLOW_GROWTH: true
-  CUDA_VISIBLE_DEVICES: 0,1
-
-jobs:
-  run_models_gpu:
-    name: " "
-    strategy:
-      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
-    runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
-    container:
-      image: ${{ inputs.docker }}
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ inputs.folder_slices }}"
-          echo "${{ matrix.folders }}"
-          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
-
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: Update / Install some packages (for Past CI)
-        if: ${{ contains(inputs.docker, '-past-') }}
-        working-directory: /transformers
-        run: |
-          python3 -m pip install -U datasets
-
-      - name: Update / Install some packages (for Past CI)
-        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
-        working-directory: /transformers
-        run: |
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}  -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Run test
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
--- a/.github/workflows/model_jobs_intel_gaudi.yml
+++ b/.github/workflows/model_jobs_intel_gaudi.yml
@ -0,0 +1,121 @@
+name: model jobs
+
+on:
+  workflow_call:
+    inputs:
+      folder_slices:
+        required: true
+        type: string
+      slice_id:
+        required: true
+        type: number
+      runner:
+        required: true
+        type: string
+      machine_type:
+        required: true
+        type: string
+      report_name_prefix:
+        required: false
+        default: run_models_gpu
+        type: string
+
+env:
+  RUN_SLOW: yes
+  PT_HPU_LAZY_MODE: 0
+  TRANSFORMERS_IS_CI: yes
+  PT_ENABLE_INT64_SUPPORT: 1
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  HF_HOME: /mnt/cache/.cache/huggingface
+
+jobs:
+  run_models_gpu:
+    name: " "
+    strategy:
+      max-parallel: 8
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
+    runs-on:
+      group: ${{ inputs.runner }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ inputs.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all tests on Gaudi
+        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Run test
+        shell: bash
+        run: |
+          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,9 +239,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,9 +338,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -31,7 +31,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -131,7 +131,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -169,9 +169,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -244,7 +244,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -282,9 +282,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -357,7 +357,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -395,9 +395,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -467,7 +467,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -505,9 +505,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -22,7 +22,7 @@ on:
        default: ""


-# Used for `push` to easily modiffy the target workflow runs to compare against
+# Used for `push` to easily modify the target workflow runs to compare against
 env:
    prev_workflow_run_id: ""
    other_workflow_run_id: ""
@ -51,7 +51,6 @@ jobs:
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-models"
-      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@ -63,7 +62,6 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@ -75,7 +73,6 @@ jobs:
    with:
      job: run_examples_gpu
      slack_report_channel: "#transformers-ci-daily-examples"
-      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@ -87,7 +84,6 @@ jobs:
    with:
      job: run_trainer_and_fsdp_gpu
      slack_report_channel: "#transformers-ci-daily-training"
-      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@ -99,7 +95,6 @@ jobs:
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#transformers-ci-daily-training"
-      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
@ -112,7 +107,6 @@ jobs:
    with:
      job: run_quantization_torch_gpu
      slack_report_channel: "#transformers-ci-daily-quantization"
-      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -0,0 +1,345 @@
+name: Self-hosted runner (scheduled-intel-gaudi)
+
+on:
+  workflow_call:
+    inputs:
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
+      runner_scale_set:
+        required: true
+        type: string
+      ci_event:
+        required: true
+        type: string
+      report_repo_id:
+        required: true
+        type: string
+
+env:
+  NUM_SLICES: 2
+  RUN_SLOW: yes
+  PT_HPU_LAZY_MODE: 0
+  TRANSFORMERS_IS_CI: yes
+  PT_ENABLE_INT64_SUPPORT: 1
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  HF_HOME: /mnt/cache/.cache/huggingface
+
+jobs:
+  setup:
+    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
+    name: Setup
+    runs-on: ubuntu-latest
+    outputs:
+      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+      quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - id: set-matrix
+        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
+        name: Identify models to test
+        working-directory: tests
+        run: |
+          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
+            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
+            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
+            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
+          fi
+
+      - id: set-matrix-quantization
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        name: Identify quantization method to test
+        working-directory: tests
+        run: |
+          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT
+
+  run_models_gpu:
+    if: ${{ inputs.job == 'run_models_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
+    with:
+      slice_id: ${{ matrix.slice_id }}
+      machine_type: ${{ matrix.machine_type }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+      report_name_prefix: run_models_gpu
+
+    secrets: inherit
+
+  run_trainer_and_fsdp_gpu:
+    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
+    with:
+      slice_id: ${{ matrix.slice_id }}
+      machine_type: ${{ matrix.machine_type }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+      report_name_prefix: run_trainer_and_fsdp_gpu
+
+    secrets: inherit
+
+  run_pipelines_gpu:
+    if: ${{ inputs.job == 'run_pipelines_gpu' }}
+    name: Pipelines
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all pipeline tests on Intel Gaudi
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
+
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: |
+          pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run examples tests on Intel Gaudi
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
+
+  run_deepspeed_gpu:
+    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
+    name: Intel Gaudi deepspeed tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: |
+          pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all deepspeed tests on intel Gaudi
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+
+  send_results:
+    name: Slack Report
+    needs:
+      [
+        setup,
+        run_models_gpu,
+        run_examples_gpu,
+        run_pipelines_gpu,
+        run_deepspeed_gpu,
+        run_trainer_and_fsdp_gpu,
+      ]
+    if: ${{ always() }}
+    uses: ./.github/workflows/slack-report.yml
+    with:
+      job: ${{ inputs.job }}
+      setup_status: ${{ needs.setup.result }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      report_repo_id: ${{ inputs.report_repo_id }}
+      ci_event: ${{ inputs.ci_event }}
+
+    secrets: inherit
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@ -0,0 +1,67 @@
+name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
+
+on:
+  repository_dispatch:
+  workflow_dispatch:
+  schedule:
+    - cron: "17 2 * * *"
+
+jobs:
+  model-ci:
+    name: Model CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_models_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  pipeline-ci:
+    name: Pipeline CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_pipelines_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_examples_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_deepspeed_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  trainer-fsdp-ci:
+    name: Trainer/FSDP CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_trainer_and_fsdp_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -15,9 +15,6 @@ on:
      slack_report_channel:
        required: true
        type: string
-      runner:
-        required: true
-        type: string
      docker:
        required: true
        type: string
@ -53,7 +50,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -62,6 +59,7 @@ jobs:
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+      runner_map: ${{ steps.set-matrix.outputs.runner_map }}
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Update clone
@ -88,6 +86,7 @@ jobs:
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+            echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
@ -111,14 +110,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
-      runner: ${{ inputs.runner }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
    secrets: inherit

@ -129,14 +128,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
-      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
@ -147,7 +145,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -181,9 +179,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -215,7 +213,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -249,9 +247,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -284,7 +282,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -346,9 +344,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -383,7 +381,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -426,9 +424,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/18
+++ b/18
@ -8,13 +8,19 @@ check_dirs := examples tests src utils
 exclude_folders :=  ""

 modified_only_fixup:
-	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
-	@if test -n "$(modified_py_files)"; then \
-		echo "Checking/fixing $(modified_py_files)"; \
-		ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
-		ruff format $(modified_py_files) --exclude $(exclude_folders);\
+	@current_branch=$$(git branch --show-current); \
+	if [ "$$current_branch" = "main" ]; then \
+		echo "On main branch, running 'style' target instead..."; \
+		$(MAKE) style; \
 	else \
-		echo "No library .py files were modified"; \
+		modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
+		if [ -n "$$modified_py_files" ]; then \
+			echo "Checking/fixing files: $${modified_py_files}"; \
+			ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
+			ruff format $${modified_py_files} --exclude $(exclude_folders); \
+		else \
+			echo "No library .py files were modified"; \
+		fi; \
 	fi

 # Update src/transformers/dependency_versions_table.py
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation

 ## [dalle-flow](https://github.com/jina-ai/dalle-flow)

-DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. Itt leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
+DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
 The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.

 Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge

 ## [underthesea](https://github.com/undertheseanlp/underthesea)

-[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provides extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
+[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.

 Keywords: Vietnamese, NLP

--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@ -28,7 +28,7 @@ class MetricsRecorder:
        self.commit_id = commit_id
        self.commit_msg = commit_msg

-    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+    def initialise_benchmark(self, metadata: dict[str, str]) -> int:
        """
        Creates a new benchmark, returns the benchmark id
        """
@ -55,7 +55,7 @@ class MetricsRecorder:
            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
        )

-    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+    def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
        with self.conn.cursor() as cur:
            cur.execute(
                """
@ -85,7 +85,7 @@ handler.setFormatter(formatter)
 logger.addHandler(handler)


-def parse_arguments() -> Tuple[str, str, str, str]:
+def parse_arguments() -> tuple[str, str, str, str]:
    """
    Parse command line arguments for the benchmarking CLI.
    """
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

+ARG TORCH_VISION='0.21.0'
+ARG TORCH_AUDIO='2.6.0'
+
 RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
    apt clean && \
@ -20,6 +23,7 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

+RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

 RUN python3 -m pip uninstall -y tensorflow flax
--- a/docker/transformers-pytorch-xpu/Dockerfile
+++ b/docker/transformers-pytorch-xpu/Dockerfile
@ -0,0 +1,93 @@
+FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
+LABEL maintainer="Hugging Face"
+
+SHELL ["/bin/bash", "-c"]
+
+ARG PYTHON_VER=3.11
+ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get remove -y python3.10 && apt-get autoremove -y
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
+    ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
+    ln -sf /usr/bin/python3 /usr/bin/python && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && \
+    apt-get -y install \
+        apt-utils \
+        build-essential \
+        ca-certificates \
+        clinfo \
+        curl \
+        git \
+        git-lfs \
+        vim \
+        numactl \
+        gnupg2 \
+        gpg-agent \
+        zlib1g-dev \
+        rsync \
+        sudo \
+        libnl-genl-3-200 \
+        xpu-smi \
+        unzip \
+        ffmpeg \
+        tesseract-ocr \
+        espeak-ng \
+        wget \
+        ncurses-term && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+RUN apt-get update && \
+    apt-get install -y \
+        linux-headers-$(uname -r) \
+        linux-modules-extra-$(uname -r) \
+        flex bison \
+        intel-fw-gpu intel-i915-dkms xpu-smi \
+        intel-opencl-icd libze-intel-gpu1 libze1 \
+        intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
+        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
+        libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
+        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
+        libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
+    apt-get clean && \
+    rm -rf  /var/lib/apt/lists/*
+
+RUN pip install --upgrade pip
+RUN pip install triton==3.3.0
+
+RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
+
+RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
+RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
+RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
+RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
+
+RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
+
+# install bitsandbytes
+RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
+
+ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
+ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV CCL_ROOT=/usr/local
+ENV CCL_ATL_TRANSPORT=ofi
+ENV I_MPI_ROOT=/usr/local
+ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
+ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
+ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
+
+RUN touch /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+RUN echo "#!/bin/bash" >> /entrypoint.sh
+RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -93,6 +93,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels

+# Uninstall flash-attn installed by autoawq, it causes issues here : https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
+RUN python3 -m pip uninstall -y flash-attn
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/README.md
+++ b/docs/README.md
@ -278,7 +278,7 @@ Here's an example of a single value return:

 ```python
    Returns:
-        `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+        `list[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
 ```

 Here's an example of a tuple return, comprising several objects:
--- a/docs/source/ar/custom_models.md
+++ b/docs/source/ar/custom_models.md
@ -30,7 +30,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,
--- a/docs/source/de/testing.md
+++ b/docs/source/de/testing.md
@ -473,13 +473,6 @@ Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder meh
 def test_example_with_multi_gpu():
 ```

-Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel:
-
-```python no-style
-@require_tf
-def test_tf_thing_with_tensorflow():
-```
-
 Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie
 wie Sie ihn einrichten können:

@ -1204,9 +1197,6 @@ if torch.cuda.is_available():
 import numpy as np

 np.random.seed(seed)
-
-# tf RNG
-tf.random.set_seed(seed)
 ```

 ### Tests debuggen
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -17,12 +17,12 @@
      title: Customizing model components
    - local: model_sharing
      title: Sharing
-    - local: add_new_model
-      title: Adding a new model to Transformers
    - local: modular_transformers
-      title: Modular Transformers
+      title: Contributing a new model to Transformers
+    - local: add_new_model
+      title: Legacy model contribution
    - local: auto_docstring
-      title: Document your models
+      title: Documenting a model
    - local: attention_interface
      title: Customizing attention function
    title: Models
@ -97,7 +97,7 @@
    - local: perf_infer_gpu_one
      title: GPU
    - local: perf_infer_gpu_multi
-      title: Distributed GPU inference
+      title: Distributed inference
    - local: perf_infer_cpu
      title: CPU
    - local: tf_xla
@ -363,6 +363,8 @@
    - sections:
      - local: model_doc/albert
        title: ALBERT
+      - local: model_doc/arcee
+        title: Arcee
      - local: model_doc/bamba
        title: Bamba
      - local: model_doc/bart
@ -431,6 +433,8 @@
        title: DiffLlama
      - local: model_doc/distilbert
        title: DistilBERT
+      - local: model_doc/dots1
+        title: dots1
      - local: model_doc/dpr
        title: DPR
      - local: model_doc/electra
@ -653,6 +657,8 @@
        title: SwitchTransformers
      - local: model_doc/t5
        title: T5
+      - local: model_doc/t5gemma
+        title: T5Gemma
      - local: model_doc/t5v1.1
        title: T5v1.1
      - local: model_doc/tapex
@ -733,6 +739,8 @@
        title: EfficientFormer
      - local: model_doc/efficientnet
        title: EfficientNet
+      - local: model_doc/eomt
+        title: EoMT
      - local: model_doc/focalnet
        title: FocalNet
      - local: model_doc/glpn
@ -745,6 +753,8 @@
        title: ImageGPT
      - local: model_doc/levit
        title: LeViT
+      - local: model_doc/lightglue
+        title: LightGlue
      - local: model_doc/mask2former
        title: Mask2Former
      - local: model_doc/maskformer
@ -833,6 +843,8 @@
        title: CSM
      - local: model_doc/dac
        title: dac
+      - local: model_doc/dia
+        title: Dia
      - local: model_doc/encodec
        title: EnCodec
      - local: model_doc/fastspeech2_conformer
@ -841,6 +853,8 @@
        title: GraniteSpeech
      - local: model_doc/hubert
        title: Hubert
+      - local: model_doc/kyutai_speech_to_text
+        title: Kyutai Speech-To-Text
      - local: model_doc/mctct
        title: MCTCT
      - local: model_doc/mimi
@ -949,8 +963,12 @@
        title: FLAVA
      - local: model_doc/gemma3
        title: Gemma3
+      - local: model_doc/gemma3n
+        title: Gemma3n
      - local: model_doc/git
        title: GIT
+      - local: model_doc/glm4v
+        title: glm4v
      - local: model_doc/got_ocr2
        title: GOT-OCR2
      - local: model_doc/granitevision
@ -1043,6 +1061,8 @@
        title: SigLIP
      - local: model_doc/siglip2
        title: SigLIP2
+      - local: model_doc/smollm3
+        title: SmolLM3
      - local: model_doc/smolvlm
        title: SmolVLM
      - local: model_doc/speech-encoder-decoder
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -13,7 +13,7 @@ rendered properly in your Markdown viewer.

 -->

-# Adding a new model to Transformers
+# Legacy model contribution

 > [!TIP]
 > Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
@ -571,7 +571,7 @@ The processor should call the appropriate modality-specific processors within it
 def __call__(
    self,
    images: ImageInput = None,
-    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+    text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
    audio=None,
    videos=None,
    **kwargs: Unpack[YourModelProcessorKwargs],
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -92,7 +92,7 @@ def custom_attention(
    a_new_kwargs = None,  # You can now add as many kwargs as you need
    another_new_kwargs = None,  # You can now add as many kwargs as you need
    **kwargs,  # You need to accept **kwargs as models will pass other args
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]
    ...  # do your magic!
    return attn_output, attn_weights  # attn_weights are optional here

--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -14,43 +14,26 @@ rendered properly in your Markdown viewer.

 -->

-# Utilizing the @auto_docstring Decorator
+# Documenting a model

-The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
+The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings, and only focus on documenting new arguments.

---
+This guide describes how to use the `@auto_docstring` decorator and how it works.

-## 📜 How it Works
+## @auto_docstring

-The `@auto_docstring` decorator constructs docstrings by:
-
-1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
-2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
-3.  **Overriding or Adding Arguments Descriptions:**
-    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
-    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-4.  **Adding Classes and Functions Introduction:**
-    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
-    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
-5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
-6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
-7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
-8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
-
-
---
-
-## 🚀 How to Use @auto_docstring
-
-### 1. Importing the Decorator
-Import the decorator into your modeling file:
+Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).

 ```python
 from ...utils import auto_docstring
 ```

-### 2. Applying to Classes
-Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
+Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
+
+<hfoptions id="type">
+<hfoption id="classes">
+
+Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.

 ```python
 from transformers.modeling_utils import PreTrainedModel
@ -73,9 +56,7 @@ class MyAwesomeModel(PreTrainedModel):
    # ... other methods
 ```

-#### Advanced Class Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control:
+Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.

 ```python
@auto_docstring(
@ -93,7 +74,7 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-Or:
+You can also choose to only use `custom_intro` and define the custom arguments directly in the class.

 ```python
@auto_docstring(
@ -111,8 +92,10 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-### 3. Applying to Functions (e.g., `forward` method)
-Apply the decorator above method definitions, such as the `forward` method.
+</hfoption>
+<hfoption id="functions">
+
+Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.

 ```python
    @auto_docstring
@ -131,9 +114,10 @@ Apply the decorator above method definitions, such as the `forward` method.
        # ...
 ```

-#### Advanced Function Decoration:
+Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.
+
+The `Returns` and `Examples` parts of the docstring can also be manually specified.

-Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:

 ```python
 MODEL_COMMON_CUSTOM_ARGS = r"""
@ -180,100 +164,117 @@ class MyModel(PreTrainedModel):
        # ...
 ```

---
+</hfoption>
+</hfoptions>

-### ✍️ Documenting Arguments: Approach & Priority
+## Documenting arguments

-1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
-    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
+There are some rules for documenting different types of arguments and they're listed below.
+
+- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`.
+
+    If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
+
+
+- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
+
+    ```py
+    argument_name (`type`, *optional*, defaults to `X`):
+        Description of the argument.
+        Explain its purpose, expected shape/type if complex, and default behavior.
+        This can span multiple lines.
+    ```

-2.  **New or Custom Arguments:**
-    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
-    * **Format:**
-        ```
-        argument_name (`type`, *optional*, defaults to `X`):
-            Description of the argument.
-            Explain its purpose, expected shape/type if complex, and default behavior.
-            This can span multiple lines.
-        ```
    * Include `type` in backticks.
-    * Add "*optional*" if the argument is not required (has a default value).
-    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
+    * Add *optional* if the argument is not required or has a default value.
+    * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.

-3.  **Overriding Standard Arguments:**
-    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
-    * The `labels` argument is often customized per model and typically requires a specific docstring.
+    These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.

-4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
-    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+    ```py
+    class MyModel(PreTrainedModel):
+    # ...
+    @auto_docstring(
+        custom_intro="""
+        This is a custom introduction for the function.
+        """
+        custom_args=r"""
+        common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
+            Description of common_arg_1
+        """
+    )
+    ```

---
+## Checking the docstrings

-### Usage with [modular files](./modular_transformers)
+Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.

-When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
+* Ensures `@auto_docstring` is applied to relevant mode classes and public methods.
+* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments or if they lack a local description are flagged.
+* Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
+* Ensures docstrings are formatted according to the expected docstring style.

- **For standalone models in modular files:**
-  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
-
- **For models inheriting from other library models:**
-  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
-  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
-
-  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
-
-
-**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
-
---
-
-## ✅ Checking Your Docstrings with `check_auto_docstrings`
-
-The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
-
-#### What it Checks:
-
-* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
-* **Argument Completeness & Consistency:**
-    * Flags arguments in the signature that are not known standard arguments and lack a local description.
-    * Ensures documented arguments exist in the signature. (TODO)
-    * Verifies that types and default values in the docstring match the signature. (TODO)
-* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
-* **Formatting:** Adherence to the expected docstring style.
-
-#### Running the Check Locally:
-
-Run this check locally before committing. The common command is:
+You can run this check locally - before committing - by running the following command.

 ```bash
 make fix-copies
 ```

-Alternatively, to only perform docstrings and auto-docstring checks, you can use:
+`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.

 ```bash
 python utils/check_docstrings.py # to only check files included in the diff without fixing them
-# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
-# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
 ```

-#### Workflow with the Checker:
+## modular_model.py files

-1.  Add `@auto_docstring(...)` to the class or method.
-2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
-3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
-    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
-4.  Manually edit these placeholders with accurate types and descriptions.
-5.  Re-run the check to ensure all issues are resolved.
+When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.

---
+- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
+- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.

-## 🔑 Key Takeaways & Best Practices
+    If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.

-* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
-* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
-* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
+> [!WARNING]
+> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+## How it works
+
+The `@auto_docstring` decorator automatically generates docstrings by:
+
+1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
+2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
+3. Adding argument descriptions in one of two ways as shown below.
+
+    | method | description | usage |
+    |---|---|---|
+    | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
+    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
+
+4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.
+
+    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
+
+5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
+
+6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information form the model's configuration class to provide concrete examples with real model identifiers.
+
+7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
+
+    For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specifiy a custom `Returns` field in a functions docstring.
+
+8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring.
+
+    Currently only supported for [`FastImageProcessorKwargs`].
+
+## Best practices
+
+Follow the best practices below to help maintain consistent and informative documentation for Transformers!
+
+* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
+* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
+* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
 * Document new or custom arguments clearly.
 * Run `check_docstrings` locally and iteratively.
-
-By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
 import torch
 from transformers import pipeline

-pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
+pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
 pipeline(text=messages, max_new_tokens=50, return_full_text=False)
 [{'input_text': [{'role': 'system',
    'content': [{'type': 'text',
@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
-    video_fps=32,
+    video_fps=16,
    video_load_backend="decord",
 )
 print(processed_chat.keys())
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -27,6 +27,9 @@ This guide shows you how to quickly start chatting with Transformers from the co

 ## transformers CLI

+
+### Interactive chat session
+
 After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
@ -51,6 +54,68 @@ transformers chat -h

 The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).

+
+### Serving a model and using MCP tools
+
+> [!WARNING]
+> This section is experimental and subject to changes in future versions
+
+Powering the `chat` interface, we have a server that takes user messages and returns completions. The server has a chat completion API compatible with the OpenAI SDK, so you can also quickly experiment with `transformers` models on existing aplications. To launch a server separately, use the `transformers serve` CLI:
+
+```bash
+transformers serve Menlo/Jan-nano
+```
+
+Under the hood, the `chat` CLI launches and uses `transformers serve`. This server is also an MCP client, which can receive information available MCP servers (i.e. tools), massage their information into the model prompt, and prepare calls to these tools when the model commands to do so. Naturally, this requires a model that is trained to use tools.
+
+At the moment, MCP tool usage in `transformers` has the following constraints:
+- `chat` can't handle tools, but the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI can;
+- Only the `qwen` family of models is supported.
+
+The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
+
+> [!TIP]
+> Many Hugging Face Spaces can be used as MCP servers. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
+
+```json
+{
+    "model": "http://localhost:8000",
+    "provider": "local",
+    "servers": [
+        {
+            "type": "sse",
+            "config": {
+                "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
+            }
+        }
+    ]
+}
+```
+
+You can then launch your `tiny-agents` chat interface with the following command.
+
+```bash
+tiny-agents run path/to/your/config.json
+```
+
+If you have a server (from `transformers serve`) running in the background, you're ready to use MCP tools from a local model! For instance, here's the example of a chat session:
+
+```bash
+Agent loaded with 1 tools:
+ • flux1_schnell_infer
+»  Generate an image of a cat on the moon
+<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
+
+Tool req_0_tool_call
+[Binary Content: Image image/webp, 57732 bytes]
+The task is complete and the content accessible to the User
+Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
+380576952
+
+I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
+```
+
+
 ## TextGenerationPipeline

 [`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,
--- a/docs/source/en/feature_extractors.md
+++ b/docs/source/en/feature_extractors.md
@ -26,6 +26,7 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
 from transformers import AutoFeatureExtractor

 feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
 processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
 processed_sample
 {'input_values': [array([ 9.4472744e-05,  3.0777880e-03, -2.8888427e-03, ...,
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -468,9 +468,17 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
 Follow the recommended practices below to ensure your custom decoding method works as expected.
 - Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
 - Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
- You can add other files in the `custom_generate` folder, and use relative imports.
 - Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.

+Your custom `generate` method can relative import code from the `custom_generate` folder. For example, if you have a `utils.py` file, you can import it like this:
+
+```py
+from .utils import some_function
+```
+
+Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom decoding method.
+
+
 #### requirements.txt

 You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -152,7 +152,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 | `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
 | `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
 | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
-| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |


 ## Pitfalls
--- a/docs/source/en/model_doc/albert.md
+++ b/docs/source/en/model_doc/albert.md
@ -27,20 +27,13 @@ rendered properly in your Markdown viewer.

 [ALBERT](https://huggingface.co/papers/1909.11942) is designed to address memory limitations of scaling and training of [BERT](./bert). It adds two parameter reduction techniques. The first, factorized embedding parametrization, splits the larger vocabulary embedding matrix into two smaller matrices so you can grow the hidden size without adding a lot more parameters. The second, cross-layer parameter sharing, allows layer to share parameters which keeps the number of learnable parameters lower.

-<<<<<<< HEAD
-=======
-
-<<<<<<< HEAD
 ALBERT was created to address problems like -- GPU/TPU memory limitations, longer training times, and unexpected model degradation in BERT. ALBERT uses two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:

 - **Factorized embedding parameterization:** The large vocabulary embedding matrix is decomposed into two smaller matrices, reducing memory consumption.
 - **Cross-layer parameter sharing:** Instead of learning separate parameters for each transformer layer, ALBERT shares parameters across layers, further reducing the number of learnable weights.

-ALBERT uses absolute position embeddings (like BERT) so padding is applied at right. Size of embeddings is 128 While BERT uses 768. ALBERT can processes maximum 512 token at a time. 
->>>>>>> 7ba1110083 (Update docs/source/en/model_doc/albert.md
)
+ALBERT uses absolute position embeddings (like BERT) so padding is applied at right. Size of embeddings is 128 While BERT uses 768. ALBERT can processes maximum 512 token at a time.

-=======
->>>>>>> 155b733538 (Update albert.md)
 You can find all the original ALBERT checkpoints under the [ALBERT community](https://huggingface.co/albert) organization.

 > [!TIP]
@ -51,7 +44,7 @@ The example below demonstrates how to predict the `[MASK]` token with [`Pipeline
 <hfoptions id="usage">
 <hfoption id="Pipeline">

-```py 
+```py
 import torch
 from transformers import pipeline

@ -80,7 +73,7 @@ model = AutoModelForMaskedLM.from_pretrained(
 )

 prompt = "Plants create energy through a process known as [MASK]."
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device) 
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

 with torch.no_grad():
    outputs = model(**inputs)
@ -103,41 +96,30 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran

 </hfoptions>

-
 ## Notes

 - Inputs should be padded on the right because BERT uses absolute position embeddings.
 - The embedding size `E` is different from the hidden size `H` because the embeddings are context independent (one embedding vector represents one token) and the hidden states are context dependent (one hidden state represents a sequence of tokens). The embedding matrix is also larger because `V x E` where `V` is the vocabulary size. As a result, it's more logical if `H >> E`. If `E < H`, the model has less parameters.

-
 ## Resources

-
 The resources provided in the following sections consist of a list of official Hugging Face and community (indicated by 🌎) resources to help you get started with AlBERT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

-
 <PipelineTag pipeline="text-classification"/>

-
 - [`AlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification).

-
 - [`TFAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification).

 - [`FlaxAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb).
 - Check the [Text classification task guide](../tasks/sequence_classification) on how to use the model.

-
 <PipelineTag pipeline="token-classification"/>

-
 - [`AlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification).

-
 - [`TFAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).

-
-
 - [`FlaxAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification).
 - [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course.
 - Check the [Token classification task guide](../tasks/token_classification) on how to use the model.
@ -163,8 +145,7 @@ The resources provided in the following sections consist of a list of official H
 - [`AlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb).
 - [`TFAlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb).

- Check the  [Multiple choice task guide](../tasks/multiple_choice) on how to use the model.
-
+- Check the [Multiple choice task guide](../tasks/multiple_choice) on how to use the model.

 ## AlbertConfig

@ -172,11 +153,7 @@ The resources provided in the following sections consist of a list of official H

 ## AlbertTokenizer

-[[autodoc]] AlbertTokenizer
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
+[[autodoc]] AlbertTokenizer - build_inputs_with_special_tokens - get_special_tokens_mask - create_token_type_ids_from_sequences - save_vocabulary

 ## AlbertTokenizerFast

@ -193,23 +170,19 @@ The resources provided in the following sections consist of a list of official H

 ## AlbertModel

-[[autodoc]] AlbertModel
-    - forward
+[[autodoc]] AlbertModel - forward

 ## AlbertForPreTraining

-[[autodoc]] AlbertForPreTraining
-    - forward
+[[autodoc]] AlbertForPreTraining - forward

 ## AlbertForMaskedLM

-[[autodoc]] AlbertForMaskedLM
-    - forward
+[[autodoc]] AlbertForMaskedLM - forward

 ## AlbertForSequenceClassification

-[[autodoc]] AlbertForSequenceClassification
-    - forward
+[[autodoc]] AlbertForSequenceClassification - forward

 ## AlbertForMultipleChoice

@ -217,13 +190,11 @@ The resources provided in the following sections consist of a list of official H

 ## AlbertForTokenClassification

-[[autodoc]] AlbertForTokenClassification
-    - forward
+[[autodoc]] AlbertForTokenClassification - forward

 ## AlbertForQuestionAnswering

-[[autodoc]] AlbertForQuestionAnswering
-    - forward
+[[autodoc]] AlbertForQuestionAnswering - forward

 </pt>

@ -231,78 +202,62 @@ The resources provided in the following sections consist of a list of official H

 ## TFAlbertModel

-[[autodoc]] TFAlbertModel
-    - call
+[[autodoc]] TFAlbertModel - call

 ## TFAlbertForPreTraining

-[[autodoc]] TFAlbertForPreTraining
-    - call
+[[autodoc]] TFAlbertForPreTraining - call

 ## TFAlbertForMaskedLM

-[[autodoc]] TFAlbertForMaskedLM
-    - call
+[[autodoc]] TFAlbertForMaskedLM - call

 ## TFAlbertForSequenceClassification

-[[autodoc]] TFAlbertForSequenceClassification
-    - call
+[[autodoc]] TFAlbertForSequenceClassification - call

 ## TFAlbertForMultipleChoice

-[[autodoc]] TFAlbertForMultipleChoice
-    - call
+[[autodoc]] TFAlbertForMultipleChoice - call

 ## TFAlbertForTokenClassification

-[[autodoc]] TFAlbertForTokenClassification
-    - call
+[[autodoc]] TFAlbertForTokenClassification - call

 ## TFAlbertForQuestionAnswering

-[[autodoc]] TFAlbertForQuestionAnswering
-    - call
+[[autodoc]] TFAlbertForQuestionAnswering - call

 </tf>
 <jax>

 ## FlaxAlbertModel

-[[autodoc]] FlaxAlbertModel
-    - __call__
+[[autodoc]] FlaxAlbertModel - **call**

 ## FlaxAlbertForPreTraining

-[[autodoc]] FlaxAlbertForPreTraining
-    - __call__
+[[autodoc]] FlaxAlbertForPreTraining - **call**

 ## FlaxAlbertForMaskedLM

-[[autodoc]] FlaxAlbertForMaskedLM
-    - __call__
+[[autodoc]] FlaxAlbertForMaskedLM - **call**

 ## FlaxAlbertForSequenceClassification

-[[autodoc]] FlaxAlbertForSequenceClassification
-    - __call__
+[[autodoc]] FlaxAlbertForSequenceClassification - **call**

 ## FlaxAlbertForMultipleChoice

-[[autodoc]] FlaxAlbertForMultipleChoice
-    - __call__
+[[autodoc]] FlaxAlbertForMultipleChoice - **call**

 ## FlaxAlbertForTokenClassification

-[[autodoc]] FlaxAlbertForTokenClassification
-    - __call__
+[[autodoc]] FlaxAlbertForTokenClassification - **call**

 ## FlaxAlbertForQuestionAnswering

-[[autodoc]] FlaxAlbertForQuestionAnswering
-    - __call__
+[[autodoc]] FlaxAlbertForQuestionAnswering - **call**

 </jax>
 </frameworkcontent>
-
-
--- a/docs/source/en/model_doc/arcee.md
+++ b/docs/source/en/model_doc/arcee.md
@ -0,0 +1,104 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# Arcee
+
+Arcee is a decoder-only transformer model based on the Llama architecture with a key modification: it uses ReLU² (ReLU-squared) activation in the MLP blocks instead of SiLU, following recent research showing improved training efficiency with squared activations. This architecture is designed for efficient training and inference while maintaining the proven stability of the Llama design.
+
+The Arcee model is architecturally similar to Llama but uses `x * relu(x)` in MLP layers for improved gradient flow and is optimized for efficiency in both training and inference scenarios.
+
+> [!TIP]
+> The Arcee model supports extended context with RoPE scaling and all standard transformers features including Flash Attention 2, SDPA, gradient checkpointing, and quantization support.
+
+The example below demonstrates how to generate text with Arcee using [`Pipeline`] or the [`AutoModel`].
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="text-generation",
+    model="arcee-ai/AFM-4.5B",
+    torch_dtype=torch.float16,
+    device=0
+)
+
+output = pipeline("The key innovation in Arcee is")
+print(output[0]["generated_text"])
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, ArceeForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")
+model = ArceeForCausalLM.from_pretrained(
+    "arcee-ai/AFM-4.5B",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+inputs = tokenizer("The key innovation in Arcee is", return_tensors="pt")
+with torch.no_grad():
+    outputs = model.generate(**inputs, max_new_tokens=50)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+</hfoption>
+</hfoptions>
+
+## ArceeConfig
+
+[[autodoc]] ArceeConfig
+
+## ArceeModel
+
+[[autodoc]] ArceeModel
+    - forward
+
+## ArceeForCausalLM
+
+[[autodoc]] ArceeForCausalLM
+    - forward
+
+## ArceeForSequenceClassification
+
+[[autodoc]] ArceeForSequenceClassification
+    - forward
+
+## ArceeForQuestionAnswering
+
+[[autodoc]] ArceeForQuestionAnswering
+    - forward
+
+## ArceeForTokenClassification
+
+[[autodoc]] ArceeForTokenClassification
+    - forward
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -350,6 +350,10 @@ The following auto classes are available for the following audio tasks.

 [[autodoc]] AutoModelForTextToWaveform

+### AutoModelForAudioTokenization
+
+[[autodoc]] AutoModelForAudioTokenization
+
 ## Multimodal

 The following auto classes are available for the following multimodal tasks.
--- a/docs/source/en/model_doc/aya_vision.md
+++ b/docs/source/en/model_doc/aya_vision.md
@ -14,82 +14,32 @@ rendered properly in your Markdown viewer.

 -->

-# AyaVision
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

-## Overview
+# Aya Vision

-The Aya Vision 8B and 32B models is a state-of-the-art multilingual multimodal models developed by Cohere For AI. They build on the Aya Expanse recipe to handle both visual and textual information without compromising on the strong multilingual textual performance of the original model.
+[Aya Vision](https://huggingface.co/papers/2505.08751) is a family of open-weight multimodal vision-language models from Cohere Labs. It is trained with a synthetic annotation framework that generates high-quality multilingual image captions, improving Aya Vision's generated responses. In addition, a cross-modal model merging technique is used to prevent the model from losing its text capabilities after adding vision capabilities. The model combines a CommandR-7B language model with a SigLIP vision encoder.

-Aya Vision 8B combines the `Siglip2-so400-384-14` vision encoder with the Cohere CommandR-7B language model further post-trained with the Aya Expanse recipe, creating a powerful vision-language model capable of understanding images and generating text across 23 languages. Whereas, Aya Vision 32B uses Aya Expanse 32B as the language model.
+You can find all the original Aya Vision checkpoints under the [Aya Vision](https://huggingface.co/collections/CohereLabs/cohere-labs-aya-vision-67c4ccd395ca064308ee1484) collection.

-Key features of Aya Vision include:
- Multimodal capabilities in 23 languages
- Strong text-only multilingual capabilities inherited from CommandR-7B post-trained with the Aya Expanse recipe and Aya Expanse 32B
- High-quality visual understanding using the Siglip2-so400-384-14 vision encoder
- Seamless integration of visual and textual information in 23 languages.
+> [!TIP]
+> This model was contributed by [saurabhdash](https://huggingface.co/saurabhdash) and [yonigozlan](https://huggingface.co/yonigozlan).
+> 
+> Click on the Aya Vision models in the right sidebar for more examples of how to apply Aya Vision to different image-to-text tasks.

-<!-- <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/aya_vision_architecture.webp"
-alt="drawing" width="600"/>
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-<small> Aya Vision architecture. </small> -->
-
-Tips:
-
- Aya Vision is a multimodal model that takes images and text as input and produces text as output.
- Images are represented using the `<image>` tag in the templated input.
- For best results, use the `apply_chat_template` method of the processor to format your inputs correctly.
- The model can process multiple images in a single conversation.
- Aya Vision can understand and generate text in 23 languages, making it suitable for multilingual multimodal applications.
-
-This model was contributed by [saurabhdash](https://huggingface.co/saurabhdash) and [yonigozlan](https://huggingface.co/yonigozlan).
-
-
-## Usage
-
-Here's how to use Aya Vision for inference:
-
-```python
-from transformers import AutoProcessor, AutoModelForImageTextToText
-import torch
-
-model_id = "CohereForAI/aya-vision-8b"
-torch_device = "cuda:0"
-
-# Use fast image processor
-processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map=torch_device, torch_dtype=torch.float16
-)
-
-# Format message with the aya-vision chat template
-messages = [
-    {"role": "user",
-     "content": [
-       {"type": "image", "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium"},
-        {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"},
-    ]},
-    ]
-
-# Process image on CUDA
-inputs = processor.apply_chat_template(
-    messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", device=torch_device
-).to(model.device)
-
-gen_tokens = model.generate(
-    **inputs, 
-    max_new_tokens=300, 
-    do_sample=True, 
-    temperature=0.3,
-)
-
-gen_text = print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
-```
-### Pipeline
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
 from transformers import pipeline

-pipe = pipeline(model="CohereForAI/aya-vision-8b", task="image-text-to-text", device_map="auto")
+pipe = pipeline(model="CohereLabs/aya-vision-8b", task="image-text-to-text", device_map="auto")

 # Format message with the aya-vision chat template
 messages = [
@ -104,84 +54,108 @@ outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False)
 print(outputs)
 ```

-### Multiple Images and Batched Inputs
-
-Aya Vision can process multiple images in a single conversation. Here's how to use it with multiple images:
+</hfoption>
+<hfoption id="AutoModel">

 ```python
+# pip install 'git+https://github.com/huggingface/transformers.git@v4.49.0-Aya Vision'
 from transformers import AutoProcessor, AutoModelForImageTextToText
 import torch

-model_id = "CohereForAI/aya-vision-8b"
+model_id = "CohereLabs/aya-vision-8b"

 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="cuda:0", torch_dtype=torch.float16
+    model_id, device_map="auto", torch_dtype=torch.float16
 )

-# Example with multiple images in a single message
+# Format message with the aya-vision chat template
 messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
-            },
-            {
-                "type": "image",
-                "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
-            },
-            {
-                "type": "text",
-                "text": "These images depict two different landmarks. Can you identify them?",
-            },
-        ],
-    },
-]
+    {"role": "user",
+     "content": [
+       {"type": "image", "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium"},
+        {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"},
+    ]},
+    ]

 inputs = processor.apply_chat_template(
    messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
 ).to(model.device)

 gen_tokens = model.generate(
-    **inputs, 
-    max_new_tokens=300, 
-    do_sample=True, 
+    **inputs,
+    max_new_tokens=300,
+    do_sample=True,
    temperature=0.3,
 )

-gen_text = processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-print(gen_text)
+print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 ```

-For processing batched inputs (multiple conversations at once):
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory footprint of large models by representing weights at lower precision. Refer to the [Quantization](../quantization/overview) overview for supported backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.

 ```python
-from transformers import AutoProcessor, AutoModelForImageTextToText
 import torch
-
-model_id = "CohereForAI/aya-vision-8b"
-
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="cuda:0", torch_dtype=torch.float16
+from transformers import (
+    AutoProcessor,
+    AutoModelForImageTextToText,
+    BitsAndBytesConfig
 )

-# Prepare two different conversations
-batch_messages = [
-    # First conversation with a single image
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True
+)
+
+processor = AutoProcessor.from_pretrained("CohereLabs/aya-vision-32b", use_fast=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    "CohereLabs/aya-vision-32b",
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+inputs = processor.apply_chat_template(
    [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-                {"type": "text", "text": "Write a haiku for this image"},
-            ],
-        },
+    {"role": "user", "content": [
+        {"type": "image", "url": "https://huggingface.co/roschmid/dog-races/resolve/main/images/Border_Collie.jpg"},
+        {"type": "text",  "text":"Describe what you see."}
+    ]}
    ],
-    # Second conversation with multiple images
-    [
+    padding=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_tensors="pt"
+).to("cuda")
+
+generated = model.generate(**inputs, max_new_tokens=50)
+print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- Images are represented with the `<image>` tag in the chat template.
+
+- Use the [`~ProcessorMixin.apply_chat_template`] method to correctly format inputs.
+
+- The example below demonstrates inference with multiple images.
+  
+    ```py
+    from transformers import AutoProcessor, AutoModelForImageTextToText
+    import torch
+        
+    processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b")
+    model = AutoModelForImageTextToText.from_pretrained(
+        "CohereForAI/aya-vision-8b", device_map="cuda", torch_dtype=torch.float16
+    )
+    
+    messages = [
        {
            "role": "user",
            "content": [
@ -199,35 +173,88 @@ batch_messages = [
                },
            ],
        },
-    ],
-]
-
-# Process each conversation separately and combine into a batch
-batch_inputs = processor.apply_chat_template(
-    batch_messages, 
-    padding=True, 
-    add_generation_prompt=True, 
-    tokenize=True, 
-    return_dict=True, 
-    return_tensors="pt"
-).to(model.device)
-
-# Generate responses for the batch
-batch_outputs = model.generate(
-    **batch_inputs,
-    max_new_tokens=300,
-    do_sample=True,
-    temperature=0.3,
-)
-
-# Decode the generated responses
-for i, output in enumerate(batch_outputs):
-    response = processor.tokenizer.decode(
-        output[batch_inputs.input_ids.shape[1]:], 
-        skip_special_tokens=True
+    ]
+    
+    inputs = processor.apply_chat_template(
+        messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+    ).to("cuda")
+    
+    gen_tokens = model.generate(
+        **inputs, 
+        max_new_tokens=300, 
+        do_sample=True, 
+        temperature=0.3,
    )
-    print(f"Response {i+1}:\n{response}\n")
-```
+    
+    gen_text = processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    print(gen_text)
+    ```
+
+- The example below demonstrates inference with batched inputs.
+  
+    ```py
+    from transformers import AutoProcessor, AutoModelForImageTextToText
+    import torch
+        
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForImageTextToText.from_pretrained(
+        "CohereForAI/aya-vision-8b", device_map="cuda", torch_dtype=torch.float16
+    )
+    
+    batch_messages = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                    {"type": "text", "text": "Write a haiku for this image"},
+                ],
+            },
+        ],
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                    },
+                    {
+                        "type": "image",
+                        "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+                    },
+                    {
+                        "type": "text",
+                        "text": "These images depict two different landmarks. Can you identify them?",
+                    },
+                ],
+            },
+        ],
+    ]
+    
+    batch_inputs = processor.apply_chat_template(
+        batch_messages, 
+        padding=True, 
+        add_generation_prompt=True, 
+        tokenize=True, 
+        return_dict=True, 
+        return_tensors="pt"
+    ).to(model.device)
+    
+    batch_outputs = model.generate(
+        **batch_inputs,
+        max_new_tokens=300,
+        do_sample=True,
+        temperature=0.3,
+    )
+    
+    for i, output in enumerate(batch_outputs):
+        response = processor.tokenizer.decode(
+            output[batch_inputs.input_ids.shape[1]:], 
+            skip_special_tokens=True
+        )
+        print(f"Response {i+1}:\n{response}\n")
+    ```

 ## AyaVisionProcessor

--- a/docs/source/en/model_doc/bamba.md
+++ b/docs/source/en/model_doc/bamba.md
@ -14,84 +14,127 @@ rendered properly in your Markdown viewer.

 -->

-# Bamba
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Bamba

-Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality.
+[Bamba](https://huggingface.co/blog/bamba) is a 9B parameter decoder-only language model built on the [Mamba-2](./mamba2) architecture. It is pretrained in two stages - it starts by training on 2T tokens from the [Dolma v1.7](https://huggingface.co/datasets/allenai/dolma) dataset and then trained on an additional 200B tokens from [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [Cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia).

-Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba).
+You can find all the original Bamba checkpoints under the [Bamba](https://huggingface.co/collections/ibm-ai-platform/bamba-674f1388b9bbc98b413c7bab) collection.
+
+> [!TIP]
+> This model was contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
+>
+> Click on the Bamba models in the right sidebar for more examples of how to apply Bamba to different text generation tasks.
+
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="text-generation",
+    model="ibm-ai-platform/Bamba-9B-v2",
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+pipeline("Plants create energy through a process known as")
+```
+
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
+model = AutoModelForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B-v2", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="sdpa")
+input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+
+<hfoption id="transformers CLI">
+```bash
+echo "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ibm-ai-platform/Bamba-9B-v2 --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
+
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
+model = AutoModelForCausalLM.from_pretrained(
+   "ibm-ai-platform/Bamba-9B-v2",
+   quantization_config=quantization_config,
+   device_map="auto",
+   attn_implementation="sdpa"
+)
+
+inputs = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")
+output = model.generate(**inputs)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- Bamba supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate inference by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory-usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.
+
+  Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.
+
+  - `position_ids: torch.LongTensor`: the position index of each token in each sequence.
+  - `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
+  - Each of the [`FlashAttentionKwargs`]
+    - `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
+    - `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
+    - `max_length_q: int`: the longest query length in the batch.
+    - `max_length_k: int`: the longest key length in the batch.
+
+  The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.
+
+  ```python
+  from transformers import DataCollatorWithFlattening
+
+  # Example of using padding-free training
+  data_collator = DataCollatorWithFlattening(
+      tokenizer=tokenizer,
+      return_seq_idx=True,
+      return_flash_attn_kwargs=True
+  )
+  ```

 ## BambaConfig

-| Model            | Params       | # Layers | Hidden Dim. | Attention Heads | GQA | KV Heads | Context Length |  Tied Embeddings |
-|-------------------|--------------|----------|-------------|-----------------|-----|----------|----------------|------------------|
-| Bamba  | 9B (9.78B)   | 32       | 4096        | 32              | Yes | 8        | 4096           | True |
-
 [[autodoc]] BambaConfig

-<!---
-## Usage Tips
-
-Tips:
-
- The architecture is based on Mamba-2 models.
-
 ## BambaModel

 [[autodoc]] BambaModel
    - forward
-->

 ## BambaForCausalLM

-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B")
-tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B")
-
-message = ["Mamba is a snake with following properties  "]
-inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
-response = model.generate(**inputs, max_new_tokens=64)
-print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
-```
-
-
-## Padding-Free Training
-
-Bamba supports padding-free training in which distinct training examples can be concatenated
-together while nevertheless processing the inputs as though they belonged to separate batches. When
-the examples are of varying lengths, padding-free training can provide significant speed ups and
-memory savings compared to batching the examples together and using padding, as the unnecessary
-compute and memory due to padding is avoided entirely. The performance gains depend on factors such
-as the model and the data distribution, but throughput gains up to [~2x are commonly
-seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).
-
-Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
-packages, and the following arguments must be passed to the model in addition to `input_ids` and
-`labels`:
-* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
-* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
-* Each of the [`FlashAttentionKwargs`]
-    * `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
-    * `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
-    * `max_length_q: int`: the longest query length in the batch.
-    * `max_length_k: int`: the longest key length in the batch.
-
-The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
-to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
-`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
-for additional information.
-
-
 [[autodoc]] BambaForCausalLM
    - forward
-
-This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
--- a/docs/source/en/model_doc/bigbird_pegasus.md
+++ b/docs/source/en/model_doc/bigbird_pegasus.md
@ -14,59 +14,123 @@ rendered properly in your Markdown viewer.

 -->

-# BigBirdPegasus
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# BigBirdPegasus

-The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://huggingface.co/papers/2007.14062) by
-Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
-Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
-based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
-attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
-has been shown that applying sparse, global, and random attention approximates full attention, while being
-computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
-BigBird has shown improved performance on various long document NLP tasks, such as question answering and
-summarization, compared to BERT or RoBERTa.
+[BigBirdPegasus](https://huggingface.co/papers/2007.14062) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [BigBird](./big_bird) architecture with an additional pretraining objective borrowed from [Pegasus](./pegasus) called gap sequence generation (GSG). Whole sentences are masked and the model has to fill in the gaps in the document. BigBirdPegasus's ability to keep track of long contexts makes it effective at summarizing lengthy inputs, surpassing the performance of base Pegasus models.

-The abstract from the paper is the following:
+You can find all the original BigBirdPegasus checkpoints under the [Google](https://huggingface.co/google/models?search=bigbird-pegasus) organization.

-*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
-Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
-length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
-reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
-is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
-theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
-sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
-8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
-BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
-propose novel applications to genomics data.*
+> [!TIP]
+> This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta).
+>
+> Click on the BigBirdPegasus models in the right sidebar for more examples of how to apply BigBirdPegasus to different language tasks.

-The original code can be found [here](https://github.com/google-research/bigbird).
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird).
- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
-  **original_full** is advised as there is no benefit in using **block_sparse** attention.
- The code currently uses window size of 3 blocks and 2 global blocks.
- Sequence length must be divisible by block size.
- Current implementation supports only **ITC**.
- Current implementation doesn't support **num_random_blocks = 0**.
- BigBirdPegasus uses the [PegasusTokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/pegasus/tokenization_pegasus.py).
- BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
-  the left.
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="summarization",
+    model="google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.float32,
+    device=0
+)
+pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
+```
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/bigbird-pegasus-large-arxiv --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- BigBirdPegasus also uses the [`PegasusTokenizer`].
+- Inputs should be padded on the right because BigBird uses absolute position embeddings.
+- BigBirdPegasus supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs.
+- The current implementation uses window size of 3 blocks and 2 global blocks, only supports the ITC-implementation, and doesn't support `num_random_blocks=0`.
+- The sequence length must be divisible by the block size.

 ## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+Read the [Understanding BigBird's Block Sparse Attention](https://huggingface.co/blog/big-bird) blog post for more details about how BigBird's attention works.

 ## BigBirdPegasusConfig

--- a/docs/source/en/model_doc/blip.md
+++ b/docs/source/en/model_doc/blip.md
@ -14,35 +14,76 @@ rendered properly in your Markdown viewer.

 -->

-# BLIP
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    </div>
 </div>

-## Overview
+# BLIP

-The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/papers/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
+[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.

-BLIP is a model that is able to perform various multi-modal tasks including:
- Visual Question Answering 
- Image-Text retrieval (Image-text matching)
- Image Captioning

-The abstract from the paper is the following:
+You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.

-*Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. 
-However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. Code, models, and datasets are released.*
+> [!TIP]
+> This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
+> 
+> Click on the BLIP models in the right sidebar for more examples of how to apply BLIP to different vision language tasks.

-![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif)
+The example below demonstrates how to visual question answering with [`Pipeline`] or the [`AutoModel`] class.

-This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
-The original code can be found [here](https://github.com/salesforce/BLIP).
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="visual-question-answering",
+    model="Salesforce/blip-vqa-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+pipeline(question="What is the weather in this image?", image=url)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
+
+processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = AutoModelForVisualQuestionAnswering.from_pretrained(
+    "Salesforce/blip-vqa-base", 
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+question = "What is the weather in this image?"
+inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)
+
+output = model.generate(**inputs)
+processor.batch_decode(output, skip_special_tokens=True)[0]
+```
+
+</hfoption>
+</hfoptions>

 ## Resources

- [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) on how to fine-tune BLIP for image captioning on a custom dataset
+Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) to learn how to fine-tune BLIP for image captioning on a custom dataset.

 ## BlipConfig

--- a/docs/source/en/model_doc/bros.md
+++ b/docs/source/en/model_doc/bros.md
@ -62,11 +62,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-    # encode(tokenize) each word from words (List[str])
-    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+    # encode(tokenize) each word from words (list[str])
+    input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

    # get the length of each box
-    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+    tokens_length_list: list[int] = [len(l) for l in input_ids_list]

    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@ -191,6 +191,11 @@ model = ChameleonForConditionalGeneration.from_pretrained(
 [[autodoc]] ChameleonImageProcessor
    - preprocess

+## ChameleonImageProcessorFast
+
+[[autodoc]] ChameleonImageProcessorFast
+    - preprocess
+
 ## ChameleonVQVAE

 [[autodoc]] ChameleonVQVAE
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -3,6 +3,7 @@
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@ -4,6 +4,7 @@
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/cvt.md
+++ b/docs/source/en/model_doc/cvt.md
@ -14,49 +14,77 @@ rendered properly in your Markdown viewer.

 -->

-# Convolutional Vision Transformer (CvT)
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    </div>
 </div>

-## Overview
+# Convolutional Vision Transformer (CvT)

-The CvT model was proposed in [CvT: Introducing Convolutions to Vision Transformers](https://huggingface.co/papers/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan and Lei Zhang. The Convolutional vision Transformer (CvT) improves the [Vision Transformer (ViT)](vit) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs.
+Convolutional Vision Transformer (CvT) is a model that combines the strengths of convolutional neural networks (CNNs) and Vision transformers for the computer vision tasks. It introduces convolutional layers into the vision transformer architecture, allowing it to capture local patterns in images while maintaining the global context provided by self-attention mechanisms.

-The abstract from the paper is the following:
+You can find all the CvT checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=cvt) organization.

-*We present in this paper a new architecture, named Convolutional vision Transformer (CvT), that improves Vision Transformer (ViT) 
-in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. This is accomplished through 
-two primary modifications: a hierarchy of Transformers containing a new convolutional token embedding, and a convolutional Transformer 
-block leveraging a convolutional projection. These changes introduce desirable properties of convolutional neural networks (CNNs) 
-to the ViT architecture (\ie shift, scale, and distortion invariance) while maintaining the merits of Transformers (\ie dynamic attention, 
-global context, and better generalization). We validate CvT by conducting extensive experiments, showing that this approach achieves 
-state-of-the-art performance over other Vision Transformers and ResNets on ImageNet-1k, with fewer parameters and lower FLOPs. In addition, 
-performance gains are maintained when pretrained on larger datasets (\eg ImageNet-22k) and fine-tuned to downstream tasks. Pre-trained on 
-ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7\% on the ImageNet-1k val set. Finally, our results show that the positional encoding, 
-a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks.*
+> [!TIP]
+> This model was contributed by [anujunj](https://huggingface.co/anugunj).
+> 
+> Click on the CvT models in the right sidebar for more examples of how to apply CvT to different computer vision tasks.

-This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/microsoft/CvT).
+The example below demonstrates how to classify an image with [`Pipeline`] or the [`AutoModel`] class.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- CvT models are regular Vision Transformers, but trained with convolutions. They outperform the [original model (ViT)](vit) when fine-tuned on ImageNet-1K and CIFAR-100.
- You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`AutoImageProcessor`] and [`ViTForImageClassification`] by [`CvtForImageClassification`]).
- The available checkpoints are either (1) pre-trained on [ImageNet-22k](http://www.image-net.org/) (a collection of 14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on [ImageNet-1k](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million
-  images and 1,000 classes).
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="image-classification",
+    model="microsoft/cvt-13",
+    torch_dtype=torch.float16,
+    device=0 
+)
+pipeline(images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+import requests
+from PIL import Image
+from transformers import AutoModelForImageClassification, AutoImageProcessor
+
+image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+model = AutoModelForImageClassification.from_pretrained(
+    "microsoft/cvt-13",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = image_processor(image, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+  logits = model(**inputs).logits
+predicted_class_id = logits.argmax(dim=-1).item()
+
+class_labels = model.config.id2label
+predicted_class_label = class_labels[predicted_class_id]
+print(f"The predicted class label is: {predicted_class_label}")
+```
+
+</hfoption>
+</hfoptions>

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CvT.
-
-<PipelineTag pipeline="image-classification"/>
-
- [`CvtForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+Refer to this set of ViT [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) for examples of inference and fine-tuning on custom datasets. Replace [`ViTFeatureExtractor`] and [`ViTForImageClassification`] in these notebooks with [`AutoImageProcessor`] and [`CvtForImageClassification`].

 ## CvtConfig

--- a/docs/source/en/model_doc/deberta-v2.md
+++ b/docs/source/en/model_doc/deberta-v2.md
@ -14,66 +14,111 @@ rendered properly in your Markdown viewer.

 -->

-# DeBERTa-v2
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    </div>
 </div>

-## Overview

-The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's
-BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
+# DeBERTa-v2

-It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
-RoBERTa.
+[DeBERTa-v2](https://huggingface.co/papers/2006.03654) improves on the original [DeBERTa](./deberta) architecture by using a SentencePiece-based tokenizer and a new vocabulary size of 128K. It also adds an additional convolutional layer within the first transformer layer to better learn local dependencies of input tokens. Finally, the position projection and content projection matrices are shared in the attention layer to reduce the number of parameters.

-The abstract from the paper is the following:
-
-*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
-language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
-disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
-disentangled attention mechanism, where each word is represented using two vectors that encode its content and
-position, respectively, and the attention weights among words are computed using disentangled matrices on their
-contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
-predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
-the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
-(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
-pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
+You can find all the original [DeBERTa-v2] checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=deberta-v2) organization.


-The following information is visible directly on the [original implementation
-repository](https://github.com/microsoft/DeBERTa). DeBERTa v2 is the second version of the DeBERTa model. It includes
-the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
-find more details about this submission in the authors'
-[blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)
+> [!TIP]
+> This model was contributed by [Pengcheng He](https://huggingface.co/DeBERTa).
+>
+> Click on the DeBERTa-v2 models in the right sidebar for more examples of how to apply DeBERTa-v2 to different language tasks.

-New in v2:
+The example below demonstrates how to classify text with [`Pipeline`] or the [`AutoModel`] class.

- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
-  Instead of a GPT2-based tokenizer, the tokenizer is now
-  [sentencepiece-based](https://github.com/google/sentencepiece) tokenizer.
- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
-  transformer layer to better learn the local dependency of input tokens.
- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
-  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
-  similar to T5.
- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
-  performance of downstream tasks.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-This model was contributed by [DeBERTa](https://huggingface.co/DeBERTa). This model TF 2.0 implementation was
-contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/DeBERTa).
+```py
+import torch
+from transformers import pipeline

-## Resources
+pipeline = pipeline(
+    task="text-classification",
+    model="microsoft/deberta-v2-xlarge-mnli",
+    device=0,
+    torch_dtype=torch.float16
+)
+result = pipeline("DeBERTa-v2 is great at understanding context!")
+print(result)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "microsoft/deberta-v2-xlarge-mnli"
+)
+model = AutoModelForSequenceClassification.from_pretrained(
+    "microsoft/deberta-v2-xlarge-mnli",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+
+logits = outputs.logits
+predicted_class_id = logits.argmax().item()
+predicted_label = model.config.id2label[predicted_class_id]
+print(f"Predicted label: {predicted_label}")
+
+```
+
+</hfoption>
+
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "DeBERTa-v2 is great at understanding context!" | transformers-cli run --task fill-mask --model microsoft/deberta-v2-xlarge-mnli --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes quantization](../quantization/bitsandbytes) to only quantize the weights to 4-bit.
+
+```py
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "microsoft/deberta-v2-xlarge-mnli"
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16",
+    bnb_4bit_use_double_quant=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_id,
+    quantization_config=quantization_config,
+    torch_dtype="float16"
+)
+
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+logits = outputs.logits
+predicted_class_id = logits.argmax().item()
+predicted_label = model.config.id2label[predicted_class_id]
+print(f"Predicted label: {predicted_label}")
+
+```

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)

 ## DebertaV2Config

--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@ -149,7 +149,7 @@ As a summary, consider the following table:
 | **Description** | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as "stuff" (i.e. background things like trees and roads) in an image |
 | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
 | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic  |                                                                        |
-| **Format of annotations to provide to**  [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation  | {'image_id': `int`, 'annotations': `List[Dict]`}  (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to**  [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation  | {'image_id': `int`, 'annotations': `list[Dict]`}  (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -0,0 +1,162 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Dia
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+## Overview
+
+Dia is an opensource text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
+It can generate highly realistic dialogue from transcript including nonverbal communications such as laughter and coughing.
+Furthermore, emotion and tone control is also possible via audio conditioning (voice cloning).
+
+**Model Architecture:**
+Dia is an encoder-decoder transformer based on the original transformer architecture. However, some more modern features such as
+rotational positional embeddings (RoPE) are also included. For its text portion (encoder), a byte tokenizer is utilized while
+for the audio portion (decoder), a pretrained codec model [DAC](./dac.md) is used - DAC encodes speech into discrete codebook
+tokens and decodes them back into audio.
+
+## Usage Tips
+
+### Generation with Text
+
+```python
+from transformers import AutoProcessor, DiaForConditionalGeneration
+
+torch_device = "cuda"
+model_checkpoint = "buttercrab/dia-v1-1.6b"
+
+text = ["[S1] Dia is an open weights text to dialogue model."]
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+inputs = processor(text=text, padding=True, return_tensors="pt").to(torch_device)
+
+model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
+outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to around ~2s
+
+# save audio to a file
+outputs = processor.batch_decode(outputs)
+processor.save_audio(outputs, "example.wav")
+
+```
+
+### Generation with Text and Audio (Voice Cloning)
+
+```python
+from datasets import load_dataset, Audio
+from transformers import AutoProcessor, DiaForConditionalGeneration
+
+torch_device = "cuda"
+model_checkpoint = "buttercrab/dia-v1-1.6b"
+
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+ds = ds.cast_column("audio", Audio(sampling_rate=44100))
+audio = ds[-1]["audio"]["array"]
+# text is a transcript of the audio + additional text you want as new audio
+text = ["[S1] I know. It's going to save me a lot of money, I hope. [S2] I sure hope so for you."]
+
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+inputs = processor(text=text, audio=audio, padding=True, return_tensors="pt").to(torch_device)
+prompt_len = processor.get_audio_prompt_len(inputs["decoder_attention_mask"])
+
+model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
+outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to around ~2s
+
+# retrieve actually generated audio and save to a file
+outputs = processor.batch_decode(outputs, audio_prompt_len=prompt_len)
+processor.save_audio(outputs, "example_with_audio.wav")
+```
+
+### Training
+
+```python
+from datasets import load_dataset, Audio
+from transformers import AutoProcessor, DiaForConditionalGeneration
+
+torch_device = "cuda"
+model_checkpoint = "buttercrab/dia-v1-1.6b"
+
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+ds = ds.cast_column("audio", Audio(sampling_rate=44100))
+audio = ds[-1]["audio"]["array"]
+# text is a transcript of the audio
+text = ["[S1] I know. It's going to save me a lot of money, I hope."]
+
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+inputs = processor(
+    text=text,
+    audio=audio,
+    generation=False,
+    output_labels=True,
+    padding=True,
+    return_tensors="pt"
+).to(torch_device)
+
+model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
+out = model(**inputs)
+out.loss.backward()
+```
+
+
+This model was contributed by [Jaeyong Sung](https://huggingface.co/buttercrab), [Arthur Zucker](https://huggingface.co/ArthurZ),
+and [Anton Vlasjuk](https://huggingface.co/AntonV). The original code can be found [here](https://github.com/nari-labs/dia/).
+
+
+## DiaConfig
+
+[[autodoc]] DiaConfig
+
+## DiaDecoderConfig
+
+[[autodoc]] DiaDecoderConfig
+
+## DiaEncoderConfig
+
+[[autodoc]] DiaEncoderConfig
+
+## DiaTokenizer
+
+[[autodoc]] DiaTokenizer
+    - __call__
+
+## DiaFeatureExtractor
+
+[[autodoc]] DiaFeatureExtractor
+    - __call__
+
+## DiaProcessor
+
+[[autodoc]] DiaProcessor
+    - __call__
+    - batch_decode
+    - decode
+
+## DiaModel
+
+[[autodoc]] DiaModel
+    - forward
+
+## DiaForConditionalGeneration
+
+[[autodoc]] DiaForConditionalGeneration
+    - forward
+    - generate
--- a/docs/source/en/model_doc/dots1.md
+++ b/docs/source/en/model_doc/dots1.md
@ -0,0 +1,40 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# dots.llm1
+
+## Overview
+
+The `dots.llm1` model was proposed in [dots.llm1 technical report](https://www.arxiv.org/pdf/2506.05767) by rednote-hilab team.
+
+The abstract from the report is the following:
+
+*Mixture of Experts (MoE) models have emerged as a promising paradigm for scaling language models efficiently by activating only a subset of parameters for each input token. In this report, we present dots.llm1, a large-scale MoE model that activates 14B parameters out of a total of 142B parameters, delivering performance on par with state-of-the-art models while reducing training and inference costs. Leveraging our meticulously crafted and efficient data processing pipeline, dots.llm1 achieves performance comparable to Qwen2.5-72B after pretraining on high-quality corpus and post-training to fully unlock its capabilities. Notably, no synthetic data is used during pretraining. To foster further research, we open-source intermediate training checkpoints spanning the entire training process, providing valuable insights into the learning dynamics of large language models.*
+
+
+## Dots1Config
+
+[[autodoc]] Dots1Config
+
+## Dots1Model
+
+[[autodoc]] Dots1Model
+    - forward
+
+## Dots1ForCausalLM
+
+[[autodoc]] Dots1ForCausalLM
+    - forward
--- a/docs/source/en/model_doc/dpt.md
+++ b/docs/source/en/model_doc/dpt.md
@ -78,7 +78,13 @@ If you're interested in submitting a resource to be included here, please feel f

 [[autodoc]] DPTImageProcessor
    - preprocess
+
+## DPTImageProcessorFast
+
+[[autodoc]] DPTImageProcessorFast
+    - preprocess
    - post_process_semantic_segmentation
+    - post_process_depth_estimation

 ## DPTModel

--- a/docs/source/en/model_doc/eomt.md
+++ b/docs/source/en/model_doc/eomt.md
@ -0,0 +1,214 @@
+<!--Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# EoMT
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+The Encoder-only Mask Transformer (EoMT) model was introduced in the CVPR 2025 Highlight Paper [Your ViT is Secretly an Image Segmentation Model](https://www.tue-mps.org/eomt) by Tommie Kerssies, Niccolò Cavagnero, Alexander Hermans, Narges Norouzi, Giuseppe Averta, Bastian Leibe, Gijs Dubbelman, and Daan de Geus.
+EoMT reveals Vision Transformers can perform image segmentation efficiently without task-specific components.
+
+The abstract from the paper is the following:
+
+*Vision Transformers (ViTs) have shown remarkable performance and scalability across various computer vision tasks. To apply single-scale ViTs to image segmentation, existing methods adopt a convolutional adapter to generate multi-scale features, a pixel decoder to fuse these features, and a Transformer decoder that uses the fused features to make predictions. In this paper, we show that the inductive biases introduced by these task-specific components can instead be learned by the ViT itself, given sufficiently large models and extensive pre-training. Based on these findings, we introduce the Encoder-only Mask Transformer (EoMT), which repurposes the plain ViT architecture to conduct image segmentation. With large-scale models and pre-training, EoMT obtains a segmentation accuracy similar to state-of-the-art models that use task-specific components. At the same time, EoMT is significantly faster than these methods due to its architectural simplicity, e.g., up to 4x faster with ViT-L. Across a range of model sizes, EoMT demonstrates an optimal balance between segmentation accuracy and prediction speed, suggesting that compute resources are better spent on scaling the ViT itself rather than adding architectural complexity.*
+
+This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
+The original code can be found [here](https://github.com/tue-mps/eomt).
+
+## Architecture Info
+
+The `EoMT` model uses a DINOv2-pretrained Vision Transformer with **register tokens** as its backbone. EoMT simplifies the segmentation pipeline by relying solely on the encoder, eliminating the need for task-specific decoders commonly used in prior approaches.
+
+Architecturally, EoMT introduces a small set of **learned queries** and a lightweight **mask prediction module**. These queries are injected into the final encoder blocks, enabling **joint attention** between image patches and object queries. During training, **masked attention** is applied to constrain each query to focus on its corresponding region—effectively mimicking cross-attention. This constraint is gradually phased out via a **mask annealing strategy**, allowing for **efficient, decoder-free inference** without compromising segmentation performance.
+
+<div style="text-align: center;">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/eomt_architecture.png"
+       alt="drawing" width="500"/>
+</div>
+
+
+The model supports semantic, instance, and panoptic segmentation using a unified architecture and task-specific post-processing.
+
+## Usage Examples
+
+Use the Hugging Face implementation of EoMT for inference with pre-trained models.
+
+### Semantic Segmentation
+
+The EoMT model performs semantic segmentation using sliding-window inference. The input image is resized such that the shorter side matches the target input size, then it is split into overlapping crops. Each crop is then passed through the model. After inference, the predicted logits from each crop are stitched back together and rescaled to the original image size to get the final segmentation mask.
+
+> **Note:**  
+> If you want to use a custom target size for **semantic segmentation**, specify it in the following format:  
+> `{"shortest_edge": 512}`  
+> Notice that `longest_edge` is not provided here — this is intentional. For semantic segmentation, images are typically **scaled so that the shortest edge is greater than or equal to the target size** hence longest_edge is not necessary.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/ade20k_semantic_eomt_large_512"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+# Remove Patch Offsets from inputs — only used later for post-processing.
+patch_offsets = inputs.pop("patch_offsets")
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+original_image_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_semantic_segmentation(
+    outputs,
+    patch_offsets=patch_offsets,
+    original_image_sizes=original_image_sizes,
+)
+
+# Visualize the segmentation mask
+plt.imshow(preds[0])
+plt.axis("off")
+plt.title("Semantic Segmentation")
+plt.show()
+```
+
+### Instance Segmentation
+
+The EoMT model performs instance segmentation using padded inference. The input image is resized so that the longer side matches the target input size, and the shorter side is zero-padded to form a square. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified instance segmentation map, along with segment metadata like segment id, class labels and confidence scores.
+
+> **Note:**  
+> To use a custom target size, specify the size as a dictionary in the following format:  
+> `{"shortest_edge": 512, "longest_edge": 512}`  
+> For both instance and panoptic segmentation, input images will be **scaled and padded** to this target size.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/coco_instance_eomt_large_640"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+original_image_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_instance_segmentation(
+    outputs,
+    original_image_sizes=original_image_sizes,
+)
+
+# Visualize the segmentation mask
+plt.imshow(preds[0]["segmentation"])
+plt.axis("off")
+plt.title("Instance Segmentation")
+plt.show()
+```
+
+### Panoptic Segmentation
+
+The EoMT model performs panoptic segmentation using the same padded inference strategy as in instance segmentation. After padding and normalization, the model predicts both thing (instances) and stuff (amorphous regions) classes. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified panoptic segmentation map, along with segment metadata like segment id, class labels and confidence scores.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/coco_panoptic_eomt_large_640"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+original_image_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_panoptic_segmentation(
+    outputs,
+    original_image_sizes=original_image_sizes,
+)
+
+# Visualize the panoptic segmentation mask
+plt.imshow(preds[0]["segmentation"])
+plt.axis("off")
+plt.title("Panoptic Segmentation")
+plt.show()
+```
+
+## EomtImageProcessor
+
+[[autodoc]] EomtImageProcessor
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## EomtImageProcessorFast
+
+[[autodoc]] EomtImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## EomtConfig
+
+[[autodoc]] EomtConfig
+
+## EomtForUniversalSegmentation
+
+[[autodoc]] EomtForUniversalSegmentation
+    - forward
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -23,6 +23,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -22,6 +22,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -0,0 +1,204 @@
+
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# Gemma3n
+
+## Overview
+
+Gemma3n is a multimodal model with pretrained and instruction-tuned variants, available in E4B and E2B sizes. While
+large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
+this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
+[MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses
+a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
+every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
+[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
+trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
+
+The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.
+
+You can find all the original Gemma 3n checkpoints under the [Gemma 3n][gemma3n-collection] release.
+
+> [!TIP]
+> Click on the Gemma 3n models in the right sidebar for more examples of how to apply Gemma to different vision, audio,
+> and language tasks.
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="image-text-to-text",
+    model="google/gemma-3n-e4b",
+    device=0,
+    torch_dtype=torch.bfloat16
+)
+pipeline(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+    text="<start_of_image> What is shown in this image?"
+)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+
+model = Gemma3nForConditionalGeneration.from_pretrained(
+    "google/gemma-3n-e4b-it",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+processor = AutoProcessor.from_pretrained(
+    "google/gemma-3n-e4b-it",
+    padding_side="left"
+)
+
+messages = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a helpful assistant."}
+        ]
+    },
+    {
+        "role": "user", "content": [
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ]
+    },
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+).to("cuda")
+
+output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")
+print(processor.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3n-e2b --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+-   Use [`Gemma3nForConditionalGeneration`] for image-audio-and-text, image-and-text, image-and-audio, audio-and-text,
+    image-only and aduio-only inputs.
+-   Gemma 3n supports multiple images per input, but make sure the images are correctly batched before passing them to
+    the processor. Each batch should be a list of one or more images.
+
+    ```py
+    url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="
+    url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+
+    messages =[
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "url": url_cow},
+                {"type": "image", "url": url_cat},
+                {"type": "text", "text": "Which image is cuter?"},
+            ]
+        },
+    ]
+    ```
+-   Text passed to the processor should have a `<image_soft_token>` token wherever an image should be inserted.
+-   Gemma 3n accept at most one target audio clip per input, though multiple audio clips can be provided in few-shot
+    prompts, for example.
+-   Text passed to the processor should have a `<audio_soft_token>` token wherever an audio clip should be inserted.
+-   The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs.
+
+## Gemma3nAudioFeatureExtractor
+
+[[autodoc]] Gemma3nAudioFeatureExtractor
+
+## Gemma3nProcessor
+
+[[autodoc]] Gemma3nProcessor
+
+## Gemma3nTextConfig
+
+[[autodoc]] Gemma3nTextConfig
+
+## Gemma3nVisionConfig
+
+[[autodoc]] Gemma3nVisionConfig
+
+## Gemma3nAudioConfig
+
+[[autodoc]] Gemma3nAudioConfig
+
+## Gemma3nConfig
+
+[[autodoc]] Gemma3nConfig
+
+## Gemma3nTextModel
+
+[[autodoc]] Gemma3nTextModel
+    - forward
+
+## Gemma3nModel
+
+[[autodoc]] Gemma3nModel
+    - forward
+
+## Gemma3nForCausalLM
+
+[[autodoc]] Gemma3nForCausalLM
+    - forward
+
+## Gemma3nForConditionalGeneration
+
+[[autodoc]] Gemma3nForConditionalGeneration
+    - forward
+
+[altup]: https://proceedings.neurips.cc/paper_files/paper/2023/hash/f2059277ac6ce66e7e5543001afa8bb5-Abstract-Conference.html
+[attention-mask-viz]: https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139
+[gemma3n-collection]: https://huggingface.co/collections/google/gemma-3n
+[laurel]: https://arxiv.org/abs/2411.07501
+[matformer]: https://arxiv.org/abs/2310.07707
+[usm]: https://arxiv.org/abs/2303.01037
--- a/docs/source/en/model_doc/glm.md
+++ b/docs/source/en/model_doc/glm.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/glm4v.md
+++ b/docs/source/en/model_doc/glm4v.md
@ -0,0 +1,180 @@
+<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">    </div>
+</div>
+
+# GLM-4.1V
+
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+pipe = pipeline(
+    task="image-text-to-text",
+    model="THUDM/GLM-4.1V-9B-Thinking",
+    device=0,
+    torch_dtype=torch.bfloat16
+)
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+            },
+            { "type": "text", "text": "Describe this image."},
+        ]
+    }
+]
+pipe(text=messages,max_new_tokens=20, return_full_text=False)
+```
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import Glm4vForConditionalGeneration, AutoProcessor
+
+model = Glm4vForConditionalGeneration.from_pretrained(
+    "THUDM/GLM-4.1V-9B-Thinking",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+messages = [
+    {
+        "role":"user",
+        "content":[
+            {
+                "type":"image",
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+            },
+            {
+                "type":"text",
+                "text":"Describe this image."
+            }
+        ]
+    }
+
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt"
+).to("cuda")
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+       generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
+```
+</hfoption>
+</hfoptions>
+
+Using GLM-4.1V with video input is similar to using it with image input.
+The model can process video data and generate text based on the content of the video.
+
+```python
+from transformers import AutoProcessor, Glm4vForConditionalGeneration
+import torch
+
+processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+model = Glm4vForConditionalGeneration.from_pretrained(
+    pretrained_model_name_or_path="THUDM/GLM-4.1V-9B-Thinking",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda:0"
+)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
+            },
+            {
+                "type": "text",
+                "text": "discribe this video",
+            },
+        ],
+    }
+]
+inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True).to("cuda:0")
+generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=1.0)
+output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+print(output_text)
+```
+
+## Glm4vConfig
+
+[[autodoc]] Glm4vConfig
+
+## Glm4vTextConfig
+
+[[autodoc]] Glm4vTextConfig
+
+## Glm4vImageProcessor
+
+[[autodoc]] Glm4vImageProcessor
+    - preprocess
+
+## Glm4vVideoProcessor
+
+[[autodoc]] Glm4vVideoProcessor
+    - preprocess
+
+## Glm4vImageProcessorFast
+
+[[autodoc]] Glm4vImageProcessorFast
+    - preprocess
+
+## Glm4vProcessor
+
+[[autodoc]] Glm4vProcessor
+
+## Glm4vTextModel
+
+[[autodoc]] Glm4vTextModel
+    - forward
+
+## Glm4vModel
+
+[[autodoc]] Glm4vModel
+    - forward
+
+## Glm4vForConditionalGeneration
+
+[[autodoc]] Glm4vForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/granite.md
+++ b/docs/source/en/model_doc/granite.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Granite
--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@ -162,7 +162,7 @@ To load and run a model using Flash Attention-2, simply change the code snippet
 ```diff
 model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
-+    torch_dtype=torch.float16,    
+    torch_dtype=torch.float16,
 +    attn_implementation="flash_attention_2",
 ).to(device)
 ```
@ -184,7 +184,7 @@ Quantizing a model is as simple as passing a `quantization_config` to the model.
 + )
 model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
-+    torch_dtype=torch.float16,    
+    torch_dtype=torch.float16,
 +    quantization_config=quantization_config,
 ).to(device)
 ```
@ -218,7 +218,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] Idefics2ImageProcessor
    - preprocess

+## Idefics2ImageProcessorFast
+[[autodoc]] Idefics2ImageProcessorFast
+    - preprocess

 ## Idefics2Processor
 [[autodoc]] Idefics2Processor
-    - __call__
+    - __call__
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@ -80,6 +80,9 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
 [[autodoc]] Idefics3ImageProcessor
    - preprocess

+## Idefics3ImageProcessorFast
+[[autodoc]] Idefics3ImageProcessorFast
+    - preprocess

 ## Idefics3Processor
 [[autodoc]] Idefics3Processor
--- a/docs/source/en/model_doc/kyutai_speech_to_text.md
+++ b/docs/source/en/model_doc/kyutai_speech_to_text.md
@ -0,0 +1,122 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Kyutai Speech-To-Text 
+## Overview
+
+Kyutai STT is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai’s lab has released two model checkpoints:
+- [kyutai/stt-1b-en_fr](https://huggingface.co/kyutai/stt-1b-en_fr): a 1B-parameter model capable of transcribing both English and French
+- [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en): a 2.6B-parameter model focused solely on English, optimized for maximum transcription accuracy
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/kyutai_stt.png"/>
+</div>
+
+## Usage Tips
+
+### Inference
+
+```python
+import torch
+from datasets import load_dataset, Audio
+from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+# 1. load the model and the processor
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+model_id = "kyutai/stt-2.6b-en-trfs"
+
+processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
+
+# 2. load audio samples
+ds = load_dataset(
+    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+)
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+
+# 3. prepare the model inputs
+inputs = processor(
+    ds[0]["audio"]["array"],
+)
+inputs.to(torch_device)
+
+# 4. infer the model
+output_tokens = model.generate(**inputs)
+
+# 5. decode the generated tokens
+print(processor.batch_decode(output_tokens, skip_special_tokens=True))
+```
+
+### Batched Inference
+
+```python
+import torch
+from datasets import load_dataset, Audio
+from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+# 1. load the model and the processor
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+model_id = "kyutai/stt-2.6b-en-trfs"
+
+processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
+
+# 2. load audio samples
+ds = load_dataset(
+    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+)
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+
+# 3. prepare the model inputs
+audio_arrays = [ds[i]["audio"]["array"] for i in range(4)]
+inputs = processor(audio_arrays, return_tensors="pt", padding=True)
+inputs = inputs.to(torch_device)
+
+# 4. infer the model
+output_tokens = model.generate(**inputs)
+
+# 5. decode the generated tokens
+decoded_outputs = processor.batch_decode(output_tokens, skip_special_tokens=True)
+for output in decoded_outputs:
+    print(output)
+```
+
+This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
+The original code can be found [here](https://github.com/kyutai-labs/moshi).
+
+
+## KyutaiSpeechToTextConfig
+
+[[autodoc]] KyutaiSpeechToTextConfig
+
+## KyutaiSpeechToTextProcessor
+
+[[autodoc]] KyutaiSpeechToTextProcessor
+    - __call__
+
+## KyutaiSpeechToTextFeatureExtractor
+
+[[autodoc]] KyutaiSpeechToTextFeatureExtractor
+
+## KyutaiSpeechToTextForConditionalGeneration
+
+[[autodoc]] KyutaiSpeechToTextForConditionalGeneration
+    - forward
+    - generate
+
+## KyutaiSpeechToTextModel
+
+[[autodoc]] KyutaiSpeechToTextModel
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@ -0,0 +1,104 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the MIT License; you may not use this file except in compliance with
+the License.
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+
+-->
+
+# LightGlue
+
+## Overview
+
+The LightGlue model was proposed in [LightGlue: Local Feature Matching at Light Speed](https://arxiv.org/abs/2306.13643)
+by Philipp Lindenberger, Paul-Edouard Sarlin and Marc Pollefeys.
+
+Similar to [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor), this model consists of matching
+two sets of local features extracted from two images, its goal is to be faster than SuperGlue. Paired with the 
+[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and 
+estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.
+
+The abstract from the paper is the following:
+
+*We introduce LightGlue, a deep neural network that learns to match local features across images. We revisit multiple
+design decisions of SuperGlue, the state of the art in sparse matching, and derive simple but effective improvements. 
+Cumulatively, they make LightGlue more efficient - in terms of both memory and computation, more accurate, and much
+easier to train. One key property is that LightGlue is adaptive to the difficulty of the problem: the inference is much
+faster on image pairs that are intuitively easy to match, for example because of a larger visual overlap or limited
+appearance change. This opens up exciting prospects for deploying deep matchers in latency-sensitive applications like
+3D reconstruction. The code and trained models are publicly available at this [https URL](https://github.com/cvg/LightGlue)*
+
+## How to use
+
+Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. 
+The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding 
+matching scores.
+```python
+from transformers import AutoImageProcessor, AutoModel
+import torch
+from PIL import Image
+import requests
+
+url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
+image1 = Image.open(requests.get(url_image1, stream=True).raw)
+url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
+image2 = Image.open(requests.get(url_image2, stream=True).raw)
+
+images = [image1, image2]
+
+processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
+model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
+
+inputs = processor(images, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+```
+
+You can use the `post_process_keypoint_matching` method from the `LightGlueImageProcessor` to get the keypoints and matches in a readable format:
+```python
+image_sizes = [[(image.height, image.width) for image in images]]
+outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+for i, output in enumerate(outputs):
+    print("For the image pair", i)
+    for keypoint0, keypoint1, matching_score in zip(
+            output["keypoints0"], output["keypoints1"], output["matching_scores"]
+    ):
+        print(
+            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
+        )
+```
+
+You can visualize the matches between the images by providing the original images as well as the outputs to this method:
+```python
+processor.plot_keypoint_matching(images, outputs)
+```
+
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/duPp09ty8NRZlMZS18ccP.png)
+
+This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+The original code can be found [here](https://github.com/cvg/LightGlue).
+
+## LightGlueConfig
+
+[[autodoc]] LightGlueConfig
+
+## LightGlueImageProcessor
+
+[[autodoc]] LightGlueImageProcessor
+
+- preprocess
+- post_process_keypoint_matching
+- plot_keypoint_matching
+
+## LightGlueForKeypointMatching
+
+[[autodoc]] LightGlueForKeypointMatching
+
+- forward
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@ -21,6 +21,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
        ">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama3.md
+++ b/docs/source/en/model_doc/llama3.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ```py3
--- a/docs/source/en/model_doc/llama4.md
+++ b/docs/source/en/model_doc/llama4.md
@ -21,6 +21,7 @@ rendered properly in your Markdown viewer.
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/longformer.md
+++ b/docs/source/en/model_doc/longformer.md
@ -83,7 +83,7 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t
 ```

 </hfoption>
-</hfoptions
+</hfoptions>


 ## Notes
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -22,6 +22,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/mobilevit.md
+++ b/docs/source/en/model_doc/mobilevit.md
@ -95,6 +95,12 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

+## MobileViTImageProcessorFast
+
+[[autodoc]] MobileViTImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/nougat.md
+++ b/docs/source/en/model_doc/nougat.md
@ -107,6 +107,11 @@ The model is identical to [Donut](donut) in terms of architecture.
 [[autodoc]] NougatImageProcessor
    - preprocess

+## NougatImageProcessorFast
+
+[[autodoc]] NougatImageProcessorFast
+    - preprocess
+
 ## NougatTokenizerFast

 [[autodoc]] NougatTokenizerFast
--- a/docs/source/en/model_doc/olmo.md
+++ b/docs/source/en/model_doc/olmo.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/pegasus_x.md
+++ b/docs/source/en/model_doc/pegasus_x.md
@ -14,35 +14,115 @@ rendered properly in your Markdown viewer.

 -->

-# PEGASUS-X
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    </div>
 </div>

-## Overview
+# PEGASUS-X

-The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://huggingface.co/papers/2208.04347)  by Jason Phang, Yao Zhao and Peter J. Liu.
+[PEGASUS-X](https://huggingface.co/papers/2208.04347) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [Pegasus](./pegasus) model with staggered block-local attention, global encoder tokens, and additional pretraining on long text sequences, enabling it to handle inputs of up to 16,000 tokens. PEGASUS-X matches the performance of much larger models while using fewer parameters.

-PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder.
+You can find all the original PEGASUS-X checkpoints under the [Google](https://huggingface.co/google/models?search=pegasus-x) organization.

-The abstract from the paper is the following:
+> [!TIP]
+> This model was contributed by [zphang](https://huggingface.co/zphang).
+>
+> Click on the PEGASUS-X models in the right sidebar for more examples of how to apply PEGASUS-X to different language tasks.

-*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.*
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

-This model was contributed by [zphang](https://huggingface.co/zphang). The original code can be found [here](https://github.com/google-research/pegasus).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-## Documentation resources
+```py
+import torch
+from transformers import pipeline

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+pipeline = pipeline(
+    task="summarization",
+    model="google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
+```
+</hfoption>
+<hfoption id="AutoModel">

-<Tip>
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

-PEGASUS-X uses the same tokenizer as [PEGASUS](pegasus).
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/pegasus-x-large"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)

-</Tip>
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/pegasus-x-large --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/pegasus-x-large"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- PEGASUS-X also uses the [`PegasusTokenizer`].

 ## PegasusXConfig

--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/phi3.md
+++ b/docs/source/en/model_doc/phi3.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Qwen2MoE
--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/roc_bert.md
+++ b/docs/source/en/model_doc/roc_bert.md
@ -14,39 +14,78 @@ rendered properly in your Markdown viewer.

 -->

-# RoCBert
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+   <div class="flex flex-wrap space-x-1">
+          <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+   </div>
 </div>

-## Overview
+# RoCBert

-The RoCBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)  by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
-It's a pretrained Chinese language model that is robust under various forms of adversarial attacks.
+[RoCBert](https://aclanthology.org/2022.acl-long.65.pdf) is a pretrained Chinese [BERT](./bert) model designed against adversarial attacks like typos and synonyms. It is pretrained with a contrastive learning objective to align normal and adversarial text examples. The examples include different semantic, phonetic, and visual features of Chinese. This makes RoCBert more robust against manipulation.

-The abstract from the paper is the following:
+You can find all the original RoCBert checkpoints under the [weiweishi](https://huggingface.co/weiweishi) profile.

-*Large-scale pretrained language models have achieved SOTA results on NLP tasks. However, they have been shown
-vulnerable to adversarial attacks especially for logographic languages like Chinese. In this work, we propose
-ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation,
-synonyms, typos, etc. It is pretrained with the contrastive learning objective which maximizes the label consistency
-under different synthesized adversarial examples. The model takes as input multimodal information including the
-semantic, phonetic and visual features. We show all these features are important to the model robustness since the
-attack can be performed in all the three forms. Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under
-three blackbox adversarial algorithms without sacrificing the performance on clean testset. It also performs the best
-in the toxic content detection task under human-made attacks.*
+> [!TIP]
+> This model was contributed by [weiweishi](https://huggingface.co/weiweishi).
+> 
+> Click on the RoCBert models in the right sidebar for more examples of how to apply RoCBert to different Chinese language tasks.

-This model was contributed by [weiweishi](https://huggingface.co/weiweishi).
+The example below demonstrates how to predict the [MASK] token with [`Pipeline`], [`AutoModel`], and from the command line.

-## Resources
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+   task="fill-mask",
+   model="weiweishi/roc-bert-base-zh",
+   torch_dtype=torch.float16,
+   device=0
+)
+pipeline("這家餐廳的拉麵是我[MASK]過的最好的拉麵之")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+   "weiweishi/roc-bert-base-zh",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+   "weiweishi/roc-bert-base-zh",
+   torch_dtype=torch.float16,
+   device_map="auto",
+)
+inputs = tokenizer("這家餐廳的拉麵是我[MASK]過的最好的拉麵之", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+   outputs = model(**inputs)
+   predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "這家餐廳的拉麵是我[MASK]過的最好的拉麵之" | transformers-cli run --task fill-mask --model weiweishi/roc-bert-base-zh --device 0
+```
+
+</hfoption>
+</hfoptions>

 ## RoCBertConfig

--- a/docs/source/en/model_doc/seamless_m4t.md
+++ b/docs/source/en/model_doc/seamless_m4t.md
@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
+>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
--- a/docs/source/en/model_doc/seamless_m4t_v2.md
+++ b/docs/source/en/model_doc/seamless_m4t_v2.md
@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
+>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
--- a/docs/source/en/model_doc/smollm3.md
+++ b/docs/source/en/model_doc/smollm3.md
@ -0,0 +1,173 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# SmolLM3
+
+SmolLM3 is a fully open, compact language model designed for efficient deployment while maintaining strong performance. It uses a Transformer decoder architecture with Grouped Query Attention (GQA) to reduce the kv cache, and no RoPE, enabling improved performance on long-context tasks. It is trained using a multi-stage training approach on high-quality public datasets across web, code, and math domains. The model is multilingual and supports very large context lengths. The instruct variant is optimized for reasoning and tool use.
+
+> [!TIP]
+> Click on the SmolLM3 models in the right sidebar for more examples of how to apply SmolLM3 to different language tasks.
+
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line using the instruction-tuned models.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```python
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="text-generation",
+    model="HuggingFaceTB/SmolLM3-3B",
+    torch_dtype=torch.bfloat16,
+    device_map=0
+)
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Tell me about yourself."},
+]
+outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
+print(outputs[0]["generated_text"][-1]['content'])
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM3-3B",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
+
+prompt = "Give me a short introduction to large language models."
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
+
+generated_ids = model.generate(
+    model_inputs.input_ids,
+    cache_implementation="static",
+    max_new_tokens=512,
+    do_sample=True,
+    temperature=0.7,
+    top_k=50,
+    top_p=0.95
+)
+generated_ids = [
+    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+]
+
+response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(response)
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+# pip install -U flash-attn --no-build-isolation
+transformers chat HuggingFaceTB/SmolLM3-3B --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.
+
+```python
+# pip install -U flash-attn --no-build-isolation
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
+model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM3-3B",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2"
+)
+
+inputs = tokenizer("Gravity is the force", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+
+## Notes
+
+- Ensure your Transformers library version is up-to-date. SmolLM3 requires Transformers>=4.53.0 for full support.
+
+## SmolLM3Config
+
+[[autodoc]] SmolLM3Config
+
+## SmolLM3Model
+
+[[autodoc]] SmolLM3Model
+    - forward
+
+## SmolLM3ForCausalLM
+
+[[autodoc]] SmolLM3ForCausalLM
+    - forward
+
+## SmolLM3ForSequenceClassification
+
+[[autodoc]] SmolLM3ForSequenceClassification
+    - forward
+
+## SmolLM3ForTokenClassification
+
+[[autodoc]] SmolLM3ForTokenClassification
+    - forward
+
+## SmolLM3ForQuestionAnswering
+
+[[autodoc]] SmolLM3ForQuestionAnswering
+    - forward
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -32,7 +32,7 @@ SmolVLM2 is an adaptation of the Idefics3 model with two main differences:

 Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: do_resize and size.

-Videos should not be upsampled. 
+Videos should not be upsampled.

 If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*512 pixels by default.
 The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 512}` is the default, but you can change it to a different value if needed.
@ -192,11 +192,14 @@ print(generated_texts[0])
 [[autodoc]] SmolVLMForConditionalGeneration
    - forward

-
 ## SmolVLMImageProcessor
 [[autodoc]] SmolVLMImageProcessor
    - preprocess

+## SmolVLMImageProcessorFast
+[[autodoc]] SmolVLMImageProcessorFast
+    - preprocess
+
 ## SmolVLMVideoProcessor
 [[autodoc]] SmolVLMVideoProcessor
    - preprocess
--- a/docs/source/en/model_doc/starcoder2.md
+++ b/docs/source/en/model_doc/starcoder2.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@ -10,48 +10,35 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
 # SuperPoint

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-## Overview
-
-The SuperPoint model was proposed
-in [SuperPoint: Self-Supervised Interest Point Detection and Description](https://huggingface.co/papers/1712.07629) by Daniel
-DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-
-This model is the result of a self-supervised training of a fully-convolutional network for interest point detection and
-description. The model is able to detect interest points that are repeatable under homographic transformations and
-provide a descriptor for each point. The use of the model in its own is limited, but it can be used as a feature
-extractor for other tasks such as homography estimation, image matching, etc.
-
-The abstract from the paper is the following:
-
-*This paper presents a self-supervised framework for training interest point detectors and descriptors suitable for a
-large number of multiple-view geometry problems in computer vision. As opposed to patch-based neural networks, our
-fully-convolutional model operates on full-sized images and jointly computes pixel-level interest point locations and
-associated descriptors in one forward pass. We introduce Homographic Adaptation, a multi-scale, multi-homography
-approach for boosting interest point detection repeatability and performing cross-domain adaptation (e.g.,
-synthetic-to-real). Our model, when trained on the MS-COCO generic image dataset using Homographic Adaptation, is able
-to repeatedly detect a much richer set of interest points than the initial pre-adapted deep model and any other
-traditional corner detector. The final system gives rise to state-of-the-art homography estimation results on HPatches
-when compared to LIFT, SIFT and ORB.*
+[SuperPoint](https://huggingface.co/papers/1712.07629) is the result of self-supervised training of a fully-convolutional network for interest point detection and description. The model is able to detect interest points that are repeatable under homographic transformations and provide a descriptor for each point. Usage on it's own is limited, but it can be used as a feature extractor for other tasks such as homography estimation and image matching.

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/superpoint_architecture.png"
 alt="drawing" width="500"/>

-<small> SuperPoint overview. Taken from the <a href="https://huggingface.co/papers/1712.07629v4">original paper.</a> </small>
+You can find all the original SuperPoint checkpoints under the [Magic Leap Community](https://huggingface.co/magic-leap-community) organization.

-## Usage tips
+> [!TIP]
+> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+>
+> Click on the SuperPoint models in the right sidebar for more examples of how to apply SuperPoint to different computer vision tasks.

-Here is a quick example of using the model to detect interest points in an image:

-```python
+
+The example below demonstrates how to detect interest points in an image with the [`AutoModel`] class.
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
 from transformers import AutoImageProcessor, SuperPointForKeypointDetection
 import torch
 from PIL import Image
@ -64,67 +51,76 @@ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint"
 model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

 inputs = processor(image, return_tensors="pt")
-outputs = model(**inputs)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Post-process to get keypoints, scores, and descriptors
+image_size = (image.height, image.width)
+processed_outputs = processor.post_process_keypoint_detection(outputs, [image_size])
 ```

-The outputs contain the list of keypoint coordinates with their respective score and description (a 256-long vector).
+</hfoption>
+</hfoptions>

-You can also feed multiple images to the model. Due to the nature of SuperPoint, to output a dynamic number of keypoints,
-you will need to use the mask attribute to retrieve the respective information :
+## Notes

-```python
-from transformers import AutoImageProcessor, SuperPointForKeypointDetection
-import torch
-from PIL import Image
-import requests
+- SuperPoint outputs a dynamic number of keypoints per image, which makes it suitable for tasks requiring variable-length feature representations.

-url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
-url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
-image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+    ```py
+    from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+    import torch
+    from PIL import Image
+    import requests
+    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+    model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+    url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+    url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
+    image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+    images = [image_1, image_2]
+    inputs = processor(images, return_tensors="pt")
+    # Example of handling dynamic keypoint output
+    outputs = model(**inputs)
+    keypoints = outputs.keypoints  # Shape varies per image
+    scores = outputs.scores        # Confidence scores for each keypoint
+    descriptors = outputs.descriptors  # 256-dimensional descriptors
+    mask = outputs.mask # Value of 1 corresponds to a keypoint detection
+    ```

-images = [image_1, image_2]
+- The model provides both keypoint coordinates and their corresponding descriptors (256-dimensional vectors) in a single forward pass.
+- For batch processing with multiple images, you need to use the mask attribute to retrieve the respective information for each image. You can use the `post_process_keypoint_detection` from the `SuperPointImageProcessor` to retrieve the each image information.

-processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
-model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+    ```py
+    # Batch processing example
+    images = [image1, image2, image3]
+    inputs = processor(images, return_tensors="pt")
+    outputs = model(**inputs)
+    image_sizes = [(img.height, img.width) for img in images]
+    processed_outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+    ```

-inputs = processor(images, return_tensors="pt")
-outputs = model(**inputs)
-image_sizes = [(image.height, image.width) for image in images]
-outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+- You can then print the keypoints on the image of your choice to visualize the result:
+    ```py
+    import matplotlib.pyplot as plt
+    plt.axis("off")
+    plt.imshow(image_1)
+    plt.scatter(
+        outputs[0]["keypoints"][:, 0],
+        outputs[0]["keypoints"][:, 1],
+        c=outputs[0]["scores"] * 100,
+        s=outputs[0]["scores"] * 50,
+        alpha=0.8
+    )
+    plt.savefig(f"output_image.png")
+    ```

-for output in outputs:
-    for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]):
-        print(f"Keypoints: {keypoints}")
-        print(f"Scores: {scores}")
-        print(f"Descriptors: {descriptors}")
-```
-
-You can then print the keypoints on the image of your choice to visualize the result:
-```python
-import matplotlib.pyplot as plt
-
-plt.axis("off")
-plt.imshow(image_1)
-plt.scatter(
-    outputs[0]["keypoints"][:, 0],
-    outputs[0]["keypoints"][:, 1],
-    c=outputs[0]["scores"] * 100,
-    s=outputs[0]["scores"] * 50,
-    alpha=0.8
-)
-plt.savefig(f"output_image.png")
-```
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png)
-
-This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
-The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).
+<div class="flex justify-center">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png">
+</div>

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SuperPoint. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
- A notebook showcasing inference and visualization with SuperPoint can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb). 🌎
+- Refer to this [noteboook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb) for an inference and visualization example.

 ## SuperPointConfig

@ -137,8 +133,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 - preprocess
 - post_process_keypoint_detection

+<frameworkcontent>
+<pt>
 ## SuperPointForKeypointDetection

 [[autodoc]] SuperPointForKeypointDetection

 - forward
+
+</pt>
--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@ -0,0 +1,107 @@
+
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# T5Gemma
+
+T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large langauge models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
+
+T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2) sizes (2B-2B, 9B-2B, and 9B-9B), which are based on the offical Gemma 2 models (2B and 9B); and 2) [T5](https://arxiv.org/abs/1910.10683) sizes (Small, Base, Large, and XL), where are pretrained under the Gemma 2 framework following T5 configuration. In addition, we also provide a model at ML size (medium large, ~2B in total), which is in-between T5 Large and T5 XL.
+
+The pretrained varaints are trained with two objectives: prefix language modeling with knowledge distillation (PrefixLM) and UL2, separately. We release both variants for each model size. The instruction-turned varaints was post-trained with supervised fine-tuning and reinforcement learning.
+
+The example below demonstrates how to chat with the model with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+
+```python
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="text2text-generation",
+    model="google/t5gemma-placeholder",
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+)
+
+pipe("Question: Why is the sky blue?\nAnswer:", max_new_tokens=50)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-placeholder")
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/t5gemma-placeholder",
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)
+
+input_text = "Question: Why is the sky blue?\nAnswer:"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids, max_new_tokens=32)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```
+echo -e "Question: Why is the sky blue? Answer:" | transformers run --task text2text-generation --model google/t5gemma-placeholder --device 0
+```
+
+## T5GemmaConfig
+
+[[autodoc]] T5GemmaConfig
+
+## T5GemmaModuleConfig
+
+[[autodoc]] T5GemmaModuleConfig
+
+## T5GemmaModel
+
+[[autodoc]] T5GemmaModel
+    - forward
+
+## T5GemmaEncoderModel
+
+[[autodoc]] T5GemmaEncoderModel
+    - forward
+
+## T5GemmaForConditionalGeneration
+
+[[autodoc]] T5GemmaForConditionalGeneration
+    - forward
+
+## T5GemmaForSequenceClassification
+
+[[autodoc]] T5GemmaForSequenceClassification
+    - forward
+
+## T5GemmaForTokenClassification
+
+[[autodoc]] T5GemmaForTokenClassification
+    - forward
--- a/docs/source/en/model_doc/trocr.md
+++ b/docs/source/en/model_doc/trocr.md
@ -56,6 +56,7 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
  on both printed (e.g. the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM
  Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more
  information, see the [official models](https://huggingface.co/models?other=trocr>).
+- [Fine‑tune TrOCR on your own OCR dataset](https://github.com/Ashutosh-4485/trocr-custom-fine-tune.git).
 - TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.

 ## Resources
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@ -83,7 +83,7 @@ def read_video_pyav(container, indices):
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
-        indices (`List[int]`): List of frame indices to decode.
+        indices (`list[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
--- a/docs/source/en/models.md
+++ b/docs/source/en/models.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 Transformers provides many pretrained models that are ready to use with a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method.

-Call [`~PreTrainedModel.from_pretrained`] to download and load a models weights and configuration stored on the Hugging Face [Hub](https://hf.co/models).
+Call [`~PreTrainedModel.from_pretrained`] to download and load a model's weights and configuration stored on the Hugging Face [Hub](https://hf.co/models).

 > [!TIP]
 > The [`~PreTrainedModel.from_pretrained`] method loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load.
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -1,4 +1,4 @@
-# Modular Transformers
+# Contributing a new model to Transformers

 Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance.

@ -24,7 +24,7 @@ A linter "unravels" the modular file into a `modeling.py` file to preserve the s
 Run the command below to automatically generate a `modeling.py` file from a modular file.

 ```bash
-python utils/modular_model_converter.py --files_to_parse src/transformers/models/<your_model>/modular_<your_model>.py
+python utils/modular_model_converter.py --files-to-parse src/transformers/models/<your_model>/modular_<your_model>.py
 ```

 For example:
@ -216,12 +216,12 @@ class Olmo2Attention(OlmoAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

@ -294,9 +294,9 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
@ -494,7 +494,7 @@ class LlamaForCausalLM(nn.Module):
      input_ids: torch.LongTensor = None,
      attention_mask: Optional[torch.Tensor] = None,
      position_ids: Optional[torch.LongTensor] = None,
-      past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+      past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
      inputs_embeds: Optional[torch.FloatTensor] = None,
      labels: Optional[torch.LongTensor] = None,
      use_cache: Optional[bool] = None,
@ -520,7 +520,7 @@ class NewModelForCausalLM(LlamaForCausalLM):    |    class LlamaForCausalLM(nn.M
                                                |         input_ids: torch.LongTensor = None,
                                                |         attention_mask: Optional[torch.Tensor] = None,
                                                |         position_ids: Optional[torch.LongTensor] = None,
-                                                |         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None,
+                                                |         past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = |None,
                                                |         inputs_embeds: Optional[torch.FloatTensor] = None,
                                                |         labels: Optional[torch.LongTensor] = None,
                                                |         use_cache: Optional[bool] = None,
@ -540,6 +540,9 @@ This makes it very easy to switch decorators and makes it explicit that the only

 ## Docstring variables

+> [!TIP]
+> Refer to the [Documeting a model](./auto_docstring) guide for more information about how you can use the `@auto_docstring` decorator to help automatically generate consistent docstring arguments.
+
 If an object defined in both the modular and modeling file from which it inherits, the modular definition has precedence unless for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable can use the definition found in the source file without copying the whole docstring, by simply setting the variable to `None` in the modular file.

 This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2).
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@ -13,21 +13,19 @@ rendered properly in your Markdown viewer.

 -->

-# Tensor parallelism in transformers
+# Distributed inference

-[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice.
-This document assumes that you are already familiar with the basics of tensor parallelism. If you are not, please refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism.
+When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple accelerators (CUDA GPU, Intel XPU, etc.) and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each accelerator can process a tensor slice.
+
+However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple accelerators to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case.

 > [!TIP]
-> Tensor parallelism is very communication intensive, therefore it is reccomended to use it on a single machine with multiple GPUs, utilizing fast intra-node communication. For multi-node training, methods as pipeline or data parallelism are more efficient (depending on your use case).
+> Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism to learn more.

-Tensor parallelism requires slight changes to the model parameters, therefore in transformers, we support some of the popular models out of the box.
-
-> [!TIP]
-> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below.
+Check the list below for models that natively support tensor parallelism. Open a GitHub issue or pull request to add support for a model.

 <details>
-<summary>Supported models</summary>
+<summary>Show supported models</summary>

 * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2)
 * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2)
@ -43,19 +41,74 @@ Tensor parallelism requires slight changes to the model parameters, therefore in

 </details>

-## Using 🤗 transformers
+This guide shows how to enable tensor parallelism with Transformers and different partitioning strategies.

-Transformers provides a simple interface to use for tensor parallelism. We provide multiple classes implementing different partitioning
-strategies and a simple entrypoint to parallelize `nn.Module` instance. You won't have to interact with this interface directly, everything is done in `PretrainedModel.from_pretrained` method for you. This section will first talk about the partitioning strategies
-we support, then the user interface you will be interacting with, and finally it will teach you how to extend it with your own partitioning
-strategies.
+## Partitioning a model

-### Partitioning strategies
+Transformers supports tensor parallelism if a model has a `tp_plan`. There are two plans to partition a model.

-In transformers, partitioning strategies reside in a class `ParallelInterface` which works like a mapping from string to the strategy implementation.
+- The `auto` tensor parallelism plan partitions a model (see the supported models above) based on a predefined configuration.
+- You can also manually specify your own partitioning plan and pass it to the `tp_plan` parameter in [`~PreTrainedModel.from_pretrained`].

+<hfoptions id="sharding">
+<hfoption id="auto plan">

-```python
+```py
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # better for smaller number of GPUs
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
+print(model._tp_plan)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+prompt = "Can I help"
+inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+# distributed run
+outputs = model(inputs)
+```
+
+Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU.
+
+```bash
+torchrun --nproc-per-node 4 demo.py
+```
+
+</hfoption>
+<hfoption id="manual plan">
+
+Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses a combination of column and row partitioning. Refer to the [Partitioning strategies](#partitioning-strategies) section to learn about other supported partitioning strategies.
+
+> [!WARNING]
+> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about the partitioning strategies, the resulting model can be very slow, even failing or incorrect. Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) to learn more.
+
+```py
+from transformers import AutoModelForCausalLM
+
+tp_plan = {
+    "model.layers.*.self_attn.q_proj": "colwise",
+    "model.layers.*.self_attn.k_proj": "colwise",
+    "model.layers.*.self_attn.v_proj": "colwise",
+    "model.layers.*.self_attn.o_proj": "rowwise",
+    ...
+}
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+print(model._tp_plan)
+```
+
+</hfoption>
+</hfoptions>
+
+## Partitioning strategies
+
+All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available.
+
+```py
 class ParallelInterface(MutableMapping):
    """
    Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
@ -77,66 +130,32 @@ class ParallelInterface(MutableMapping):
    }
 ```

-We support the following strategies:
+Refer to the table below to learn more about each strategy.

- `ColwiseParallel` - A simple column-wise partitioning, being able to handle both weights and biases, does exactly what we've discussed before.
- `RowwiseParallel` - Again, row-wise partitioning as dicussed before, supports weights and biases, on top of that it also supports `nn.Embedding` modules.
- `SequenceParallel` - Sequence parallel implementation, for support of `LayerNorm` and `Dropout` layers. Also supports Python implementation of `RMSNorm` (see [this](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34))
- `PackedColwiseParallel` - A variant of column-wise partitioning, however it works on packed weights (i.e. `up_proj` and `gate_proj` being packed together). For more details, see [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)
- `PackedRowwiseParallel` - A variant of row-wise partitioning, works on packed weights, for more details check the comment linked above.
- `GatherParallel` - A very simple class, that only makes the outputs of the module to be gathered across devices.
- `IsolatedParallel` - This is a special case, where we want to *isolate* the module from the rest of the devices (world). This is used for Experts in MoE layers, basically creating Expert parallelism of sorts.
- `ReplicateParallel` - Many `torch.distributed` APIs break if model is partially sharded, so this class is used to replicate the module across all devices.
+| Strategy | Description |
+|---|---|
+| `ColwiseParallel` | Column-wise partitioning of weights and biases. |
+| `RowwiseParallel` | Row-wise partitioning of weights and biases. Also supports partitioning `nn.Embedding` modules. |
+| `SequenceParallel` | Sequence parallel implementation to support `LayerNorm` and `Dropout` layers. Also supports Python implementation of [RMSNorm](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34). |
+| `PackedColwiseParallel` | Variant of `ColwiseParallel` to support packed weights (for example, packing `up_proj` and `gate_proj` together). Refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details. |
+| `PackedRowwiseParallel` | Variant of `RowwiseParallel` to support packed weights (refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details). |
+| `GatherParallel` | Gather outputs of the module across devices. |
+| `IsolatedParallel` | Used for Experts in Mixture-of-Experts (MoE) layers to isolates module from other devices. |
+| `ReplicateParallel` | Replicate modules across all devices to prevent `torch.distributed` APIs from breaking due to a partially sharded model. |

-### Sharding a model
+### Packed strategies

-We provide two ways to shard a model, first one is to use `auto` tensor parallelism plan, which will automatically shard the model based on our predefined configuration. This requires the model to have predefined tensor parallel plan in transformers.
+Weight packing packs multiple linear layers into a single, bigger layer. Packed strategies, `PackedColwiseParallel` and `PackedRowwiseParallel`, are used to shard packed weights. The more basic `ColwiseParallel` or `RowwiseParallel` will incorrectly shard the packed weights.

-```python
-from transformers import AutoModelForCausalLM
+The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` module and requires the `PackedRowwiseParallel` strategy to shard `gate_up_proj`.

-# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
-
-print(model._tp_plan)
-```
-
-> [!TIP]
-> For a list of models that support tensor parallelism, see the [Supported models](#supported-models) section above.
-
-The second way is to manually specify your own partitioning plan.
-
-```python
-from transformers import AutoModelForCausalLM
-
-tp_plan = {
-    "model.layers.*.self_attn.q_proj": "colwise",
-    "model.layers.*.self_attn.k_proj": "colwise",
-    "model.layers.*.self_attn.v_proj": "colwise",
-    "model.layers.*.self_attn.o_proj": "rowwise",
-    ...
-}
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-
-print(model._tp_plan)
-```
-
-You might have noticed that there are some special cases in the `ParallelInterface` mapping, let's now talk about them. This will help you understand their purpose and help with extending to other strategies.
-
-### PackedRowwiseParallel
-This class is a special case of `RowwiseParallel`, it's used to shard packed weights. Weight packing is a common technique used in models. It's a technique where we pack multiple linear layers into a single, bigger one.
-
-For example in `Llama4` model, we pack `up_proj` and `gate_proj` into a single `gate_up_proj` module.
 ```python
 class Llama4TextExperts(nn.Module):
    ...
    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

-Then in forward, we can use batch matrix multiplication to compute the output of the `gate_up_proj` module.
+Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.

 ```python
 def forward(self, hidden_states):
@ -145,185 +164,148 @@ def forward(self, hidden_states):
    gate, up = gate_up.chunk(2, dim=-1) # Split the output into gate and up
 ```

-In this case, we need to use the `PackedRowwiseParallel` strategy to shard the `gate_up_proj` module, as using a simple `RowwiseParallel` will shard the layers wrongly.
-
 > [!TIP]
-> If this is a bit difficult to wrap your head around, check out [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an amazing visual representation of why `Packed*` needs to be used.
+> Refer to [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an visual representation of why `Packed*` needs to be used.

+### Local strategies

-### `local*` strategies
+Local strategies (`local_colwise`, `local_rowwise`, `local_packed_rowwise`) don't use [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) because it isn't supported for some operations such as [torch.chunk](https://docs.pytorch.org/docs/stable/generated/torch.chunk.html). Instead, local strategies use the basic [torch.Tensor](https://docs.pytorch.org/docs/stable/tensors.html) and performs some of the distributed logic manually.

-You could have noticed that there are `local*` strategies, which use the same layers as `*` strategy, but don't use `DTensor` at all.
-This is because `DTensor` is not supported for some of the operations: such as `torch.chunk`. Therefore, sometimes we need to use the `local*` strategies, which use vanilla `torch.Tensor` and do some of the distributed logic manually.
-
-<!---
+<!--
 Readd this when I get the exact error message
 > [!TIP]
 > If you are using a custom partitioning strategy, and it's not working with `... is not supported` error, try using the `local*` strategies to see if they work better.
 -->

-> [!WARNING]
-> Manually specifying your own partitiong plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about this, the resulting model can be very slow, even failing or incorrect. Again, refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) which can teach you everything required.
+## Custom partitioning strategies

-### Extending the interface with your own partitioning strategies
+A custom partitioning strategy should inherit from [`TensorParallelLayer`](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`.

-This is a very advanced topic, which requires a good understanding of distributed collectives and the model architecture.
-Your custom partitioning strategy should inherit from `TensorParallelLayer` defined in [integrations/tensor_parallel.py](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement: `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. Then it should be registered in the `ParallelInterface` mapping, so our dispatching logic can find it when specified in the `tp_plan`.
+Then it needs to be registered in the `ParallelInterface` mapping so the dispatching logic can find it when specified in `tp_plan`.

-Let's go through this workflow step by step, on an already existing example: `ColwiseParallel`.
+The example below shows how to implement `ColwiseParallel` with this workflow.

-1. Inherit from `TensorParallelLayer` and initialization
+1. Inherit from `TensorParallelLayer`. In the `__init__` method, define `input_layouts` and `output_layouts` to describe how the input and output tensors should be placed on devices. The `desired_input_layouts` attribute is used to specify how the input *should* be placed on devices.

-```python
-class ColwiseParallel(TensorParallelLayer):
-    def __init__(
+    ```python
+    class ColwiseParallel(TensorParallelLayer):
+        def __init__(
+            self,
+            *,
+            input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
+            output_layouts: Optional[Placement] = None, # The output layout we want to achieve
+            use_local_output: bool = True, # Whether to use local output or not
+            use_dtensor=True, # Whether to use DTensor or not
+        ):
+            self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
+            self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
+            self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
+            self.use_local_output = use_local_output
+            self.use_dtensor = use_dtensor
+    ```
+
+2. Implement the `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn` methods.
+
+    The `partition_tensor` method partitions the tensor and fills `empty_param` with the partitioned tensor. Use the utility function `get_tensor_shard` to help you get the correct shard of the original parameter for a given rank and `get_packed_weights` to help with packed weights.
+
+    ```python
+    def partition_tensor(
        self,
-        *,
-        input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
-        output_layouts: Optional[Placement] = None, # The output layout we want to achieve
-        use_local_output: bool = True, # Whether to use local output or not
-        use_dtensor=True, # Whether to use DTensor or not
-    ):
-        self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
-        self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
-        self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
-        self.use_local_output = use_local_output
-        self.use_dtensor = use_dtensor
-```
+        param, # Full tensor of the parameter
+        empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
+        param_type, # Type of the parameter, `bias` or `weight`
+        param_casting_dtype, # The type to cast the parameter to
+        to_contiguous, # Whether to convert the tensor to a contiguous memory layout
+        rank, # The rank of the current device
+        device_mesh, # The device mesh
+    ) -> nn.Parameter: # Return the partitioned parameter
+        ...
+    ```

-In the `__init__` method, we define these attributes, where `input_layouts` and `output_layouts` describing, how the input and output tensors should be placed on the devices. `desired_input_layouts` is used to specify, how the input *SHOULD* be placed on the devices.
+    The `_prepare_input_fn` and `_prepare_output_fn` methods are used in the [pre-forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks. They redistribute the inputs and outputs to the desired layout as specified in the `__init__`.

-2a. Implement `partition_tensor` method
+    ```python
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        ...
+        # Do some custom logic, cast to DTensor etc.
+        ...
+        return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        ...
+        # Do some custom logic, cast to DTensor etc.
+        ...
+        return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
+    ```

-```python
-def partition_tensor(
-    self,
-    param, # Full tensor of the parameter
-    empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
-    param_type, # Type of the parameter, `bias` or `weight`
-    param_casting_dtype, # The type to cast the parameter to
-    to_contiguous, # Whether to convert the tensor to a contiguous memory layout
-    rank, # The rank of the current device
-    device_mesh, # The device mesh
-) -> nn.Parameter: # Return the partitioned parameter
-    ...
-```
+3. Register the strategy to [`ParallelInterface`] to enable it for use with `tp_plan`.

-This method is used to partition the tensor, and fill the `empty_param` with the partitioned tensor.
-We provide some utility functions to help you with this, such as `get_tensor_shard` which will get you the correct shard of the original parameter for this rank or `get_packed_weights` to help with packed weights.
+    ```python
+    from transformers.integrations.tensor_parallel import ParallelInterface

-2b. Implement `_prepare_input_fn` and `_prepare_output_fn` methods
+    ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
+    tp_plan = {
+        "model.layers.*.self_attn.q_proj": "colwise_custom",
+        ...
+    }
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+    ```

-These methods are used as [`pre-forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [`forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks respectively. Their purpose is to re-distribute the inputs and outputs to the desired layout, passed in the `__init__` method.
+## Benchmarks

-```python
-def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
-    ...
-    # Do some custom logic, cast to DTensor etc.
-    ...
-    return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
+Tensor parallelism can considerably speedup inference, especially for inputs with large batch sizes or long sequences.

-def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-    ...
-    # Do some custom logic, cast to DTensor etc.
-    ...
-    return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
-```
-
-3. Register the strategy
-Congratulations! You've implemented your own partitioning strategy. Now, to use it with your own `tp_plan`, you need to register it in the `ParallelInterface` mapping.
-
-```python
-from transformers.integrations.tensor_parallel import ParallelInterface
-
-ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
-```
-
-And now you can use it in your `tp_plan` as such:
-
-```python
-tp_plan = {
-    "model.layers.*.self_attn.q_proj": "colwise_custom",
-    ...
-}
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-```
-
-
-## Full example
-
-Let's go through a full example of inference with tensor parallelism.
-```python
-import os
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-# enable tensor parallelism
-model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Meta-Llama-3-8B-Instruct",
-    tp_plan="auto",
-)
-
-# prepare input tokens
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-prompt = "Can I help"
-inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
-
-# distributed run
-outputs = model(inputs)
-```
-
-Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU.
-
-```bash
-torchrun --nproc-per-node 4 demo.py
-```
-
-You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences.
-
-For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups.
+Refer to the chart below for the expected speedup for a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512.

 <div style="text-align: center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
 </div>

-## Tensor parallelism in-depth
-Our implementation of tensor parallelism is framework-agnostic in design, but the specific implementations we've developed rely on the torch.distributed package. We heavily utilize abstractions such as `DeviceMesh` or `DTensor` to provide a simple and extensible interface to the user.
+## Design implementation
+
+The Transformers tensor parallelism implementation is framework-agnostic, but for specific implementations, we rely on [DeviceMesh](https://docs.pytorch.org/tutorials/recipes/distributed_device_mesh.html) and [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) from [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) to provide a simple and extensible interface.

 ### DeviceMesh
-Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, therefore we can create a `DeviceMesh` with multiple submeshes:
+
+Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, so you can create a `DeviceMesh` with multiple sub-meshes.
+
 ```python
 from torch.distributed.device_mesh import init_device_mesh

 # Create a 1D mesh of 4 GPUs
 device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])
 ```
-Then, most of the `torch.distributed` defined parallelization strategies can be applied to a mesh itself, or its submesh, automatically handling the communication patterns.
+
+Most of the `torch.distributed` defined parallelization strategies can be applied to the mesh itself, or its sub-mesh, and it automatically handles the communication patterns.

 ### DTensor

-Abbreviation for Distributed Tensor, `DTensor` is a tensor subclass that handles the distributed logic on-top of the usual tensor operations. Most of the model weights in case of tensor parallelism are stored as `DTensor`s (with some exceptions, more on that later).
-The most important part of DTensor, that is crucial to understand, is the `placement` attribute. It's an attribute that tells PyTorch how is the tensor placed on the devices of the `DeviceMesh`.
+`DTensor` (Distributed Tensor) is a tensor subclass that handles the distributed logic on top of the usual tensor operations. Most of the model weights in tensor parallelism are stored as `DTensor`s.

-It can have the following values:
+The most important part of DTensor is the `placement` attribute because it tells PyTorch how a tensor is placed on the devices in `DeviceMesh`. The `placement` attribute can take the following values.

- `Shard(dimension)` - Annotates that this `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. For example, if we would like to shard weights for column-wise partitioning, we would do:
-```python
-weight = ...
-weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
-bias = ...
-bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
-```
+- `Shard(dimension)` - Indicates how a `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. The example below demonstrates how to shard weights over different dimensions for column-wise partitioning.

-To give another example, for row-wise partitioning, we would do:
-```python
-weight = ...
-weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
-bias = ...
-bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
-```
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
+    ```

- `Replicate()` - Annotates that this `DTensor` is replicated across the `DeviceMesh`. Very straight-forward, only creates a full copy of the tensor on each device.
- `Partial()` - This placement is mostly of no interest to us, it's used to annotate that this tensor is pending a reduction operation.
+    This example demonstrates how to shard weights over different dimensions for row-wise partitioning.
+
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
+    ```
+
+- `Replicate()` - Indicates a `DTensor` is replicated across the `DeviceMesh`. It only creates a full copy of the tensor on each device.
+
+    ```py
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
+    ```
+
+- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@ -91,6 +91,8 @@ Tensor parallelism distributes large tensor computations across multiple GPUs. T

 Tensor parallelism is effective for training large models that don't fit into the memory of a single GPU. It is also faster and more efficient because each GPU can process its tensor slice in parallel, and it can be combined with other parallelism methods. Like other parallelism methods though, tensor parallelism adds communication overhead between GPUs.

+Refer to the [Tensor parallelism](./perf_infer_gpu_multi) guide to learn how to use it for inference.
+
 ## Hybrid parallelism

 Parallelism methods can be combined to achieve even greater memory savings and more efficiently train models with billions of parameters.
--- a/docs/source/en/perf_train_gpu_one.md
+++ b/docs/source/en/perf_train_gpu_one.md
@ -31,7 +31,7 @@ Refer to the table below to quickly help you identify the features relevant to y
 | data preloading | yes | no |
 | torch_empty_cache_steps | no | yes |
 | torch.compile | yes | no |
-| PEFT | no | yes |
+| scaled dot production attention (SDPA) | yes | yes |

 ## Trainer

@ -128,7 +128,7 @@ fp16 isn't memory-optimized because the gradients that are computed in fp16 are

 [bf16](https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus) trades off some precision for a much larger dynamic range, which is helpful for avoiding overflow and underflow errors. You can use bf16 without adding any loss scaling methods like you would with fp16. bf16 is supported by NVIDIAs Ampere architecture or newer.

-Configure [`~TrainingArguments.fp16`] in [`TrainingArguments`] to enable mixed precision training with the bf16 data type.
+Configure [`~TrainingArguments.bf16`] in [`TrainingArguments`] to enable mixed precision training with the bf16 data type.

 ```py
 from transformers import TrainingArguments
--- a/docs/source/en/quantization/finegrained_fp8.md
+++ b/docs/source/en/quantization/finegrained_fp8.md
@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="

 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)

 output = quantized_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible.
 | Component | Compatibility |
 |----------|----------------|
 | CUDA Versions | ✅ cu118, cu126, cu128 |
+| XPU Versions | ✅ pytorch2.8 |
 | CPU | ✅ change `device_map="cpu"` (see examples below) |


@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 </hfoption>
 </hfoptions>

+### Intel XPU
+<hfoptions id="examples-Intel-XPU">
+<hfoption id="int8-dynamic-and-weight-only">
+    
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+
+quant_config = Int8DynamicActivationInt8WeightConfig()
+# or int8 weight only quantization
+# quant_config = Int8WeightOnlyConfig()
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+
+<hfoption id="int4-weight-only">
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int4WeightOnlyConfig
+from torchao.dtypes import Int4XPULayout
+from torchao.quantization.quant_primitives import ZeroPointDomain
+
+
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+</hfoptions>
+
+
 ### CPU
 <hfoptions id="examples-CPU">
 <hfoption id="int8-dynamic-and-weight-only">
@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Manual Testing
 prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
 generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
 output_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)

 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")

 ## Loading quantized models

-Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
+Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
    quantization_config=quantization_config
 )
 # save the quantized model
-output_dir = "llama-3.1-8b-torchao-int8-cuda"
+output_dir = "llama-3.1-8b-torchao-int8"
 quantized_model.save_pretrained(output_dir, safe_serialization=False)

 # reload the quantized model
@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)

 output = reloaded_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@ -32,12 +32,29 @@ To start, we recommend creating a Hugging Face [account](https://hf.co/join). An

 Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and log in to your account.

+<hfoptions id="authenticate">
+<hfoption id="notebook">
+
+Paste your User Access Token into [`~huggingface_hub.notebook_login`] when prompted to log in.
+
 ```py
 from huggingface_hub import notebook_login

 notebook_login()
 ```

+</hfoption>
+<hfoption id="CLI">
+   
+Make sure the [huggingface_hub[cli]](https://huggingface.co/docs/huggingface_hub/guides/cli#getting-started) package is installed and run the command below. Paste your User Access Token when prompted to log in.
+
+```bash
+huggingface-cli login
+```
+
+</hfoption>
+</hfoptions>
+
 Install a machine learning framework.

 <hfoptions id="installation">
--- a/docs/source/en/tasks/asr.md
+++ b/docs/source/en/tasks/asr.md
@ -170,7 +170,7 @@ Unlike other data collators, this specific data collator needs to apply a differ
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ...         # split inputs and labels since they have to be of different lengths and need
 ...         # different padding methods
 ...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@ -243,7 +243,7 @@ and it uses the exact same dataset as an example. Apply some geometric and color
 ... )
 ```

-The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`,
+The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': list[Dict]}`,
 where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example:

 ```py
@ -252,9 +252,9 @@ The `image_processor` expects the annotations to be in the following format: `{'

 ...     Args:
 ...         image_id (str): image id. e.g. "0001"
-...         categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
-...         areas (List[float]): list of corresponding areas to provided bounding boxes
-...         bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
+...         categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
+...         areas (list[float]): list of corresponding areas to provided bounding boxes
+...         bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
 ...             ([center_x, center_y, width, height] in absolute coordinates)

 ...     Returns:
@ -397,7 +397,7 @@ Intermediate format of boxes used for training is `YOLO` (normalized) but we wil

 ...     Args:
 ...         boxes (torch.Tensor): Bounding boxes in YOLO format
-...         image_size (Tuple[int, int]): Image size in format (height, width)
+...         image_size (tuple[int, int]): Image size in format (height, width)

 ...     Returns:
 ...         torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
--- a/docs/source/en/tasks/text-to-speech.md
+++ b/docs/source/en/tasks/text-to-speech.md
@ -408,7 +408,7 @@ instructs the model to ignore that part of the spectrogram when calculating the
 ... class TTSDataCollatorWithPadding:
 ...     processor: Any

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
 ...         label_features = [{"input_values": feature["labels"]} for feature in features]
 ...         speaker_features = [feature["speaker_embeddings"] for feature in features]
--- a/docs/source/en/testing.md
+++ b/docs/source/en/testing.md
@ -474,13 +474,6 @@ For example, here is a test that must be run only when there are 2 or more GPUs
 def test_example_with_multi_gpu():
 ```

-If a test requires `tensorflow` use the `require_tf` decorator. For example:
-
-```python no-style
-@require_tf
-def test_tf_thing_with_tensorflow():
-```
-
 These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
 how to set it up:

@ -1226,11 +1219,6 @@ if torch.cuda.is_available():
 import numpy as np

 np.random.seed(seed)
-
-# tf RNG
-import tensorflow as tf 
-
-tf.random.set_seed(seed)
 ```

 ### Debugging tests
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -493,6 +493,33 @@ training_args = TrainingArguments(
 )
 ```

+You can also configure which specific kernels to apply using the `liger_kernel_config` parameter. This dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, allowing fine-grained control over kernel usage. Available options vary by model but typically include: `rope`, `swiglu`, `cross_entropy`, `fused_linear_cross_entropy`, `rms_norm`, etc.
+
+```py
+from transformers import TrainingArguments
+
+# Apply only specific kernels
+training_args = TrainingArguments(
+    output_dir="your-model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    push_to_hub=True,
+    use_liger_kernel=True,
+    liger_kernel_config={
+        "rope": True,
+        "cross_entropy": True,
+        "rms_norm": False,  # Don't apply Liger's RMSNorm kernel
+        "swiglu": True,
+    }
+)
+```
+
 ### NEFTune

 [NEFTune](https://hf.co/papers/2310.05914) adds noise to the embedding vectors during training to improve model performance. Enable it in [`Trainer`] with the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
--- a/docs/source/es/custom_models.md
+++ b/docs/source/es/custom_models.md
@ -48,7 +48,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,
--- a/docs/source/es/tasks/asr.md
+++ b/docs/source/es/tasks/asr.md
@ -166,7 +166,7 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ...         # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y
 ...         # requieren métodos de padding diferentes
 ...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]
--- a/Show More
+++ b/Show More