commit 984ff89e73 (parent 2166b6b4ff)
Author: Ilyas Moutawwakil (committed via GitHub)
Date: 2025-06-23 10:56:51 +02:00
16 changed files with 618 additions and 14 deletions

.github/workflows/model_jobs_intel_gaudi.yml

@@ -0,0 +1,121 @@
name: model jobs
on:
workflow_call:
inputs:
folder_slices:
required: true
type: string
slice_id:
required: true
type: number
runner:
required: true
type: string
machine_type:
required: true
type: string
report_name_prefix:
required: false
default: run_models_gpu
type: string
env:
RUN_SLOW: yes
PT_HPU_LAZY_MODE: 0
TRANSFORMERS_IS_CI: yes
PT_ENABLE_INT64_SUPPORT: 1
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
HF_HOME: /mnt/cache/.cache/huggingface
jobs:
run_models_gpu:
name: " "
strategy:
max-parallel: 8
fail-fast: false
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on:
group: ${{ inputs.runner }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: python3 utils/print_env.py
- name: Show installed libraries and their versions
run: pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ inputs.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all tests on Gaudi
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Run test
shell: bash
run: |
mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
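The matrix above expands fromJson(inputs.folder_slices)[inputs.slice_id] into one test folder per job. As a rough sketch of how those two inputs fit together (split_model_tests below is an assumption standing in for utils/split_model_tests.py, which is not part of this diff):

import json
import os

def split_model_tests(tests_dir="tests/models", num_splits=2):
    # Assumed behaviour: list the model test folders (run from the repository root)
    # and cut them into num_splits roughly equal slices, one slice per slice_id.
    folders = sorted(
        f"models/{name}"
        for name in os.listdir(tests_dir)
        if os.path.isdir(os.path.join(tests_dir, name))
    )
    chunk = -(-len(folders) // num_splits)  # ceiling division
    return [folders[i : i + chunk] for i in range(0, len(folders), chunk)]

folder_slices = split_model_tests(num_splits=2)  # serialized and passed as the folder_slices input
slice_id = 0                                     # passed as the slice_id input
print(json.dumps(folder_slices[slice_id]))       # the list that matrix.folders iterates over

Each matrix.folders value, for example models/bert, then becomes the pytest target tests/models/bert in the "Run all tests on Gaudi" step.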

.github/workflows/self-scheduled-intel-gaudi.yml

@@ -0,0 +1,345 @@
name: Self-hosted runner (scheduled-intel-gaudi)
on:
workflow_call:
inputs:
job:
required: true
type: string
slack_report_channel:
required: true
type: string
runner_scale_set:
required: true
type: string
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
NUM_SLICES: 2
RUN_SLOW: yes
PT_HPU_LAZY_MODE: 0
TRANSFORMERS_IS_CI: yes
PT_ENABLE_INT64_SUPPORT: 1
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
HF_HOME: /mnt/cache/.cache/huggingface
jobs:
setup:
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
name: Setup
runs-on: ubuntu-latest
outputs:
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- id: set-matrix
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
name: Identify models to test
working-directory: tests
run: |
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
fi
- id: set-matrix-quantization
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: Identify quantization method to test
working-directory: tests
run: |
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
run_models_gpu:
if: ${{ inputs.job == 'run_models_gpu' }}
name: " "
needs: setup
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
with:
slice_id: ${{ matrix.slice_id }}
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_models_gpu
secrets: inherit
run_trainer_and_fsdp_gpu:
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
name: " "
needs: setup
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
with:
slice_id: ${{ matrix.slice_id }}
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit
run_pipelines_gpu:
if: ${{ inputs.job == 'run_pipelines_gpu' }}
name: Pipelines
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: python3 utils/print_env.py
- name: Show installed libraries and their versions
run: pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all pipeline tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
run: |
pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run examples tests on Intel Gaudi
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
run_deepspeed_gpu:
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
name: Intel Gaudi deepspeed tests
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
run: |
pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all deepspeed tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
send_results:
name: Slack Report
needs:
[
setup,
run_models_gpu,
run_examples_gpu,
run_pipelines_gpu,
run_deepspeed_gpu,
run_trainer_and_fsdp_gpu,
]
if: ${{ always() }}
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}
setup_status: ${{ needs.setup.result }}
slack_report_channel: ${{ inputs.slack_report_channel }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
report_repo_id: ${{ inputs.report_repo_id }}
ci_event: ${{ inputs.ci_event }}
secrets: inherit
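For readability, here is the discovery logic from the set-matrix-quantization step in the setup job above, unpacked from its single python3 -c invocation into plain Python (same logic, executed from the tests directory as the step's working-directory indicates):

import os

tests = os.getcwd()  # the step runs with working-directory: tests
quantization_tests = os.listdir(os.path.join(tests, "quantization"))
# Keep only subdirectories, expressed as quantization/<method> paths.
quantization_matrix = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests])))
print(quantization_matrix)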


@@ -0,0 +1,67 @@
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
on:
repository_dispatch:
workflow_dispatch:
schedule:
- cron: "17 2 * * *"
jobs:
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_models_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
pipeline-ci:
name: Pipeline CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_pipelines_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_examples_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_deepspeed_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
trainer-fsdp-ci:
name: Trainer/FSDP CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_trainer_and_fsdp_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit


@@ -23,7 +23,7 @@ from tqdm import tqdm
from ...models.bert.tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import is_tf_available, is_torch_available, logging
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
from .utils import DataProcessor
@@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
is_training=not evaluate,
)
```"""
# Defining helper methods
features = []
threads = min(threads, cpu_count())
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
if not is_torch_hpu_available():
threads = min(threads, cpu_count())
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
annotate_ = partial(
squad_convert_example_to_features,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
padding_strategy=padding_strategy,
is_training=is_training,
)
features = list(
tqdm(
p.imap(annotate_, examples, chunksize=32),
total=len(examples),
desc="convert squad examples to features",
disable=not tqdm_enabled,
)
)
else:
# Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
annotate_ = partial(
squad_convert_example_to_features,
max_seq_length=max_seq_length,
@@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
)
features = list(
tqdm(
p.imap(annotate_, examples, chunksize=32),
map(annotate_, examples),
total=len(examples),
desc="convert squad examples to features",
disable=not tqdm_enabled,
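The change above keeps the multiprocessing Pool path on CPU and CUDA and falls back to a plain serial map on HPU, as flagged in the inline comment and the linked discussion. A condensed sketch of that pattern, with convert_one as a placeholder for squad_convert_example_to_features:

from functools import partial
from multiprocessing import Pool, cpu_count

from tqdm import tqdm

def convert_one(example, max_seq_length=384):
    # Placeholder for squad_convert_example_to_features.
    return {"example": example, "max_seq_length": max_seq_length}

def convert_all(examples, threads=4, use_hpu=False, tqdm_enabled=True):
    annotate_ = partial(convert_one, max_seq_length=384)
    if not use_hpu:
        # Parallel path: fan the conversion out over a process pool.
        threads = min(threads, cpu_count())
        with Pool(threads) as p:
            iterator = p.imap(annotate_, examples, chunksize=32)
            return list(tqdm(iterator, total=len(examples), disable=not tqdm_enabled))
    # Serial path used on HPU (see the PR discussion referenced in the diff comment).
    return list(tqdm(map(annotate_, examples), total=len(examples), disable=not tqdm_enabled))

if __name__ == "__main__":
    print(convert_all(["ex1", "ex2", "ex3"], use_hpu=True))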


@@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
if device not in dispatch_table:
if not callable(dispatch_table["default"]):
return dispatch_table["default"]
return dispatch_table["default"](*args, **kwargs)
fn = dispatch_table[device]
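The added lines let a dispatch table carry a plain value as its "default" entry: when the requested device has no entry and the default is not callable, it is returned as-is instead of being invoked. A small illustration with a hypothetical table (table name and entries are made up for the example):

def cuda_manual_seed(seed):
    print(f"would seed the CUDA generator with {seed}")

SEED_DISPATCH = {
    "cuda": cuda_manual_seed,
    "default": None,  # plain value, returned directly for devices with no entry
}

def device_agnostic_dispatch(device, dispatch_table, *args, **kwargs):
    # Same logic as the snippet above.
    if device not in dispatch_table:
        if not callable(dispatch_table["default"]):
            return dispatch_table["default"]
        return dispatch_table["default"](*args, **kwargs)
    return dispatch_table[device](*args, **kwargs)

device_agnostic_dispatch("cuda", SEED_DISPATCH, 42)    # calls the cuda entry
print(device_agnostic_dispatch("hpu", SEED_DISPATCH))  # prints None; the default is returned, not called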


@@ -815,8 +815,8 @@ def is_torch_hpu_available():
):
return False
torch_hpu_min_version = "1.5.0"
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
torch_hpu_min_accelerate_version = "1.5.0"
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
return False
import torch
@@ -850,6 +850,24 @@ def is_torch_hpu_available():
torch.Tensor.masked_fill_ = patched_masked_fill_
# IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
# https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
# This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
# https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
original_compile = torch.compile
def hpu_backend_compile(*args, **kwargs):
if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
logger.warning(
f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
"We will override the backend with 'hpu_backend' to avoid errors."
)
kwargs["backend"] = "hpu_backend"
return original_compile(*args, **kwargs)
torch.compile = hpu_backend_compile
return True
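To illustrate the effect of the patch above, assuming it has already been applied on a Gaudi machine (i.e. is_torch_hpu_available() returned True): any torch.compile call that does not explicitly ask for hpu_backend or eager gets its backend rewritten, including the bare decorator form called out in the comment.

import torch

# Decorator form: no backend keyword, so the patched torch.compile logs a
# warning and injects backend="hpu_backend" instead of the inductor default.
@torch.compile
def add(x, y):
    return x + y

def mul(x, y):
    return x * y

# An explicit inductor request is rewritten to hpu_backend as well, with a warning.
compiled_mul = torch.compile(mul, backend="inductor")

# backend="eager" (or "hpu_backend") is passed through untouched.
eager_mul = torch.compile(mul, backend="eager")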


@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@require_torch_multi_accelerator
@run_first
def test_basic_distributed(self, stage, dtype):
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
@require_torch_fp16
@run_first
def test_do_eval_no_train(self):
# testing only zero3 since zero2 makes no sense with inference
self.run_and_check(
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
)
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_fp32_non_distributed(self, stage, dtype):
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
# therefore no quality checks, just basic completion checks are done
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@require_torch_multi_accelerator
@run_first
def test_fp32_distributed(self, stage, dtype):
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
# therefore no quality checks, just basic completion checks are done
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
)
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
# the saved model dir
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(["bf16", "fp16", "fp32"])
@require_torch_multi_accelerator
@run_first
def test_inference(self, dtype):
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
self.skipTest(reason="test requires bfloat16 hardware support")
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
return output_dir
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_clm(self, stage, dtype):
# this test exercises model.resize_token_embeddings() which requires param gathering outside
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
execute_subprocess_async(cmd, env=self.get_env())
@require_torch_fp16
@run_first
def test_clm_from_config_zero3_fp16(self):
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called


@@ -28,6 +28,7 @@ from transformers.testing_utils import (
get_tests_dir,
require_deepspeed,
require_torch_accelerator,
run_first,
slow,
torch_device,
)
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
@slow
@run_first
@require_deepspeed
@require_torch_accelerator
class TestDeepSpeedModelZoo(TestCasePlus):


@@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
raise AssertionError("CPU offloading failed with FSDP!")
@require_torch_multi_accelerator
@run_first
@slow
@require_fsdp_v2_version
@require_accelerate_fsdp2
@@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
@require_torch_multi_accelerator
@run_first
@slow
@require_fsdp
@require_fsdp_v2_version


@@ -84,6 +84,7 @@ from transformers.testing_utils import (
require_bitsandbytes,
require_deepspeed,
require_flash_attn,
require_non_hpu,
require_safetensors,
require_torch,
require_torch_accelerator,
@@ -92,6 +93,7 @@ from transformers.testing_utils import (
require_torch_multi_accelerator,
require_torch_multi_gpu,
require_torch_sdpa,
run_first,
run_test_using_subprocess,
set_config_for_less_flaky_test,
set_model_for_less_flaky_test,
@@ -2797,6 +2799,7 @@ class ModelTesterMixin:
else:
torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
@require_non_hpu
@require_accelerate
@mark.accelerate_tests
@require_torch_multi_accelerator
@@ -3727,6 +3730,9 @@ class ModelTesterMixin:
if torch_device in ["cpu", "cuda"]:
atol = atols[torch_device, enable_kernels, torch_dtype]
rtol = rtols[torch_device, enable_kernels, torch_dtype]
elif torch_device == "hpu":
atol = atols["cuda", enable_kernels, torch_dtype]
rtol = rtols["cuda", enable_kernels, torch_dtype]
elif torch_device == "xpu":
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
# which is implemented on PyTorch level using aten operators and is
@@ -4666,6 +4672,7 @@ class ModelTesterMixin:
# Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
# may bring unwanted consequences on other tests. See PR #37553
@run_first
@run_test_using_subprocess
@require_torch_accelerator
def test_can_load_with_global_device_set(self):
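On the tolerance hunk earlier in this file's changes: atols and rtols are dictionaries keyed by (device, enable_kernels, dtype), and the new hpu branch simply reuses the cuda entries. A trimmed sketch of that lookup, with placeholder thresholds rather than the real values:

import torch

# Placeholder thresholds; the real tables in the test define many more entries.
atols = {
    ("cuda", True, torch.float32): 1e-5,
    ("cuda", False, torch.float32): 1e-4,
}

def pick_atol(torch_device, enable_kernels, torch_dtype):
    if torch_device in ["cpu", "cuda"]:
        return atols[torch_device, enable_kernels, torch_dtype]
    elif torch_device == "hpu":
        # HPU reuses the CUDA thresholds, as in the diff above.
        return atols["cuda", enable_kernels, torch_dtype]
    raise KeyError(f"no tolerance entry for {torch_device}")

print(pick_atol("hpu", True, torch.float32))  # 1e-5, taken from the cuda entry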


@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# the test slower.
@require_torch_non_multi_accelerator
@run_test_using_subprocess
@run_first
@slow
def test_can_resume_training_lm(self):
# Check if it works for a simple language modeling example
@@ -3517,7 +3518,6 @@
)
@slow
@run_first
def test_trainer_eval_mrpc(self):
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3534,7 +3534,6 @@
self.assertLess(result["eval_loss"], 0.2)
@slow
@run_first
def test_trainer_eval_multiple(self):
MODEL_ID = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -4125,6 +4124,7 @@
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
@slow
@run_first
@require_non_hpu
@require_torch_multi_accelerator
def test_end_to_end_example(self):


@@ -22,6 +22,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
from transformers.training_args import ParallelMode
@@ -116,6 +117,7 @@ if is_torch_available():
class TestTrainerDistributed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@@ -199,8 +201,7 @@ if __name__ == "__main__":
model = RegressionModel()
training_args.per_device_train_batch_size = 1
training_args.max_steps = 1
training_args.accelerator_config = {
"dispatch_batches": False,
}
training_args.accelerator_config.dispatch_batches = False
trainer = Trainer(model, training_args, train_dataset=train_dataset)
trainer.train()


@@ -18,11 +18,13 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
class TestTrainerDistributedLoss(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)


@@ -18,6 +18,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
class TestTrainerDistributedWorkerSeed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)


@@ -58,6 +58,7 @@ from transformers.testing_utils import (
is_staging_test,
require_accelerate,
require_flax,
require_non_hpu,
require_read_token,
require_safetensors,
require_tf,
@@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
self.assertIsNotNone(model)
@require_non_hpu
@require_accelerate
@mark.accelerate_tests
@require_torch_multi_accelerator


@@ -21,7 +21,7 @@ import os
import sys
import transformers
from transformers import is_torch_xpu_available
from transformers import is_torch_hpu_available, is_torch_xpu_available
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -38,6 +38,9 @@ try:
accelerator = "CUDA"
elif is_torch_xpu_available():
accelerator = "XPU"
elif is_torch_hpu_available():
accelerator = "HPU"
print("Torch accelerator:", accelerator)
if accelerator == "CUDA":
@@ -48,6 +51,9 @@ try:
elif accelerator == "XPU":
print("SYCL version:", torch.version.xpu)
print("Number of XPUs available:", torch.xpu.device_count())
elif accelerator == "HPU":
print("HPU version:", torch.__version__.split("+")[-1])
print("Number of HPUs available:", torch.hpu.device_count())
except ImportError:
print("Torch version:", None)