new failure CI reports for all jobs (#38298)

* new failures

* report_repo_id

* report_repo_id

* report_repo_id

* More fixes

* More fixes

* More fixes

* ruff

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2025-05-24 19:15:02 +02:00 committed by GitHub
parent 31f8a0fe8a
commit d0c9c66d1c
11 changed files with 248 additions and 121 deletions

.github/workflows/{check_failed_model_tests.yml → check_failed_tests.yml}

@@ -9,6 +9,18 @@ on:
start_sha:
required: true
type: string
job:
required: true
type: string
slack_report_channel:
required: true
type: string
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
@@ -26,7 +38,7 @@ env:
jobs:
run_models_gpu:
check_new_failures:
name: " "
runs-on:
group: aws-g4dn-4xlarge-cache
@@ -36,17 +48,17 @@ jobs:
steps:
- uses: actions/download-artifact@v4
with:
name: ci_results_run_models_gpu
path: /transformers/ci_results_run_models_gpu
name: ci_results_${{ inputs.job }}
path: /transformers/ci_results_${{ inputs.job }}
- name: Check file
working-directory: /transformers
run: |
if [ -f ci_results_run_models_gpu/new_model_failures.json ]; then
echo "`ci_results_run_models_gpu/new_model_failures.json` exists, continue ..."
if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
echo "process=true" >> $GITHUB_ENV
else
echo "`ci_results_run_models_gpu/new_model_failures.json` doesn't exist, abort."
echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
echo "process=false" >> $GITHUB_ENV
fi
@@ -112,14 +124,14 @@ jobs:
- name: Check failed tests
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
- name: Show results
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: |
ls -l new_model_failures_with_bad_commit.json
cat new_model_failures_with_bad_commit.json
ls -l new_failures_with_bad_commit.json
cat new_failures_with_bad_commit.json
- name: Checkout back
working-directory: /transformers
@@ -134,6 +146,8 @@ jobs:
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
run: |
python3 utils/process_bad_commit_report.py
@@ -144,6 +158,8 @@ jobs:
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
run: |
{
echo 'REPORT_TEXT<<EOF'
@@ -151,17 +167,31 @@ jobs:
echo EOF
} >> "$GITHUB_ENV"
- name: Prepare Slack report title
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: |
pip install slack_sdk
echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
- name: Send processed report
if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: '#transformers-ci-feedback-tests'
channel-id: '#${{ inputs.slack_report_channel }}'
# For posting a rich message using Block Kit
payload: |
{
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "${{ env.title }}"
}
},
{
"type": "section",
"text": {

.github/workflows/self-scheduled-amd-mi210-caller.yml

@@ -19,6 +19,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -30,6 +31,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
example-ci:
@@ -41,6 +43,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -52,4 +55,5 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit

.github/workflows/self-scheduled-amd-mi250-caller.yml

@@ -19,6 +19,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -30,6 +31,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
example-ci:
@@ -41,6 +43,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -52,4 +55,5 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit

.github/workflows/self-scheduled-caller.yml

@@ -54,6 +54,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -65,6 +66,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
tf-pipeline:
@@ -76,6 +78,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
example-ci:
@@ -87,6 +90,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
trainer-fsdp-ci:
@@ -98,6 +102,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -110,6 +115,7 @@ jobs:
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
quantization-ci:
@@ -121,4 +127,5 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

.github/workflows/self-scheduled.yml

@@ -28,6 +28,10 @@ on:
default: ''
required: false
type: string
report_repo_id:
required: true
type: string
env:
HF_HOME: /mnt/cache
@@ -584,15 +588,22 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
secrets: inherit
check_new_model_failures:
if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
name: Check new model failures
check_new_failures:
# TODO: work on `run_quantization_torch_gpu`
if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job != 'run_quantization_torch_gpu' && needs.send_results.result == 'success' }}
name: Check new failures
needs: send_results
uses: ./.github/workflows/check_failed_model_tests.yml
uses: ./.github/workflows/check_failed_tests.yml
with:
docker: ${{ inputs.docker }}
start_sha: ${{ github.sha }}
job: ${{ inputs.job }}
slack_report_channel: ${{ inputs.slack_report_channel }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
secrets: inherit

.github/workflows/slack-report.yml

@@ -21,6 +21,9 @@ on:
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -67,6 +70,7 @@ jobs:
CI_SHA: ${{ github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
# We pass `needs.setup.outputs.matrix` as the argument. Processing in `notification_service.py` is required
# to change `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
# For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
@@ -96,6 +100,7 @@ jobs:
CI_SHA: ${{ github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
# We pass `needs.setup.outputs.quantization_matrix` as the argument. Processing in
# `notification_service_quantization.py` is required to change `quantization/bnb` to `quantization_bnb`,
# as the artifact names use `_` instead of `/`.
run: |
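Both notification scripts read the new values from the environment rather than taking CLI arguments; a minimal sketch of the consuming side (the exact reads appear in the Python diffs below, `REPORT_REPO_ID` in all three utils scripts and `JOB_NAME` in `utils/process_bad_commit_report.py`; the example values are illustrative):

import os

job_name = os.environ.get("JOB_NAME")         # e.g. "run_models_gpu"
report_repo_id = os.getenv("REPORT_REPO_ID")  # e.g. "hf-internal-testing/transformers_daily_ci"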

utils/check_bad_commit.py

@@ -39,13 +39,16 @@ import os
import subprocess
result = subprocess.run(
["python3", "-m", "pytest", "-v", f"{target_test}"],
["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"],
capture_output = True,
text=True,
)
print(result.stdout)
if len(result.stderr) > 0:
if f"PASSED {target_test}" in result.stdout:
print("test passed")
exit(0)
elif len(result.stderr) > 0:
if "ERROR: file or directory not found: " in result.stderr:
print("test file or directory not found in this commit")
exit(0)
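Assembled, the updated probe looks like the sketch below; `target_test` is a hypothetical test id (in the real script it comes from the failure report), and pytest's `-rfEp` flag adds passed tests to the short summary as `PASSED <test id>` lines, which is what the new stdout check relies on:

import subprocess

# Hypothetical test id, for illustration only.
target_test = "tests/models/bert/test_modeling_bert.py::BertModelTest::test_config"

result = subprocess.run(
    ["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"],
    capture_output=True,
    text=True,
)
print(result.stdout)

if f"PASSED {target_test}" in result.stdout:
    # A pass is now detected from stdout, so stray stderr output
    # no longer masks a passing test.
    print("test passed")
    exit(0)
elif len(result.stderr) > 0:
    if "ERROR: file or directory not found: " in result.stderr:
        # The test file does not exist at this commit.
        print("test file or directory not found in this commit")
        exit(0)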

utils/get_previous_daily_ci.py

@@ -28,11 +28,15 @@ def get_daily_ci_runs(token, num_runs=7, workflow_id=None):
url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
# On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results
url += f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}"
url += f"?branch=main&exclude_pull_requests=true&per_page={num_runs}"
result = requests.get(url, headers=headers).json()
result = requests.get(f"{url}&event=schedule", headers=headers).json()
workflow_runs = result["workflow_runs"]
if len(workflow_runs) == 0:
result = requests.get(f"{url}&event=workflow_run", headers=headers).json()
workflow_runs = result["workflow_runs"]
return result["workflow_runs"]
return workflow_runs
def get_last_daily_ci_run(token, workflow_run_id=None, workflow_id=None, commit_sha=None):
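Reassembled, the updated function reads as the sketch below (the `headers` construction is assumed from the surrounding file): it first asks for `schedule` events, which cover the Nvidia daily CI, and falls back to `workflow_run` events, which is how the AMD callers are triggered.

import requests

def get_daily_ci_runs(token, num_runs=7, workflow_id=None):
    headers = {"Authorization": f"Bearer {token}"}  # assumed; built earlier in the original file
    url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
    # On `main` branch + not returning PRs + only `num_runs` results
    url += f"?branch=main&exclude_pull_requests=true&per_page={num_runs}"
    result = requests.get(f"{url}&event=schedule", headers=headers).json()
    workflow_runs = result["workflow_runs"]
    if len(workflow_runs) == 0:
        # No scheduled runs found: the workflow was triggered by `workflow_run`
        # (the AMD CI callers), so query that event type instead.
        result = requests.get(f"{url}&event=workflow_run", headers=headers).json()
        workflow_runs = result["workflow_runs"]
    return workflow_runs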

utils/notification_service.py

@@ -30,8 +30,17 @@ from huggingface_hub import HfApi
from slack_sdk import WebClient
api = HfApi()
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
# A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of
# `additional_files`. This is used to remove entries in `additional_files` that are not relevant to a
# specific job. See below.
job_to_test_map = {
"run_models_gpu": "Models",
"run_trainer_and_fsdp_gpu": "Trainer & FSDP",
"run_pipelines_torch_gpu": "PyTorch pipelines",
"run_pipelines_tf_gpu": "TensorFlow pipelines",
"run_examples_gpu": "Examples directory",
"run_torch_cuda_extensions_gpu": "DeepSpeed",
}
NON_MODEL_TEST_MODULES = [
"deepspeed",
@@ -516,6 +525,7 @@ class Message:
if len(self.selected_warnings) > 0:
blocks.append(self.warnings)
new_failure_blocks = []
for idx, (prev_workflow_run_id, prev_ci_artifacts) in enumerate(
[self.prev_ci_artifacts] + self.other_ci_artifacts
):
@@ -524,13 +534,11 @@ class Message:
new_failure_blocks = self.get_new_model_failure_blocks(
prev_ci_artifacts=prev_ci_artifacts, with_header=False
)
if len(new_failure_blocks) > 0:
blocks.extend(new_failure_blocks)
# To save the list of new failures and upload it to Hub repositories
extra_blocks = self.get_new_model_failure_blocks(prev_ci_artifacts=prev_ci_artifacts, to_truncate=False)
if extra_blocks:
filename = "new_model_failures"
filename = "new_failures"
if idx > 0:
filename = f"{filename}_against_{prev_workflow_run_id}"
@@ -541,17 +549,17 @@ class Message:
# upload results to Hub dataset
file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.txt")
commit_info = api.upload_file(
_ = api.upload_file(
path_or_fileobj=file_path,
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.txt",
repo_id="hf-internal-testing/transformers_daily_ci",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/{filename}.txt"
# extra processing to save to json format
new_failed_tests = {}
nb_new_failed_tests = 0
for line in failure_text.split():
if "https://github.com/huggingface/transformers/actions/runs" in line:
pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>"
@@ -563,36 +571,56 @@ class Message:
model = line.split("/")[1]
if model not in new_failed_tests:
new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []}
for url, device in items:
for _, device in items:
new_failed_tests[model][f"{device}-gpu"].append(line)
nb_new_failed_tests += 1
file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json")
with open(file_path, "w", encoding="UTF-8") as fp:
json.dump(new_failed_tests, fp, ensure_ascii=False, indent=4)
# upload results to Hub dataset
file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json")
_ = api.upload_file(
commit_info = api.upload_file(
path_or_fileobj=file_path,
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.json",
repo_id="hf-internal-testing/transformers_daily_ci",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
new_failures_url = f"https://huggingface.co/datasets/{report_repo_id}/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/{filename}.json"
if idx == 0:
block = {
"type": "section",
"text": {
"type": "plain_text",
"text": " ",
"type": "mrkdwn",
"text": f"*There are {nb_new_failed_tests} new failed tests*\n\n(compared to previous run: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
},
"accessory": {
"type": "button",
"text": {"type": "plain_text", "text": "Check New model failures"},
"url": url,
"text": {"type": "plain_text", "text": "Check new failures"},
"url": new_failures_url,
},
}
blocks.append(block)
else:
block = {
"type": "section",
"text": {
"type": "mrkdwn",
# TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment.
"text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to Nvidia CI: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
},
"accessory": {
"type": "button",
"text": {"type": "plain_text", "text": "Check failures"},
"url": new_failures_url,
},
}
blocks.append(block)
if len(new_failure_blocks) > 0:
blocks.extend(new_failure_blocks)
return json.dumps(blocks)
@@ -717,14 +745,28 @@ class Message:
if prev_ci_artifacts is None:
return []
sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0])
if len(self.model_results) > 0:
target_results = self.model_results
else:
target_results = self.additional_results[job_to_test_map[job_name]]
# Make the format uniform between `model_results` and `additional_results[XXX]`
if "failures" in target_results:
target_results = {job_name: target_results}
sorted_dict = sorted(target_results.items(), key=lambda t: t[0])
job = job_to_test_map[job_name]
prev_model_results = {}
if (
f"ci_results_{job_name}" in prev_ci_artifacts
and "model_results.json" in prev_ci_artifacts[f"ci_results_{job_name}"]
and f"{test_to_result_name[job]}_results.json" in prev_ci_artifacts[f"ci_results_{job_name}"]
):
prev_model_results = json.loads(prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"])
prev_model_results = json.loads(
prev_ci_artifacts[f"ci_results_{job_name}"][f"{test_to_result_name[job]}_results.json"]
)
# Make the format uniform between `model_results` and `additional_results[XXX]`
if "failures" in prev_model_results:
prev_model_results = {job_name: prev_model_results}
all_failure_lines = {}
for job, job_result in sorted_dict:
@@ -751,7 +793,7 @@ class Message:
all_failure_lines[new_text].append(f"<{url}|{device}>" if url is not None else device)
MAX_ERROR_TEXT = 3000 - len("[Truncated]") - len("```New model failures```\n\n")
MAX_ERROR_TEXT = 3000 - len("[Truncated]") - len("```New failures```\n\n")
if not to_truncate:
MAX_ERROR_TEXT = float("inf")
failure_text = ""
@@ -768,10 +810,10 @@ class Message:
if failure_text:
if with_header:
blocks.append(
{"type": "header", "text": {"type": "plain_text", "text": "New model failures", "emoji": True}}
{"type": "header", "text": {"type": "plain_text", "text": "New failures", "emoji": True}}
)
else:
failure_text = f"*New model failures*\n\n{failure_text}"
failure_text = f"{failure_text}"
blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": failure_text}})
return blocks
@@ -927,6 +969,9 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
if __name__ == "__main__":
api = HfApi()
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"]
# runner_status = os.environ.get("RUNNER_STATUS")
@@ -1157,15 +1202,7 @@ if __name__ == "__main__":
elif ci_event.startswith("Push CI (AMD)"):
additional_files = {}
# A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of
# `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a
# specific job. See below.
job_to_test_map = {
"run_pipelines_torch_gpu": "PyTorch pipelines",
"run_pipelines_tf_gpu": "TensorFlow pipelines",
"run_examples_gpu": "Examples directory",
"run_torch_cuda_extensions_gpu": "DeepSpeed",
}
report_repo_id = os.getenv("REPORT_REPO_ID")
# if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
report_repo_subfolder = ""
@@ -1258,81 +1295,100 @@ if __name__ == "__main__":
os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml"
amd_daily_ci_workflows = (
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi210-caller.yml",
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi250-caller.yml",
)
is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow)
is_amd_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(amd_daily_ci_workflows)
is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule"
# For AMD workflow runs: the different AMD CI callers (MI210/MI250/MI300, etc.) are triggered by the
# `workflow_run` event of `.github/workflows/self-scheduled-amd-caller.yml`.
if is_amd_daily_ci_workflow:
# Get the path to the file on the runner that contains the full event webhook payload.
event_payload_path = os.environ.get("GITHUB_EVENT_PATH")
# Load the event payload
with open(event_payload_path) as fp:
event_payload = json.load(fp)
# The event that triggers the `workflow_run` event.
if "workflow_run" in event_payload:
is_scheduled_ci_run = event_payload["event"] == "schedule"
# Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as
# results.
if job_name == "run_models_gpu":
with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp:
json.dump(model_results, fp, indent=4, ensure_ascii=False)
api.upload_file(
path_or_fileobj=f"ci_results_{job_name}/model_results.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_results.json",
repo_id="hf-internal-testing/transformers_daily_ci",
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
# Let's create a file contain job --> job link
model_job_links = {}
sorted_dict = sorted(model_results.items(), key=lambda t: t[0])
for job, job_result in sorted_dict:
model_name = job
if model_name.startswith("models_"):
model_name = model_name[len("models_") :]
model_job_links[model_name] = job_result["job_link"]
with open(f"ci_results_{job_name}/model_job_links.json", "w", encoding="UTF-8") as fp:
json.dump(model_job_links, fp, indent=4, ensure_ascii=False)
api.upload_file(
path_or_fileobj=f"ci_results_{job_name}/model_job_links.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_job_links.json",
repo_id="hf-internal-testing/transformers_daily_ci",
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
# Must have the same keys as in `additional_results`.
# The values are used as the file names under which the corresponding CI job results are saved.
test_to_result_name = {
"Models": "model",
"Trainer & FSDP": "trainer_and_fsdp",
"PyTorch pipelines": "torch_pipeline",
"TensorFlow pipelines": "tf_pipeline",
"Examples directory": "example",
"DeepSpeed": "deepspeed",
}
for job, job_result in additional_results.items():
with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp:
json.dump(job_result, fp, indent=4, ensure_ascii=False)
test_name_and_result_pairs = []
if len(model_results) > 0:
test_name = job_to_test_map[job_name]
test_name_and_result_pairs.append((test_name, model_results))
for test_name, result in additional_results.items():
test_name_and_result_pairs.append((test_name, result))
for test_name, result in test_name_and_result_pairs:
with open(f"ci_results_{job_name}/{test_to_result_name[test_name]}_results.json", "w", encoding="UTF-8") as fp:
json.dump(result, fp, indent=4, ensure_ascii=False)
api.upload_file(
path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[job]}_results.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{test_to_result_name[job]}_results.json",
repo_id="hf-internal-testing/transformers_daily_ci",
path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[test_name]}_results.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{test_to_result_name[test_name]}_results.json",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
# Let's create a file containing job --> job link
if len(model_results) > 0:
target_results = model_results
else:
target_results = additional_results[job_to_test_map[job_name]]
# Make the format uniform between `model_results` and `additional_results[XXX]`
if "failures" in target_results:
target_results = {job_name: target_results}
job_links = {}
sorted_dict = sorted(target_results.items(), key=lambda t: t[0])
for job, job_result in sorted_dict:
if job.startswith("models_"):
job = job[len("models_") :]
job_links[job] = job_result["job_link"]
with open(f"ci_results_{job_name}/job_links.json", "w", encoding="UTF-8") as fp:
json.dump(job_links, fp, indent=4, ensure_ascii=False)
api.upload_file(
path_or_fileobj=f"ci_results_{job_name}/job_links.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/job_links.json",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
prev_workflow_run_id = None
other_workflow_run_ids = []
if is_scheduled_ci_run:
# TODO: remove `if job_name == "run_models_gpu"`
if job_name == "run_models_gpu":
prev_workflow_run_id = get_last_daily_ci_workflow_run_id(
token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=workflow_id
prev_workflow_run_id = get_last_daily_ci_workflow_run_id(
token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=workflow_id
)
# For a scheduled run that is not Nvidia's scheduled daily CI, add Nvidia's scheduled daily CI run as a target to compare.
if not is_nvidia_daily_ci_workflow:
# The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it).
other_workflow_id = "90575235"
# We need to get Nvidia's scheduled daily CI run that matches the current run (i.e. the run with the same commit SHA)
other_workflow_run_id = get_last_daily_ci_workflow_run_id(
token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha
)
# For a scheduled run that is not Nvidia's scheduled daily CI, add Nvidia's scheduled daily CI run as a target to compare.
if not is_nvidia_daily_ci_workflow:
# The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it).
other_workflow_id = "90575235"
# We need to get Nvidia's scheduled daily CI run that matches the current run (i.e. the run with the same commit SHA)
other_workflow_run_id = get_last_daily_ci_workflow_run_id(
token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha
)
other_workflow_run_ids.append(other_workflow_run_id)
other_workflow_run_ids.append(other_workflow_run_id)
else:
prev_workflow_run_id = os.environ["PREV_WORKFLOW_RUN_ID"]
other_workflow_run_id = os.environ["OTHER_WORKFLOW_RUN_ID"]
@@ -1359,13 +1415,6 @@ if __name__ == "__main__":
else:
other_ci_artifacts.append((target_workflow_run_id, ci_artifacts))
job_to_test_map.update(
{
"run_models_gpu": "Models",
"run_trainer_and_fsdp_gpu": "Trainer & FSDP",
}
)
ci_name_in_report = ""
if job_name in job_to_test_map:
ci_name_in_report = job_to_test_map[job_name]
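The two dicts above chain together: `job_to_test_map` turns a workflow job name into a human-readable test name, and `test_to_result_name` turns that test name into the stem of the results file. A hypothetical walk-through, using values taken from the two maps:

job_name = "run_pipelines_torch_gpu"   # ${{ inputs.job }} in the workflows
test_name = job_to_test_map[job_name]  # "PyTorch pipelines"
stem = test_to_result_name[test_name]  # "torch_pipeline"
path = f"ci_results_{job_name}/{stem}_results.json"
print(path)  # ci_results_run_pipelines_torch_gpu/torch_pipeline_results.json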

utils/notification_service_quantization.py

@@ -274,11 +274,13 @@ if __name__ == "__main__":
with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp:
json.dump(quantization_results, fp, indent=4, ensure_ascii=False)
report_repo_id = os.getenv("REPORT_REPO_ID")
# upload results to Hub dataset (only for the scheduled daily CI run on `main`)
api.upload_file(
path_or_fileobj=f"ci_results_{job_name}/quantization_results.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/quantization_results.json",
repo_id="hf-internal-testing/transformers_daily_ci",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)

utils/process_bad_commit_report.py

@@ -1,4 +1,4 @@
"""An internal script to process `new_model_failures_with_bad_commit.json` produced by `utils/check_bad_commit.py`.
"""An internal script to process `new_failures_with_bad_commit.json` produced by `utils/check_bad_commit.py`.
This is used by `.github/workflows/check_failed_model_tests.yml` to produce a slack report of the following form
@@ -24,11 +24,13 @@ from huggingface_hub import HfApi
if __name__ == "__main__":
api = HfApi()
with open("new_model_failures_with_bad_commit.json") as fp:
job_name = os.environ.get("JOB_NAME")
with open("new_failures_with_bad_commit.json") as fp:
data = json.load(fp)
with open("ci_results_run_models_gpu/model_job_links.json") as fp:
model_job_links = json.load(fp)
with open(f"ci_results_{job_name}/job_links.json") as fp:
job_links = json.load(fp)
# TODO: extend
team_members = [
@@ -67,7 +69,11 @@ if __name__ == "__main__":
for device, failed_tests in model_result.items():
# Prepare the job link and add it to each entry of the new failed test information.
# The device key changes from `single-gpu` to `single` (and `multi-gpu` to `multi`) to match `job_link`.
job_link = model_job_links[model][device.replace("-gpu", "")]
key = model
if list(job_links.keys()) == [job_name]:
key = job_name
job_link = job_links[key][device.replace("-gpu", "")]
failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author]
for x in failed_tests:
x.update({"job_link": job_link})
@@ -92,16 +98,18 @@ if __name__ == "__main__":
if report_repo_subfolder:
report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"
with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp:
report_repo_id = os.getenv("REPORT_REPO_ID")
with open("new_failures_with_bad_commit_grouped_by_authors.json", "w") as fp:
json.dump(new_data_full, fp, ensure_ascii=False, indent=4)
commit_info = api.upload_file(
path_or_fileobj="new_model_failures_with_bad_commit_grouped_by_authors.json",
path_in_repo=f"{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json",
repo_id="hf-internal-testing/transformers_daily_ci",
path_or_fileobj="new_failures_with_bad_commit_grouped_by_authors.json",
path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json",
repo_id=report_repo_id,
repo_type="dataset",
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
)
url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json"
url = f"https://huggingface.co/datasets/{report_repo_id}/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json"
# Add `GH_` prefix as keyword mention
output = {}
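The `key` fallback above exists because `job_links.json` is keyed differently depending on the job: for `run_models_gpu` the keys are model names, while for a single-suite job the only key is the job name itself. A hypothetical illustration of the two shapes (the URLs are invented):

# run_models_gpu: keyed by model name
#   {"bert": {"single": "https://.../job/1", "multi": "https://.../job/2"}, ...}
# run_examples_gpu (single suite): keyed by the job name
#   {"run_examples_gpu": {"single": "https://.../job/3", "multi": "https://.../job/4"}}

key = model
if list(job_links.keys()) == [job_name]:
    key = job_name
job_link = job_links[key][device.replace("-gpu", "")]  # "single-gpu" -> "single"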