diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_tests.yml similarity index 76% rename from .github/workflows/check_failed_model_tests.yml rename to .github/workflows/check_failed_tests.yml index 653b50e4cf6..478f9d0ae2a 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_tests.yml @@ -9,6 +9,18 @@ on: start_sha: required: true type: string + job: + required: true + type: string + slack_report_channel: + required: true + type: string + ci_event: + required: true + type: string + report_repo_id: + required: true + type: string env: @@ -26,7 +38,7 @@ env: jobs: - run_models_gpu: + check_new_failures: name: " " runs-on: group: aws-g4dn-4xlarge-cache @@ -36,17 +48,17 @@ jobs: steps: - uses: actions/download-artifact@v4 with: - name: ci_results_run_models_gpu - path: /transformers/ci_results_run_models_gpu + name: ci_results_${{ inputs.job }} + path: /transformers/ci_results_${{ inputs.job }} - name: Check file working-directory: /transformers run: | - if [ -f ci_results_run_models_gpu/new_model_failures.json ]; then - echo "`ci_results_run_models_gpu/new_model_failures.json` exists, continue ..." + if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then + echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..." echo "process=true" >> $GITHUB_ENV else - echo "`ci_results_run_models_gpu/new_model_failures.json` doesn't exist, abort." + echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort." 
echo "process=false" >> $GITHUB_ENV fi @@ -112,14 +124,14 @@ jobs: - name: Check failed tests working-directory: /transformers if: ${{ env.process == 'true' }} - run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json + run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json - name: Show results working-directory: /transformers if: ${{ env.process == 'true' }} run: | - ls -l new_model_failures_with_bad_commit.json - cat new_model_failures_with_bad_commit.json + ls -l new_failures_with_bad_commit.json + cat new_failures_with_bad_commit.json - name: Checkout back working-directory: /transformers @@ -134,6 +146,8 @@ jobs: env: ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + JOB_NAME: ${{ inputs.job }} + REPORT_REPO_ID: ${{ inputs.report_repo_id }} run: | python3 utils/process_bad_commit_report.py @@ -144,6 +158,8 @@ jobs: env: ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + JOB_NAME: ${{ inputs.job }} + REPORT_REPO_ID: ${{ inputs.report_repo_id }} run: | { echo 'REPORT_TEXT<<EOF' python3 utils/process_bad_commit_report.py echo EOF } >> "$GITHUB_ENV" + - name: Prepare Slack report title + working-directory: /transformers + if: ${{ env.process == 'true' }} + run: | + pip install slack_sdk + echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV + -
name: Send processed report if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }} uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 with: # Slack channel id, channel name, or user id to post message. # See also: https://api.slack.com/methods/chat.postMessage#channels - channel-id: '#transformers-ci-feedback-tests' + channel-id: '#${{ inputs.slack_report_channel }}' # For posting a rich message using Block Kit payload: | { "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "${{ env.title }}" + } + }, { "type": "section", "text": { diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index 6109faca009..8c75f453b48 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -19,6 +19,7 @@ jobs: runner: mi210 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi210 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit torch-pipeline: @@ -30,6 +31,7 @@ jobs: runner: mi210 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi210 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit example-ci: @@ -41,6 +43,7 @@ jobs: runner: mi210 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi210 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit deepspeed-ci: @@ -52,4 +55,5 @@ jobs: runner: mi210 docker: huggingface/transformers-pytorch-deepspeed-amd-gpu ci_event: Scheduled CI (AMD) - mi210 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index 4c6284a78cd..476fba31ee2 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ 
-19,6 +19,7 @@ jobs: runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit torch-pipeline: @@ -30,6 +31,7 @@ jobs: runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit example-ci: @@ -41,6 +43,7 @@ jobs: runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit deepspeed-ci: @@ -52,4 +55,5 @@ jobs: runner: mi250 docker: huggingface/transformers-pytorch-deepspeed-amd-gpu ci_event: Scheduled CI (AMD) - mi250 + report_repo_id: optimum-amd/transformers_daily_ci secrets: inherit diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 77b33850fe4..f48d357cd5d 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -54,6 +54,7 @@ jobs: runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit torch-pipeline: @@ -65,6 +66,7 @@ jobs: runner: daily-ci docker: huggingface/transformers-pytorch-gpu ci_event: Daily CI + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit tf-pipeline: @@ -76,6 +78,7 @@ jobs: runner: daily-ci docker: huggingface/transformers-tensorflow-gpu ci_event: Daily CI + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit example-ci: @@ -87,6 +90,7 @@ jobs: runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit trainer-fsdp-ci: @@ -98,6 +102,7 @@ jobs: runner: daily-ci docker: huggingface/transformers-all-latest-gpu ci_event: Daily CI + report_repo_id: 
hf-internal-testing/transformers_daily_ci secrets: inherit deepspeed-ci: @@ -110,6 +115,7 @@ jobs: docker: huggingface/transformers-pytorch-deepspeed-latest-gpu ci_event: Daily CI working-directory-prefix: /workspace + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit quantization-ci: @@ -121,4 +127,5 @@ jobs: runner: daily-ci docker: huggingface/transformers-quantization-latest-gpu ci_event: Daily CI + report_repo_id: hf-internal-testing/transformers_daily_ci secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 1198148fd63..5fc037fec20 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -28,6 +28,10 @@ on: default: '' required: false type: string + report_repo_id: + required: true + type: string + env: HF_HOME: /mnt/cache @@ -584,15 +588,22 @@ jobs: folder_slices: ${{ needs.setup.outputs.folder_slices }} quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} ci_event: ${{ inputs.ci_event }} + report_repo_id: ${{ inputs.report_repo_id }} secrets: inherit - check_new_model_failures: - if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }} - name: Check new model failures + check_new_failures: + # TODO: work on `run_quantization_torch_gpu` + if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job != 'run_quantization_torch_gpu' && needs.send_results.result == 'success' }} + name: Check new failures needs: send_results - uses: ./.github/workflows/check_failed_model_tests.yml + uses: ./.github/workflows/check_failed_tests.yml with: docker: ${{ inputs.docker }} start_sha: ${{ github.sha }} + job: ${{ inputs.job }} + slack_report_channel: ${{ inputs.slack_report_channel }} + ci_event: ${{ inputs.ci_event }} + report_repo_id: ${{ inputs.report_repo_id }} + secrets: inherit diff --git a/.github/workflows/slack-report.yml 
b/.github/workflows/slack-report.yml index bea113ca031..c6aa336e8f4 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -21,6 +21,9 @@ on: ci_event: required: true type: string + report_repo_id: + required: true + type: string env: TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} @@ -67,6 +70,7 @@ jobs: CI_SHA: ${{ github.sha }} CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} + REPORT_REPO_ID: ${{ inputs.report_repo_id }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an @@ -96,6 +100,7 @@ jobs: CI_SHA: ${{ github.sha }} CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} + REPORT_REPO_ID: ${{ inputs.report_repo_id }} # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. 
run: | diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index 5d21b1c4651..a251b954150 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -39,13 +39,16 @@ import os import subprocess result = subprocess.run( - ["python3", "-m", "pytest", "-v", f"{target_test}"], + ["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"], capture_output = True, text=True, ) print(result.stdout) -if len(result.stderr) > 0: +if f"PASSED {target_test}" in result.stdout: + print("test passed") + exit(0) +elif len(result.stderr) > 0: if "ERROR: file or directory not found: " in result.stderr: print("test file or directory not found in this commit") exit(0) diff --git a/utils/get_previous_daily_ci.py b/utils/get_previous_daily_ci.py index c9248facf91..83828d645de 100644 --- a/utils/get_previous_daily_ci.py +++ b/utils/get_previous_daily_ci.py @@ -28,11 +28,15 @@ def get_daily_ci_runs(token, num_runs=7, workflow_id=None): url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs" # On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results - url += f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}" + url += f"?branch=main&exclude_pull_requests=true&per_page={num_runs}" - result = requests.get(url, headers=headers).json() + result = requests.get(f"{url}&event=schedule", headers=headers).json() + workflow_runs = result["workflow_runs"] + if len(workflow_runs) == 0: + result = requests.get(f"{url}&event=workflow_run", headers=headers).json() + workflow_runs = result["workflow_runs"] - return result["workflow_runs"] + return workflow_runs def get_last_daily_ci_run(token, workflow_run_id=None, workflow_id=None, commit_sha=None): diff --git a/utils/notification_service.py b/utils/notification_service.py index 407ee47e592..96f4370da57 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -30,8 +30,17 @@ from huggingface_hub 
import HfApi from slack_sdk import WebClient -api = HfApi() -client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"]) +# A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of +# `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a +# specific job. See below. +job_to_test_map = { + "run_models_gpu": "Models", + "run_trainer_and_fsdp_gpu": "Trainer & FSDP", + "run_pipelines_torch_gpu": "PyTorch pipelines", + "run_pipelines_tf_gpu": "TensorFlow pipelines", + "run_examples_gpu": "Examples directory", + "run_torch_cuda_extensions_gpu": "DeepSpeed", +} NON_MODEL_TEST_MODULES = [ "deepspeed", @@ -516,6 +525,7 @@ class Message: if len(self.selected_warnings) > 0: blocks.append(self.warnings) + new_failure_blocks = [] for idx, (prev_workflow_run_id, prev_ci_artifacts) in enumerate( [self.prev_ci_artifacts] + self.other_ci_artifacts ): @@ -524,13 +534,11 @@ class Message: new_failure_blocks = self.get_new_model_failure_blocks( prev_ci_artifacts=prev_ci_artifacts, with_header=False ) - if len(new_failure_blocks) > 0: - blocks.extend(new_failure_blocks) # To save the list of new model failures and uploaed to hub repositories extra_blocks = self.get_new_model_failure_blocks(prev_ci_artifacts=prev_ci_artifacts, to_truncate=False) if extra_blocks: - filename = "new_model_failures" + filename = "new_failures" if idx > 0: filename = f"{filename}_against_{prev_workflow_run_id}" @@ -541,17 +549,17 @@ class Message: # upload results to Hub dataset file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.txt") - commit_info = api.upload_file( + _ = api.upload_file( path_or_fileobj=file_path, path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.txt", - repo_id="hf-internal-testing/transformers_daily_ci", + repo_id=report_repo_id, repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) - url =
f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/{filename}.txt" # extra processing to save to json format new_failed_tests = {} + nb_new_failed_tests = 0 for line in failure_text.split(): if "https://github.com/huggingface/transformers/actions/runs" in line: pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>" @@ -563,36 +571,56 @@ class Message: model = line.split("/")[1] if model not in new_failed_tests: new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} - for url, device in items: + for _, device in items: new_failed_tests[model][f"{device}-gpu"].append(line) + nb_new_failed_tests += 1 file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json") with open(file_path, "w", encoding="UTF-8") as fp: json.dump(new_failed_tests, fp, ensure_ascii=False, indent=4) # upload results to Hub dataset file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json") - _ = api.upload_file( + commit_info = api.upload_file( path_or_fileobj=file_path, path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.json", - repo_id="hf-internal-testing/transformers_daily_ci", + repo_id=report_repo_id, repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) + new_failures_url = f"https://huggingface.co/datasets/{report_repo_id}/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/{filename}.json" if idx == 0: block = { "type": "section", "text": { - "type": "plain_text", - "text": " ", + "type": "mrkdwn", + "text": f"*There are {nb_new_failed_tests} new failed tests*\n\n(compared to previous run: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)", }, "accessory": { "type": "button", - "text": {"type": "plain_text", "text": "Check New model failures"}, - "url": url, + "text": {"type": "plain_text", "text": "Check new failures"}, + "url": new_failures_url, }, } blocks.append(block) + else: + block = { +
"type": "section", + "text": { + "type": "mrkdwn", + # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment. + "text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to Nvidia CI: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)", + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check failures"}, + "url": new_failures_url, + }, + } + blocks.append(block) + + if len(new_failure_blocks) > 0: + blocks.extend(new_failure_blocks) return json.dumps(blocks) @@ -717,14 +745,28 @@ class Message: if prev_ci_artifacts is None: return [] - sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0]) + if len(self.model_results) > 0: + target_results = self.model_results + else: + target_results = self.additional_results[job_to_test_map[job_name]] + # Make the format uniform between `model_results` and `additional_results[XXX]` + if "failures" in target_results: + target_results = {job_name: target_results} + sorted_dict = sorted(target_results.items(), key=lambda t: t[0]) + + job = job_to_test_map[job_name] prev_model_results = {} if ( f"ci_results_{job_name}" in prev_ci_artifacts - and "model_results.json" in prev_ci_artifacts[f"ci_results_{job_name}"] + and f"{test_to_result_name[job]}_results.json" in prev_ci_artifacts[f"ci_results_{job_name}"] ): - prev_model_results = json.loads(prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"]) + prev_model_results = json.loads( + prev_ci_artifacts[f"ci_results_{job_name}"][f"{test_to_result_name[job]}_results.json"] + ) + # Make the format uniform between `model_results` and `additional_results[XXX]` + if "failures" in prev_model_results: + prev_model_results = {job_name: prev_model_results} all_failure_lines = {} for job, job_result in sorted_dict: @@ -751,7 +793,7 @@ class Message: all_failure_lines[new_text].append(f"<{url}|{device}>" if url is not None else device) - MAX_ERROR_TEXT = 3000 - len("[Truncated]") - len("```New model failures```\n\n") +
MAX_ERROR_TEXT = 3000 - len("[Truncated]") - len("```New failures```\n\n") if not to_truncate: MAX_ERROR_TEXT = float("inf") failure_text = "" @@ -768,10 +810,10 @@ class Message: if failure_text: if with_header: blocks.append( - {"type": "header", "text": {"type": "plain_text", "text": "New model failures", "emoji": True}} + {"type": "header", "text": {"type": "plain_text", "text": "New failures", "emoji": True}} ) else: - failure_text = f"*New model failures*\n\n{failure_text}" + failure_text = f"{failure_text}" blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": failure_text}}) return blocks @@ -927,6 +969,9 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any: if __name__ == "__main__": + api = HfApi() + client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"]) + SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"] # runner_status = os.environ.get("RUNNER_STATUS") @@ -1157,15 +1202,7 @@ if __name__ == "__main__": elif ci_event.startswith("Push CI (AMD)"): additional_files = {} - # A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of - # `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a - # specific job. See below. 
- job_to_test_map = { - "run_pipelines_torch_gpu": "PyTorch pipelines", - "run_pipelines_tf_gpu": "TensorFlow pipelines", - "run_examples_gpu": "Examples directory", - "run_torch_cuda_extensions_gpu": "DeepSpeed", - } + report_repo_id = os.getenv("REPORT_REPO_ID") # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder` report_repo_subfolder = "" @@ -1258,81 +1295,100 @@ if __name__ == "__main__": os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}")) nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml" + amd_daily_ci_workflows = ( + "huggingface/transformers/.github/workflows/self-scheduled-amd-mi210-caller.yml", + "huggingface/transformers/.github/workflows/self-scheduled-amd-mi250-caller.yml", + ) is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow) + is_amd_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(amd_daily_ci_workflows) + is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule" + # For AMD workflow runs: the different AMD CI callers (MI210/MI250/MI300, etc.) are triggered by `workflow_run` + # event of `.github/workflows/self-scheduled-amd-caller.yml`. + if is_amd_daily_ci_workflow: + # Get the path to the file on the runner that contains the full event webhook payload. + event_payload_path = os.environ.get("GITHUB_EVENT_PATH") + # Load the event payload + with open(event_payload_path) as fp: + event_payload = json.load(fp) + # The event that triggers the `workflow_run` event. + if "workflow_run" in event_payload: + is_scheduled_ci_run = event_payload["event"] == "schedule" - # Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as - # results. 
- if job_name == "run_models_gpu": - with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp: - json.dump(model_results, fp, indent=4, ensure_ascii=False) - - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/model_results.json", - path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) - - # Let's create a file contain job --> job link - model_job_links = {} - sorted_dict = sorted(model_results.items(), key=lambda t: t[0]) - for job, job_result in sorted_dict: - model_name = job - if model_name.startswith("models_"): - model_name = model_name[len("models_") :] - model_job_links[model_name] = job_result["job_link"] - - with open(f"ci_results_{job_name}/model_job_links.json", "w", encoding="UTF-8") as fp: - json.dump(model_job_links, fp, indent=4, ensure_ascii=False) - - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/model_job_links.json", - path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_job_links.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) - - # Must have the same keys as in `additional_results`. # The values are used as the file names where to save the corresponding CI job results. 
test_to_result_name = { + "Models": "model", + "Trainer & FSDP": "trainer_and_fsdp", "PyTorch pipelines": "torch_pipeline", "TensorFlow pipelines": "tf_pipeline", "Examples directory": "example", "DeepSpeed": "deepspeed", } - for job, job_result in additional_results.items(): - with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp: - json.dump(job_result, fp, indent=4, ensure_ascii=False) + + test_name_and_result_pairs = [] + if len(model_results) > 0: + test_name = job_to_test_map[job_name] + test_name_and_result_pairs.append((test_name, model_results)) + + for test_name, result in additional_results.items(): + test_name_and_result_pairs.append((test_name, result)) + + for test_name, result in test_name_and_result_pairs: + with open(f"ci_results_{job_name}/{test_to_result_name[test_name]}_results.json", "w", encoding="UTF-8") as fp: + json.dump(result, fp, indent=4, ensure_ascii=False) api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", - path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{test_to_result_name[job]}_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", + path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[test_name]}_results.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{test_to_result_name[test_name]}_results.json", + repo_id=report_repo_id, repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) + # Let's create a file contain job --> job link + if len(model_results) > 0: + target_results = model_results + else: + target_results = additional_results[job_to_test_map[job_name]] + + # Make the format uniform between `model_results` and `additional_results[XXX]` + if "failures" in target_results: + target_results = {job_name: target_results} + + job_links = {} + sorted_dict = sorted(target_results.items(), key=lambda t: t[0]) + for job, job_result in sorted_dict: + 
if job.startswith("models_"): + job = job[len("models_") :] + job_links[job] = job_result["job_link"] + + with open(f"ci_results_{job_name}/job_links.json", "w", encoding="UTF-8") as fp: + json.dump(job_links, fp, indent=4, ensure_ascii=False) + + api.upload_file( + path_or_fileobj=f"ci_results_{job_name}/job_links.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/job_links.json", + repo_id=report_repo_id, + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + prev_workflow_run_id = None other_workflow_run_ids = [] if is_scheduled_ci_run: - # TODO: remove `if job_name == "run_models_gpu"` - if job_name == "run_models_gpu": - prev_workflow_run_id = get_last_daily_ci_workflow_run_id( - token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=workflow_id + prev_workflow_run_id = get_last_daily_ci_workflow_run_id( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=workflow_id + ) + # For a scheduled run that is not the Nvidia's scheduled daily CI, add Nvidia's scheduled daily CI run as a target to compare. + if not is_nvidia_daily_ci_workflow: + # The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it). + other_workflow_id = "90575235" + # We need to get the Nvidia's scheduled daily CI run that match the current run (i.e. run with the same commit SHA) + other_workflow_run_id = get_last_daily_ci_workflow_run_id( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha ) - # For a scheduled run that is not the Nvidia's scheduled daily CI, add Nvidia's scheduled daily CI run as a target to compare. - if not is_nvidia_daily_ci_workflow: - # The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it). - other_workflow_id = "90575235" - # We need to get the Nvidia's scheduled daily CI run that match the current run (i.e. 
run with the same commit SHA) - other_workflow_run_id = get_last_daily_ci_workflow_run_id( - token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha - ) - other_workflow_run_ids.append(other_workflow_run_id) + other_workflow_run_ids.append(other_workflow_run_id) else: prev_workflow_run_id = os.environ["PREV_WORKFLOW_RUN_ID"] other_workflow_run_id = os.environ["OTHER_WORKFLOW_RUN_ID"] @@ -1359,13 +1415,6 @@ if __name__ == "__main__": else: other_ci_artifacts.append((target_workflow_run_id, ci_artifacts)) - job_to_test_map.update( - { - "run_models_gpu": "Models", - "run_trainer_and_fsdp_gpu": "Trainer & FSDP", - } - ) - ci_name_in_report = "" if job_name in job_to_test_map: ci_name_in_report = job_to_test_map[job_name] diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py index dc9678c7812..b533a7a9cf1 100644 --- a/utils/notification_service_quantization.py +++ b/utils/notification_service_quantization.py @@ -274,11 +274,13 @@ if __name__ == "__main__": with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp: json.dump(quantization_results, fp, indent=4, ensure_ascii=False) + report_repo_id = os.getenv("REPORT_REPO_ID") + # upload results to Hub dataset (only for the scheduled daily CI run on `main`) api.upload_file( path_or_fileobj=f"ci_results_{job_name}/quantization_results.json", path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/quantization_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", + repo_id=report_repo_id, repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index 50c338b6335..432291faec2 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -1,4 +1,4 @@ -"""An internal script to process `new_model_failures_with_bad_commit.json` produced by 
`utils/check_bad_commit.py`. +"""An internal script to process `new_failures_with_bad_commit.json` produced by `utils/check_bad_commit.py`. This is used by `.github/workflows/check_failed_model_tests.yml` to produce a slack report of the following form @@ -24,11 +24,13 @@ from huggingface_hub import HfApi if __name__ == "__main__": api = HfApi() - with open("new_model_failures_with_bad_commit.json") as fp: + job_name = os.environ.get("JOB_NAME") + + with open("new_failures_with_bad_commit.json") as fp: data = json.load(fp) - with open("ci_results_run_models_gpu/model_job_links.json") as fp: - model_job_links = json.load(fp) + with open(f"ci_results_{job_name}/job_links.json") as fp: + job_links = json.load(fp) # TODO: extend team_members = [ @@ -67,7 +69,11 @@ if __name__ == "__main__": for device, failed_tests in model_result.items(): # prepare job_link and add it to each entry of new failed test information. # need to change from `single-gpu` to `single` and same for `multi-gpu` to match `job_link`. 
- job_link = model_job_links[model][device.replace("-gpu", "")] + key = model + if list(job_links.keys()) == [job_name]: + key = job_name + job_link = job_links[key][device.replace("-gpu", "")] + failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author] for x in failed_tests: x.update({"job_link": job_link}) @@ -92,16 +98,18 @@ if __name__ == "__main__": if report_repo_subfolder: report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}" - with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: + report_repo_id = os.getenv("REPORT_REPO_ID") + + with open("new_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: json.dump(new_data_full, fp, ensure_ascii=False, indent=4) commit_info = api.upload_file( - path_or_fileobj="new_model_failures_with_bad_commit_grouped_by_authors.json", - path_in_repo=f"{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json", - repo_id="hf-internal-testing/transformers_daily_ci", + path_or_fileobj="new_failures_with_bad_commit_grouped_by_authors.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json", + repo_id=report_repo_id, repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) - url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json" + url = f"https://huggingface.co/datasets/{report_repo_id}/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit_grouped_by_authors.json" # Add `GH_` prefix as keyword mention output = {}