diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 5fc037fec20..36c113190ca 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -593,8 +593,7 @@ jobs:
     secrets: inherit

   check_new_failures:
-    # TODO: work on `run_quantization_torch_gpu`
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job != 'run_quantization_torch_gpu' && needs.send_results.result == 'success' }}
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
     name: Check new failures
     needs: send_results
     uses: ./.github/workflows/check_failed_tests.yml
diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml
index c6aa336e8f4..5ef74946964 100644
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@@ -58,7 +58,7 @@ jobs:
         fi

     - name: Send message to Slack
-      if: ${{ inputs.job != 'run_quantization_torch_gpu' }}
+      shell: bash
       env:
         CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
         CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@@ -79,7 +79,11 @@ jobs:
        pip install huggingface_hub
        pip install slack_sdk
        pip show slack_sdk
-       python utils/notification_service.py "${{ inputs.folder_slices }}"
+       if [ "${{ inputs.quantization_matrix }}" != "" ]; then
+         python utils/notification_service.py "${{ inputs.quantization_matrix }}"
+       else
+         python utils/notification_service.py "${{ inputs.folder_slices }}"
+       fi

     # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
     - name: Failure table artifacts
@@ -87,32 +91,3 @@
       with:
         name: ci_results_${{ inputs.job }}
         path: ci_results_${{ inputs.job }}
-
-    - uses: actions/checkout@v4
-    - uses: actions/download-artifact@v4
-    - name: Send message to Slack for quantization workflow
-      if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-      env:
-        CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-        ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-        SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
-        CI_EVENT: ${{ inputs.ci_event }}
-        CI_SHA: ${{ github.sha }}
-        CI_TEST_JOB: ${{ inputs.job }}
-        SETUP_STATUS: ${{ inputs.setup_status }}
-        REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-      # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
-      # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
-      run: |
-        pip install huggingface_hub
-        pip install slack_sdk
-        pip show slack_sdk
-        python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
-
-    # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-    - name: Failure table artifacts
-      if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: ci_results_${{ inputs.job }}
-        path: ci_results_${{ inputs.job }}
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 82e6cd1add5..5c54809b262 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -31,8 +31,7 @@ from slack_sdk import WebClient

 # A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of
-# `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a
-# specific job. See below.
+# `additional_files`.
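+# The values also double as the human-readable test names used in the Slack report,
+# e.g. via `test_name = job_to_test_map[job_name]` further below.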
 job_to_test_map = {
     "run_models_gpu": "Models",
     "run_trainer_and_fsdp_gpu": "Trainer & FSDP",
@@ -40,6 +39,18 @@ job_to_test_map = {
     "run_pipelines_tf_gpu": "TensorFlow pipelines",
     "run_examples_gpu": "Examples directory",
     "run_torch_cuda_extensions_gpu": "DeepSpeed",
+    "run_quantization_torch_gpu": "Quantization",
+}
+
+# The values are used as the file names where to save the corresponding CI job results.
+test_to_result_name = {
+    "Models": "model",
+    "Trainer & FSDP": "trainer_and_fsdp",
+    "PyTorch pipelines": "torch_pipeline",
+    "TensorFlow pipelines": "tf_pipeline",
+    "Examples directory": "example",
+    "DeepSpeed": "deepspeed",
+    "Quantization": "quantization",
 }

 NON_MODEL_TEST_MODULES = [
@@ -53,6 +64,8 @@ NON_MODEL_TEST_MODULES = [
     "sagemaker",
     "trainer",
     "utils",
+    "fsdp",
+    "quantization",
 ]

@@ -221,7 +234,6 @@ class Message:
                 "type": "plain_text",
                 "text": (
                     f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
-                    f"Number of model failures: {self.n_model_failures}.\n"
                     f"The suite ran in {self.time}."
                 ),
                 "emoji": True,
@@ -276,6 +288,10 @@ class Message:

     @property
     def category_failures(self) -> Dict:
+        if job_name != "run_models_gpu":
+            category_failures_report = ""
+            return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
+
         model_failures = [v["failed"] for v in self.model_results.values()]

         category_failures = {}
@@ -301,7 +317,7 @@
         header = "Single | Multi | Category\n"
         category_failures_report = prepare_reports(
-            title="The following modeling categories had failures", header=header, reports=individual_reports
+            title="The following categories had failures", header=header, reports=individual_reports
         )

         return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
@@ -355,25 +371,40 @@ class Message:
         }

         for k, v in self.model_results.items():
+            # The keys in `model_results` may contain things like `models_vit` or `quantization_autoawq`
+            # Remove the prefix to make the report cleaner.
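+            # (Illustrative examples: "models_vit" -> "vit", "quantization_autoawq" -> "autoawq".)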
+            k = k.replace("models_", "").replace("quantization_", "")
             if k in NON_MODEL_TEST_MODULES:
-                pass
+                continue

             if sum(per_model_sum(v).values()):
                 dict_failed = dict(v["failed"])
-                pytorch_specific_failures = dict_failed.pop("PyTorch")
-                tensorflow_specific_failures = dict_failed.pop("TensorFlow")
-                other_failures = dicts_to_sum(dict_failed.values())

-                failures[k] = {
-                    "PyTorch": pytorch_specific_failures,
-                    "TensorFlow": tensorflow_specific_failures,
-                    "other": other_failures,
-                }
+                # Model job has a special form for reporting
+                if job_name == "run_models_gpu":
+                    pytorch_specific_failures = dict_failed.pop("PyTorch")
+                    tensorflow_specific_failures = dict_failed.pop("TensorFlow")
+                    other_failures = dicts_to_sum(dict_failed.values())
+
+                    failures[k] = {
+                        "PyTorch": pytorch_specific_failures,
+                        "TensorFlow": tensorflow_specific_failures,
+                        "other": other_failures,
+                    }
+
+                else:
+                    test_name = job_to_test_map[job_name]
+                    specific_failures = dict_failed.pop(test_name)
+                    failures[k] = {
+                        test_name: specific_failures,
+                    }

         model_reports = []
         other_module_reports = []

         for key, value in non_model_failures.items():
+            key = key.replace("models_", "").replace("quantization_", "")
+
             if key in NON_MODEL_TEST_MODULES:
                 device_report = self.get_device_report(value)

@@ -386,44 +417,60 @@ class Message:
                 other_module_reports.append(report)

         for key, value in failures.items():
-            device_report_values = [
-                value["PyTorch"]["single"],
-                value["PyTorch"]["multi"],
-                value["TensorFlow"]["single"],
-                value["TensorFlow"]["multi"],
-                sum(value["other"].values()),
-            ]
+            # Model job has a special form for reporting
+            if job_name == "run_models_gpu":
+                device_report_values = [
+                    value["PyTorch"]["single"],
+                    value["PyTorch"]["multi"],
+                    value["TensorFlow"]["single"],
+                    value["TensorFlow"]["multi"],
+                    sum(value["other"].values()),
+                ]
+
+            else:
+                test_name = job_to_test_map[job_name]
+                device_report_values = [
+                    value[test_name]["single"],
+                    value[test_name]["multi"],
+                ]

             if sum(device_report_values):
-                device_report = " | ".join([str(x).rjust(9) for x in device_report_values]) + " | "
+                # This is related to `model_header` below
+                rjust_width = 9 if job_name == "run_models_gpu" else 6
+                device_report = " | ".join([str(x).rjust(rjust_width) for x in device_report_values]) + " | "
                 report = f"{device_report}{key}"
                 model_reports.append(report)

         # (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
-        model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
+        if job_name == "run_models_gpu":
+            model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
+        else:
+            model_header = "Single | Multi | Category\n"
+
+        # Used when calling `prepare_reports` below to prepare the `title` argument
+        label = test_to_result_name[job_to_test_map[job_name]]
+
         sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
         model_failures_report = prepare_reports(
-            title="These following model modules had failures", header=model_header, reports=sorted_model_reports
+            title=f"These following {label} modules had failures", header=model_header, reports=sorted_model_reports
         )

         module_header = "Single | Multi | Category\n"
         sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
         module_failures_report = prepare_reports(
-            title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
+            title=f"The following {label} modules had failures", header=module_header, reports=sorted_module_reports
         )

         # To be sent to Slack channels
-        model_failure_sections = [
-            {"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
-            {"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
-        ]
+        model_failure_sections = [{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}}]
+        model_failure_sections.append({"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}})

         # Save the complete (i.e. no truncation) failure tables (of the current workflow run)
         # (to be uploaded as artifacts)
         model_failures_report = prepare_reports(
-            title="These following model modules had failures",
+            title=f"These following {label} modules had failures",
             header=model_header,
             reports=sorted_model_reports,
             to_truncate=False,
@@ -433,7 +480,7 @@ class Message:
             fp.write(model_failures_report)

         module_failures_report = prepare_reports(
-            title="The following non-model modules had failures",
+            title=f"The following {label} modules had failures",
             header=module_header,
             reports=sorted_module_reports,
             to_truncate=False,
@@ -511,7 +558,10 @@ class Message:
             blocks.append(self.failures)

         if self.n_model_failures > 0:
-            blocks.append(self.category_failures)
+            block = self.category_failures
+            if block["text"]["text"]:
+                blocks.append(block)
+
             for block in self.model_failures:
                 if block["text"]["text"]:
                     blocks.append(block)
@@ -565,7 +615,7 @@ class Message:
                 pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>"
                 items = re.findall(pattern, line)
             elif "tests/" in line:
-                if "tests/models/" in line:
+                if "tests/models/" in line or "tests/quantization/" in line:
                     model = line.split("/")[2]
                 else:
                     model = line.split("/")[1]
@@ -609,7 +659,7 @@ class Message:
                 "text": {
                     "type": "mrkdwn",
                     # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment.
-                    "text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to Nvidia CI: )",
+                    "text": f"*There are {nb_new_failed_tests} failed tests unique to {'this run' if not is_amd_daily_ci_workflow else 'AMD'}*\n\n(compared to Nvidia CI: )",
                 },
                 "accessory": {
                     "type": "button",
@@ -1058,13 +1108,24 @@ if __name__ == "__main__":
     # In our usage in `.github/workflows/slack-report.yml`, we always pass an argument when calling this script.
     # The argument could be an empty string `""` if a job doesn't depend on the job `setup`.
     if arguments[0] == "":
-        models = []
+        job_matrix = []
     else:
-        model_list_as_str = arguments[0]
+        job_matrix_as_str = arguments[0]
         try:
-            folder_slices = ast.literal_eval(model_list_as_str)
-            # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
-            models = [x.replace("models/", "models_") for folders in folder_slices for x in folders]
+            folder_slices = ast.literal_eval(job_matrix_as_str)
+            if len(folder_slices) > 0:
+                if isinstance(folder_slices[0], list):
+                    # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
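+                    # Illustrative (hypothetical) input/output:
+                    #   [["models/bert", "quantization/bnb"]] -> ["models_bert", "quantization_bnb"]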
+                    job_matrix = [
+                        x.replace("models/", "models_").replace("quantization/", "quantization_")
+                        for folders in folder_slices
+                        for x in folders
+                    ]
+                elif isinstance(folder_slices[0], str):
+                    job_matrix = [
+                        x.replace("models/", "models_").replace("quantization/", "quantization_")
+                        for x in folder_slices
+                    ]
         except Exception:
             Message.error_out(title, ci_title)
             raise ValueError("Errored out.")
@@ -1084,7 +1145,7 @@ if __name__ == "__main__":

     available_artifacts = retrieve_available_artifacts()

-    modeling_categories = [
+    test_categories = [
         "PyTorch",
         "TensorFlow",
         "Flax",
@@ -1093,35 +1154,34 @@ if __name__ == "__main__":
         "Trainer",
         "ONNX",
         "Auto",
+        "Quantization",
         "Unclassified",
     ]

     job_name = os.getenv("CI_TEST_JOB")
-    report_name_prefix = "run_models_gpu"
-    if job_name == "run_trainer_and_fsdp_gpu":
-        report_name_prefix = job_name
+    report_name_prefix = job_name

     # This dict will contain all the information relative to each model:
     # - Failures: the total, as well as the number of failures per-category defined above
     # - Success: total
     # - Time spent: as a comma-separated list of elapsed time
     # - Failures: as a line-break separated list of errors
-    model_results = {
-        model: {
-            "failed": {m: {"unclassified": 0, "single": 0, "multi": 0} for m in modeling_categories},
+    matrix_job_results = {
+        matrix_name: {
+            "failed": {m: {"unclassified": 0, "single": 0, "multi": 0} for m in test_categories},
             "success": 0,
             "time_spent": "",
             "failures": {},
             "job_link": {},
         }
-        for model in models
-        if f"{report_name_prefix}_{model}_test_reports" in available_artifacts
+        for matrix_name in job_matrix
+        if f"{report_name_prefix}_{matrix_name}_test_reports" in available_artifacts
     }

     unclassified_model_failures = []

-    for model in model_results.keys():
-        for artifact_path_dict in available_artifacts[f"{report_name_prefix}_{model}_test_reports"].paths:
+    for matrix_name in matrix_job_results.keys():
+        for artifact_path_dict in available_artifacts[f"{report_name_prefix}_{matrix_name}_test_reports"].paths:
             path = artifact_path_dict["path"]
             artifact_gpu = artifact_path_dict["gpu"]
@@ -1133,13 +1193,14 @@ if __name__ == "__main__":
             if "stats" in artifact:
                 # Link to the GitHub Action job
                 job = artifact_name_to_job_map[path]
-                model_results[model]["job_link"][artifact_gpu] = job["html_url"]
+                matrix_job_results[matrix_name]["job_link"][artifact_gpu] = job["html_url"]
                 failed, success, time_spent = handle_test_results(artifact["stats"])
-                model_results[model]["success"] += success
-                model_results[model]["time_spent"] += time_spent[1:-1] + ", "
+                matrix_job_results[matrix_name]["success"] += success
+                matrix_job_results[matrix_name]["time_spent"] += time_spent[1:-1] + ", "

                 stacktraces = handle_stacktraces(artifact["failures_line"])

+                # TODO: ???
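+                # Each line of pytest's short summary that starts with `FAILED ` names one failing test,
+                # e.g. (illustrative) `FAILED tests/models/vit/test_modeling_vit.py::ViTModelTest::test_forward`.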
                 for line in artifact["summary_short"].split("\n"):
                     if line.startswith("FAILED "):
                         # Avoid the extra `FAILED` entry given by `run_test_using_subprocess` causing issue when calling
@@ -1150,38 +1211,45 @@ if __name__ == "__main__":
                         line = line[len("FAILED ") :]
                         line = line.split()[0].replace("\n", "")

-                        if artifact_gpu not in model_results[model]["failures"]:
-                            model_results[model]["failures"][artifact_gpu] = []
+                        if artifact_gpu not in matrix_job_results[matrix_name]["failures"]:
+                            matrix_job_results[matrix_name]["failures"][artifact_gpu] = []

                         trace = pop_default(stacktraces, 0, "Cannot retrieve error message.")
-                        model_results[model]["failures"][artifact_gpu].append({"line": line, "trace": trace})
+                        matrix_job_results[matrix_name]["failures"][artifact_gpu].append(
+                            {"line": line, "trace": trace}
+                        )

-                        if re.search("test_modeling_tf_", line):
-                            model_results[model]["failed"]["TensorFlow"][artifact_gpu] += 1
+                        # TODO: How to deal with this
+
+                        if re.search("tests/quantization", line):
+                            matrix_job_results[matrix_name]["failed"]["Quantization"][artifact_gpu] += 1
+
+                        elif re.search("test_modeling_tf_", line):
+                            matrix_job_results[matrix_name]["failed"]["TensorFlow"][artifact_gpu] += 1

                         elif re.search("test_modeling_flax_", line):
-                            model_results[model]["failed"]["Flax"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Flax"][artifact_gpu] += 1

                         elif re.search("test_modeling", line):
-                            model_results[model]["failed"]["PyTorch"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["PyTorch"][artifact_gpu] += 1

                         elif re.search("test_tokenization", line):
-                            model_results[model]["failed"]["Tokenizers"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Tokenizers"][artifact_gpu] += 1

                         elif re.search("test_pipelines", line):
-                            model_results[model]["failed"]["Pipelines"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Pipelines"][artifact_gpu] += 1

                         elif re.search("test_trainer", line):
-                            model_results[model]["failed"]["Trainer"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Trainer"][artifact_gpu] += 1

                         elif re.search("onnx", line):
-                            model_results[model]["failed"]["ONNX"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["ONNX"][artifact_gpu] += 1

                         elif re.search("auto", line):
-                            model_results[model]["failed"]["Auto"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Auto"][artifact_gpu] += 1

                         else:
-                            model_results[model]["failed"]["Unclassified"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Unclassified"][artifact_gpu] += 1
                             unclassified_model_failures.append(line)

     # Additional runs
@@ -1315,20 +1383,10 @@ if __name__ == "__main__":
         if "workflow_run" in event_payload:
             is_scheduled_ci_run = event_payload["workflow_run"]["event"] == "schedule"

-    # The values are used as the file names where to save the corresponding CI job results.
-    test_to_result_name = {
-        "Models": "model",
-        "Trainer & FSDP": "trainer_and_fsdp",
-        "PyTorch pipelines": "torch_pipeline",
-        "TensorFlow pipelines": "tf_pipeline",
-        "Examples directory": "example",
-        "DeepSpeed": "deepspeed",
-    }
-
     test_name_and_result_pairs = []
-    if len(model_results) > 0:
+    if len(matrix_job_results) > 0:
         test_name = job_to_test_map[job_name]
-        test_name_and_result_pairs.append((test_name, model_results))
+        test_name_and_result_pairs.append((test_name, matrix_job_results))

     for test_name, result in additional_results.items():
         test_name_and_result_pairs.append((test_name, result))
@@ -1346,8 +1404,8 @@ if __name__ == "__main__":
     )

     # Let's create a file contain job --> job link
-    if len(model_results) > 0:
-        target_results = model_results
+    if len(matrix_job_results) > 0:
+        target_results = matrix_job_results
     else:
         target_results = additional_results[job_to_test_map[job_name]]

@@ -1360,6 +1418,8 @@ if __name__ == "__main__":
     for job, job_result in sorted_dict:
         if job.startswith("models_"):
             job = job[len("models_") :]
+        elif job.startswith("quantization_"):
+            job = job[len("quantization_") :]
         job_links[job] = job_result["job_link"]

     with open(f"ci_results_{job_name}/job_links.json", "w", encoding="UTF-8") as fp:
@@ -1424,7 +1484,7 @@ if __name__ == "__main__":
     message = Message(
         title,
         ci_title,
-        model_results,
+        matrix_job_results,
         additional_results,
         selected_warnings=selected_warnings,
         prev_ci_artifacts=prev_ci_artifacts,
diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py
deleted file mode 100644
index b533a7a9cf1..00000000000
--- a/utils/notification_service_quantization.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ast
-import json
-import os
-import sys
-import time
-from typing import Dict
-
-from get_ci_error_statistics import get_jobs
-from get_previous_daily_ci import get_last_daily_ci_run
-from huggingface_hub import HfApi
-from notification_service import (
-    Message,
-    handle_stacktraces,
-    handle_test_results,
-    prepare_reports,
-    retrieve_artifact,
-    retrieve_available_artifacts,
-)
-from slack_sdk import WebClient
-
-
-api = HfApi()
-client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
-
-
-class QuantizationMessage(Message):
-    def __init__(
-        self,
-        title: str,
-        results: Dict,
-    ):
-        self.title = title
-
-        # Failures and success of the modeling tests
-        self.n_success = sum(r["success"] for r in results.values())
-        self.single_gpu_failures = sum(r["failed"]["single"] for r in results.values())
-        self.multi_gpu_failures = sum(r["failed"]["multi"] for r in results.values())
-        self.n_failures = self.single_gpu_failures + self.multi_gpu_failures
-
-        self.n_tests = self.n_failures + self.n_success
-        self.results = results
-        self.thread_ts = None
-
-    @property
-    def payload(self) -> str:
-        blocks = [self.header]
-
-        if self.n_failures > 0:
-            blocks.append(self.failures_overwiew)
-            blocks.append(self.failures_detailed)
-
-        if self.n_failures == 0:
-            blocks.append(self.no_failures)
-
-        return json.dumps(blocks)
-
-    @property
-    def time(self) -> str:
-        all_results = self.results.values()
-        time_spent = []
-        for r in all_results:
-            if len(r["time_spent"]):
-                time_spent.extend([x for x in r["time_spent"].split(", ") if len(x.strip())])
-        total_secs = 0
-
-        for time in time_spent:
-            time_parts = time.split(":")
-
-            # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute.
-            if len(time_parts) == 1:
-                time_parts = [0, 0, time_parts[0]]
-
-            hours, minutes, seconds = int(time_parts[0]), int(time_parts[1]), float(time_parts[2])
-            total_secs += hours * 3600 + minutes * 60 + seconds
-
-        hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
-        return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"
-
-    @property
-    def failures_overwiew(self) -> Dict:
-        return {
-            "type": "section",
-            "text": {
-                "type": "plain_text",
-                "text": (
-                    f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
-                    f"The suite ran in {self.time}."
-                ),
-                "emoji": True,
-            },
-            "accessory": {
-                "type": "button",
-                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-            },
-        }
-
-    @property
-    def failures_detailed(self) -> Dict:
-        failures = {k: v["failed"] for k, v in self.results.items()}
-
-        individual_reports = []
-        for key, value in failures.items():
-            device_report = self.get_device_report(value)
-            if sum(value.values()):
-                report = f"{device_report}{key}"
-                individual_reports.append(report)
-
-        header = "Single | Multi | Category\n"
-        failures_report = prepare_reports(
-            title="The following quantization tests had failures", header=header, reports=individual_reports
-        )
-
-        return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}}
-
-    def post(self):
-        payload = self.payload
-        print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(payload)}))
-
-        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
-
-        self.thread_ts = client.chat_postMessage(
-            channel=SLACK_REPORT_CHANNEL_ID,
-            blocks=payload,
-            text=text,
-        )
-
-    def post_reply(self):
-        if self.thread_ts is None:
-            raise ValueError("Can only post reply if a post has been made.")
-
-        for job, job_result in self.results.items():
-            if len(job_result["failures"]):
-                for device, failures in job_result["failures"].items():
-                    blocks = self.get_reply_blocks(
-                        job,
-                        job_result,
-                        failures,
-                        device,
-                        text=f"Number of failures: {job_result['failed'][device]}",
-                    )
-
-                    print("Sending the following reply")
-                    print(json.dumps({"blocks": blocks}))
-
-                    client.chat_postMessage(
-                        channel="#transformers-ci-daily-quantization",
-                        text=f"Results for {job}",
-                        blocks=blocks,
-                        thread_ts=self.thread_ts["ts"],
-                    )
-                    time.sleep(1)
-
-
-if __name__ == "__main__":
-    setup_status = os.environ.get("SETUP_STATUS")
-    SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"]
-    setup_failed = True if setup_status is not None and setup_status != "success" else False
-
-    # This env. variable is set in workflow file (under the job `send_results`).
-    ci_event = os.environ["CI_EVENT"]
-
-    title = f"🤗 Results of the {ci_event} - {os.getenv('CI_TEST_JOB')}."
-
-    if setup_failed:
-        Message.error_out(
-            title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=setup_failed
-        )
-        exit(0)
-
-    arguments = sys.argv[1:][0]
-    try:
-        quantization_matrix = ast.literal_eval(arguments)
-        # Need to change from elements like `quantization/bnb` to `quantization_bnb` (the ones used as artifact names).
-        quantization_matrix = [x.replace("quantization/", "quantization_") for x in quantization_matrix]
-    except SyntaxError:
-        Message.error_out(title, ci_title="")
-        raise ValueError("Errored out.")
-
-    available_artifacts = retrieve_available_artifacts()
-
-    quantization_results = {
-        quant: {
-            "failed": {"single": 0, "multi": 0},
-            "success": 0,
-            "time_spent": "",
-            "failures": {},
-            "job_link": {},
-        }
-        for quant in quantization_matrix
-        if f"run_quantization_torch_gpu_{quant}_test_reports" in available_artifacts
-    }
-
-    github_actions_jobs = get_jobs(
-        workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
-    )
-    github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs}
-
-    artifact_name_to_job_map = {}
-    for job in github_actions_jobs:
-        for step in job["steps"]:
-            if step["name"].startswith("Test suite reports artifacts: "):
-                artifact_name = step["name"][len("Test suite reports artifacts: ") :]
-                artifact_name_to_job_map[artifact_name] = job
-                break
-
-    for quant in quantization_results.keys():
-        for artifact_path in available_artifacts[f"run_quantization_torch_gpu_{quant}_test_reports"].paths:
-            artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
-            if "stats" in artifact:
-                # Link to the GitHub Action job
-                job = artifact_name_to_job_map[artifact_path["path"]]
-                quantization_results[quant]["job_link"][artifact_path["gpu"]] = job["html_url"]
-                failed, success, time_spent = handle_test_results(artifact["stats"])
-                quantization_results[quant]["failed"][artifact_path["gpu"]] += failed
-                quantization_results[quant]["success"] += success
-                quantization_results[quant]["time_spent"] += time_spent[1:-1] + ", "
-
-                stacktraces = handle_stacktraces(artifact["failures_line"])
-
-                for line in artifact["summary_short"].split("\n"):
-                    if line.startswith("FAILED "):
-                        line = line[len("FAILED ") :]
-                        line = line.split()[0].replace("\n", "")
-
-                        if artifact_path["gpu"] not in quantization_results[quant]["failures"]:
-                            quantization_results[quant]["failures"][artifact_path["gpu"]] = []
-
-                        quantization_results[quant]["failures"][artifact_path["gpu"]].append(
-                            {"line": line, "trace": stacktraces.pop(0)}
-                        )
-
-    job_name = os.getenv("CI_TEST_JOB")
-
-    # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
-    report_repo_subfolder = ""
-    if os.getenv("GITHUB_EVENT_NAME") != "schedule":
-        report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}"
-        report_repo_subfolder = f"runs/{report_repo_subfolder}"
-
-    workflow_run = get_last_daily_ci_run(
-        token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID")
-    )
-    workflow_run_created_time = workflow_run["created_at"]
-    workflow_id = workflow_run["workflow_id"]
-
-    report_repo_folder = workflow_run_created_time.split("T")[0]
-
-    if report_repo_subfolder:
-        report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"
-
-    if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
-        os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
-
-    nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml"
-    is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow)
-    is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule"
-
-    with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp:
-        json.dump(quantization_results, fp, indent=4, ensure_ascii=False)
-
-    report_repo_id = os.getenv("REPORT_REPO_ID")
-
-    # upload results to Hub dataset (only for the scheduled daily CI run on `main`)
-    api.upload_file(
-        path_or_fileobj=f"ci_results_{job_name}/quantization_results.json",
-        path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/quantization_results.json",
-        repo_id=report_repo_id,
-        repo_type="dataset",
-        token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
-    )
-
-    message = QuantizationMessage(
-        title,
-        results=quantization_results,
-    )
-
-    message.post()
-    message.post_reply()