Use one utils/notification_service.py (#38379)

* step 1
* step 2
* step 3
* step 4
* step 5

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>

This commit is contained in:
parent 98328fd9a1
commit eb74cf977b
.github/workflows/self-scheduled.yml (vendored, 3 changed lines)
```diff
@@ -593,8 +593,7 @@ jobs:
     secrets: inherit

   check_new_failures:
-    # TODO: work on `run_quantization_torch_gpu`
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job != 'run_quantization_torch_gpu' && needs.send_results.result == 'success' }}
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
     name: Check new failures
     needs: send_results
     uses: ./.github/workflows/check_failed_tests.yml
```
.github/workflows/slack-report.yml (vendored, 37 changed lines)
```diff
@@ -58,7 +58,7 @@ jobs:
         fi

       - name: Send message to Slack
-        if: ${{ inputs.job != 'run_quantization_torch_gpu' }}
+        shell: bash
         env:
           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
           CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@@ -79,7 +79,11 @@ jobs:
           pip install huggingface_hub
           pip install slack_sdk
           pip show slack_sdk
-          python utils/notification_service.py "${{ inputs.folder_slices }}"
+          if [ "${{ inputs.quantization_matrix }}" != "" ]; then
+            python utils/notification_service.py "${{ inputs.quantization_matrix }}"
+          else
+            python utils/notification_service.py "${{ inputs.folder_slices }}"
+          fi

       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
       - name: Failure table artifacts
```
```diff
@@ -87,32 +91,3 @@ jobs:
         with:
           name: ci_results_${{ inputs.job }}
           path: ci_results_${{ inputs.job }}
-
-      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-      - name: Send message to Slack for quantization workflow
-        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
-          CI_EVENT: ${{ inputs.ci_event }}
-          CI_SHA: ${{ github.sha }}
-          CI_TEST_JOB: ${{ inputs.job }}
-          SETUP_STATUS: ${{ inputs.setup_status }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-        # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
-        # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
-        run: |
-          pip install huggingface_hub
-          pip install slack_sdk
-          pip show slack_sdk
-          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
-
-      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-      - name: Failure table artifacts
-        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ci_results_${{ inputs.job }}
-          path: ci_results_${{ inputs.job }}
```
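The comment in the removed step explains why matrix entries must be renamed before they can be matched against artifact names. A minimal sketch of that normalization (the helper name `to_artifact_name` is hypothetical; the commit itself inlines the `str.replace` calls, as the hunks below show):

```python
def to_artifact_name(matrix_entry: str) -> str:
    # Hypothetical helper: matrix entries look like `models/bert` or
    # `quantization/bnb`, while artifact names use `_` instead of `/`.
    return matrix_entry.replace("models/", "models_").replace("quantization/", "quantization_")


assert to_artifact_name("models/bert") == "models_bert"
assert to_artifact_name("quantization/bnb") == "quantization_bnb"
```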
utils/notification_service.py

```diff
@@ -31,8 +31,7 @@ from slack_sdk import WebClient


 # A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of
-# `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a
-# specific job. See below.
+# `additional_files`.
 job_to_test_map = {
     "run_models_gpu": "Models",
     "run_trainer_and_fsdp_gpu": "Trainer & FSDP",
```
```diff
@@ -40,6 +39,18 @@ job_to_test_map = {
     "run_pipelines_tf_gpu": "TensorFlow pipelines",
     "run_examples_gpu": "Examples directory",
     "run_torch_cuda_extensions_gpu": "DeepSpeed",
+    "run_quantization_torch_gpu": "Quantization",
 }

+# The values are used as the file names where to save the corresponding CI job results.
+test_to_result_name = {
+    "Models": "model",
+    "Trainer & FSDP": "trainer_and_fsdp",
+    "PyTorch pipelines": "torch_pipeline",
+    "TensorFlow pipelines": "tf_pipeline",
+    "Examples directory": "example",
+    "DeepSpeed": "deepspeed",
+    "Quantization": "quantization",
+}
+
 NON_MODEL_TEST_MODULES = [
```
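The two dictionaries are designed to be chained: `inputs.job` resolves to a display name via `job_to_test_map`, and that name resolves to a result-file stem via `test_to_result_name` (the diff uses exactly this chain later, as `label = test_to_result_name[job_to_test_map[job_name]]`). A minimal sketch; the `quantization_results.json` file name matches the one written by the deleted script at the end of this page:

```python
job_to_test_map = {"run_quantization_torch_gpu": "Quantization"}
test_to_result_name = {"Quantization": "quantization"}

job_name = "run_quantization_torch_gpu"
test_name = job_to_test_map[job_name]         # "Quantization"
result_stem = test_to_result_name[test_name]  # "quantization"
print(f"ci_results_{job_name}/{result_stem}_results.json")
# ci_results_run_quantization_torch_gpu/quantization_results.json
```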
```diff
@@ -53,6 +64,8 @@ NON_MODEL_TEST_MODULES = [
     "sagemaker",
     "trainer",
     "utils",
+    "fsdp",
+    "quantization",
 ]


```
```diff
@@ -221,7 +234,6 @@ class Message:
                 "type": "plain_text",
                 "text": (
                     f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
-                    f"Number of model failures: {self.n_model_failures}.\n"
                     f"The suite ran in {self.time}."
                 ),
                 "emoji": True,
```
```diff
@@ -276,6 +288,10 @@ class Message:

     @property
     def category_failures(self) -> Dict:
+        if job_name != "run_models_gpu":
+            category_failures_report = ""
+            return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
+
         model_failures = [v["failed"] for v in self.model_results.values()]

         category_failures = {}
```
```diff
@@ -301,7 +317,7 @@ class Message:

         header = "Single | Multi | Category\n"
         category_failures_report = prepare_reports(
-            title="The following modeling categories had failures", header=header, reports=individual_reports
+            title="The following categories had failures", header=header, reports=individual_reports
         )

         return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
```
```diff
@@ -355,25 +371,40 @@ class Message:
         }

         for k, v in self.model_results.items():
+            # The keys in `model_results` may contain things like `models_vit` or `quantization_autoawq`
+            # Remove the prefix to make the report cleaner.
+            k = k.replace("models_", "").replace("quantization_", "")
             if k in NON_MODEL_TEST_MODULES:
-                pass
+                continue

             if sum(per_model_sum(v).values()):
                 dict_failed = dict(v["failed"])
-                pytorch_specific_failures = dict_failed.pop("PyTorch")
-                tensorflow_specific_failures = dict_failed.pop("TensorFlow")
-                other_failures = dicts_to_sum(dict_failed.values())
-
-                failures[k] = {
-                    "PyTorch": pytorch_specific_failures,
-                    "TensorFlow": tensorflow_specific_failures,
-                    "other": other_failures,
-                }
+                # Model job has a special form for reporting
+                if job_name == "run_models_gpu":
+                    pytorch_specific_failures = dict_failed.pop("PyTorch")
+                    tensorflow_specific_failures = dict_failed.pop("TensorFlow")
+                    other_failures = dicts_to_sum(dict_failed.values())
+
+                    failures[k] = {
+                        "PyTorch": pytorch_specific_failures,
+                        "TensorFlow": tensorflow_specific_failures,
+                        "other": other_failures,
+                    }
+
+                else:
+                    test_name = job_to_test_map[job_name]
+                    specific_failures = dict_failed.pop(test_name)
+                    failures[k] = {
+                        test_name: specific_failures,
+                    }

         model_reports = []
         other_module_reports = []

         for key, value in non_model_failures.items():
             key = key.replace("models_", "").replace("quantization_", "")

             if key in NON_MODEL_TEST_MODULES:
                 device_report = self.get_device_report(value)

```
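`dicts_to_sum` is not shown in this diff. Judging from how its result is consumed below (`sum(value["other"].values())`), it performs an element-wise sum of per-device counter dicts; a minimal stand-in under that assumption:

```python
def dicts_to_sum(dicts):
    # Assumed behavior: element-wise sum of dicts like {"single": 1, "multi": 0}.
    total = {}
    for d in dicts:
        for key, count in d.items():
            total[key] = total.get(key, 0) + count
    return total


print(dicts_to_sum([{"single": 1, "multi": 0}, {"single": 2, "multi": 3}]))
# {'single': 3, 'multi': 3}
```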
```diff
@@ -386,44 +417,60 @@ class Message:
                 other_module_reports.append(report)

         for key, value in failures.items():
-            device_report_values = [
-                value["PyTorch"]["single"],
-                value["PyTorch"]["multi"],
-                value["TensorFlow"]["single"],
-                value["TensorFlow"]["multi"],
-                sum(value["other"].values()),
-            ]
+            # Model job has a special form for reporting
+            if job_name == "run_models_gpu":
+                device_report_values = [
+                    value["PyTorch"]["single"],
+                    value["PyTorch"]["multi"],
+                    value["TensorFlow"]["single"],
+                    value["TensorFlow"]["multi"],
+                    sum(value["other"].values()),
+                ]
+
+            else:
+                test_name = job_to_test_map[job_name]
+                device_report_values = [
+                    value[test_name]["single"],
+                    value[test_name]["multi"],
+                ]

             if sum(device_report_values):
-                device_report = " | ".join([str(x).rjust(9) for x in device_report_values]) + " | "
+                # This is related to `model_header` below
+                rjust_width = 9 if job_name == "run_models_gpu" else 6
+                device_report = " | ".join([str(x).rjust(rjust_width) for x in device_report_values]) + " | "
                 report = f"{device_report}{key}"

                 model_reports.append(report)

         # (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
-        model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
+        if job_name == "run_models_gpu":
+            model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
+        else:
+            model_header = "Single | Multi | Category\n"
+
+        # Used when calling `prepare_reports` below to prepare the `title` argument
+        label = test_to_result_name[job_to_test_map[job_name]]

         sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
         model_failures_report = prepare_reports(
-            title="These following model modules had failures", header=model_header, reports=sorted_model_reports
+            title=f"These following {label} modules had failures", header=model_header, reports=sorted_model_reports
         )

         module_header = "Single | Multi | Category\n"
         sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
         module_failures_report = prepare_reports(
-            title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
+            title=f"The following {label} modules had failures", header=module_header, reports=sorted_module_reports
         )

         # To be sent to Slack channels
-        model_failure_sections = [
-            {"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
-            {"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
-        ]
+        model_failure_sections = [{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}}]
+        model_failure_sections.append({"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}})

         # Save the complete (i.e. no truncation) failure tables (of the current workflow run)
         # (to be uploaded as artifacts)

         model_failures_report = prepare_reports(
-            title="These following model modules had failures",
+            title=f"These following {label} modules had failures",
             header=model_header,
             reports=sorted_model_reports,
             to_truncate=False,
```
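The `rjust_width` values match the column-header labels character for character, so the counts line up under the header text: `"Single PT"` is nine characters wide and `"Single"` is six. A quick illustration with the non-model header:

```python
header = "Single | Multi | Category"
row = " | ".join(str(x).rjust(6) for x in [2, 0]) + " | " + "autoawq"
print(header)
print(row)
# Single | Multi | Category
#      2 |      0 | autoawq
```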
```diff
@@ -433,7 +480,7 @@ class Message:
             fp.write(model_failures_report)

         module_failures_report = prepare_reports(
-            title="The following non-model modules had failures",
+            title=f"The following {label} modules had failures",
             header=module_header,
             reports=sorted_module_reports,
             to_truncate=False,
```
```diff
@@ -511,7 +558,10 @@ class Message:
             blocks.append(self.failures)

-        if self.n_model_failures > 0:
-            blocks.append(self.category_failures)
+        block = self.category_failures
+        if block["text"]["text"]:
+            blocks.append(block)

         for block in self.model_failures:
             if block["text"]["text"]:
                 blocks.append(block)
```
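Since `category_failures` now returns a section with empty text for non-model jobs, the payload builder keys off the text itself rather than `n_model_failures`, mirroring the filter already applied to the `model_failures` blocks. The pattern, in isolation:

```python
candidate_blocks = [
    {"type": "section", "text": {"type": "mrkdwn", "text": ""}},             # empty: skipped
    {"type": "section", "text": {"type": "mrkdwn", "text": "2 | 0 | bnb"}},  # kept
]

blocks = [block for block in candidate_blocks if block["text"]["text"]]
print(len(blocks))  # 1
```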
```diff
@@ -565,7 +615,7 @@ class Message:
                 pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>"
                 items = re.findall(pattern, line)
             elif "tests/" in line:
-                if "tests/models/" in line:
+                if "tests/models/" in line or "tests/quantization/" in line:
                     model = line.split("/")[2]
                 else:
                     model = line.split("/")[1]
```
```diff
@@ -609,7 +659,7 @@ class Message:
                 "text": {
                     "type": "mrkdwn",
                     # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment.
-                    "text": f"*There are {nb_new_failed_tests} failed tests unique to this run*\n\n(compared to Nvidia CI: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
+                    "text": f"*There are {nb_new_failed_tests} failed tests unique to {'this run' if not is_amd_daily_ci_workflow else 'AMD'}*\n\n(compared to Nvidia CI: <https://github.com/huggingface/transformers/actions/runs/{prev_workflow_run_id}|{prev_workflow_run_id}>)",
                 },
                 "accessory": {
                     "type": "button",
```
```diff
@@ -1058,13 +1108,24 @@ if __name__ == "__main__":
     # In our usage in `.github/workflows/slack-report.yml`, we always pass an argument when calling this script.
     # The argument could be an empty string `""` if a job doesn't depend on the job `setup`.
     if arguments[0] == "":
-        models = []
+        job_matrix = []
     else:
-        model_list_as_str = arguments[0]
+        job_matrix_as_str = arguments[0]
         try:
-            folder_slices = ast.literal_eval(model_list_as_str)
-            # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
-            models = [x.replace("models/", "models_") for folders in folder_slices for x in folders]
+            folder_slices = ast.literal_eval(job_matrix_as_str)
+            if len(folder_slices) > 0:
+                if isinstance(folder_slices[0], list):
+                    # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
+                    job_matrix = [
+                        x.replace("models/", "models_").replace("quantization/", "quantization_")
+                        for folders in folder_slices
+                        for x in folders
+                    ]
+                elif isinstance(folder_slices[0], str):
+                    job_matrix = [
+                        x.replace("models/", "models_").replace("quantization/", "quantization_")
+                        for x in folder_slices
+                    ]
         except Exception:
             Message.error_out(title, ci_title)
             raise ValueError("Errored out.")
```
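The single positional argument now carries either the sliced model matrix (a list of lists of folders) or the flat quantization matrix (a list of strings), and the script distinguishes the two by the type of the first element. A sketch of both inputs:

```python
import ast

for raw in (
    '[["models/bert", "models/gpt2"], ["models/vit"]]',  # folder_slices from `setup`
    '["quantization/bnb", "quantization/autoawq"]',      # quantization matrix
):
    parsed = ast.literal_eval(raw)
    if parsed and isinstance(parsed[0], list):
        entries = [x for folders in parsed for x in folders]
    else:
        entries = list(parsed)
    print([x.replace("models/", "models_").replace("quantization/", "quantization_") for x in entries])
# ['models_bert', 'models_gpt2', 'models_vit']
# ['quantization_bnb', 'quantization_autoawq']
```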
```diff
@@ -1084,7 +1145,7 @@ if __name__ == "__main__":

     available_artifacts = retrieve_available_artifacts()

-    modeling_categories = [
+    test_categories = [
         "PyTorch",
         "TensorFlow",
         "Flax",
```
```diff
@@ -1093,35 +1154,34 @@ if __name__ == "__main__":
         "Trainer",
         "ONNX",
         "Auto",
+        "Quantization",
         "Unclassified",
     ]

     job_name = os.getenv("CI_TEST_JOB")
-    report_name_prefix = "run_models_gpu"
-    if job_name == "run_trainer_and_fsdp_gpu":
-        report_name_prefix = job_name
+    report_name_prefix = job_name

     # This dict will contain all the information relative to each model:
     # - Failures: the total, as well as the number of failures per-category defined above
     # - Success: total
     # - Time spent: as a comma-separated list of elapsed time
     # - Failures: as a line-break separated list of errors
-    model_results = {
-        model: {
-            "failed": {m: {"unclassified": 0, "single": 0, "multi": 0} for m in modeling_categories},
+    matrix_job_results = {
+        matrix_name: {
+            "failed": {m: {"unclassified": 0, "single": 0, "multi": 0} for m in test_categories},
             "success": 0,
             "time_spent": "",
             "failures": {},
             "job_link": {},
         }
-        for model in models
-        if f"{report_name_prefix}_{model}_test_reports" in available_artifacts
+        for matrix_name in job_matrix
+        if f"{report_name_prefix}_{matrix_name}_test_reports" in available_artifacts
     }

     unclassified_model_failures = []

-    for model in model_results.keys():
-        for artifact_path_dict in available_artifacts[f"{report_name_prefix}_{model}_test_reports"].paths:
+    for matrix_name in matrix_job_results.keys():
+        for artifact_path_dict in available_artifacts[f"{report_name_prefix}_{matrix_name}_test_reports"].paths:
             path = artifact_path_dict["path"]
             artifact_gpu = artifact_path_dict["gpu"]

```
```diff
@@ -1133,13 +1193,14 @@ if __name__ == "__main__":
             if "stats" in artifact:
                 # Link to the GitHub Action job
                 job = artifact_name_to_job_map[path]
-                model_results[model]["job_link"][artifact_gpu] = job["html_url"]
+                matrix_job_results[matrix_name]["job_link"][artifact_gpu] = job["html_url"]
                 failed, success, time_spent = handle_test_results(artifact["stats"])
-                model_results[model]["success"] += success
-                model_results[model]["time_spent"] += time_spent[1:-1] + ", "
+                matrix_job_results[matrix_name]["success"] += success
+                matrix_job_results[matrix_name]["time_spent"] += time_spent[1:-1] + ", "

                 stacktraces = handle_stacktraces(artifact["failures_line"])

+                # TODO: ???
                 for line in artifact["summary_short"].split("\n"):
                     if line.startswith("FAILED "):
                         # Avoid the extra `FAILED` entry given by `run_test_using_subprocess` causing issue when calling
```
```diff
@@ -1150,38 +1211,45 @@ if __name__ == "__main__":
                         line = line[len("FAILED ") :]
                         line = line.split()[0].replace("\n", "")

-                        if artifact_gpu not in model_results[model]["failures"]:
-                            model_results[model]["failures"][artifact_gpu] = []
+                        if artifact_gpu not in matrix_job_results[matrix_name]["failures"]:
+                            matrix_job_results[matrix_name]["failures"][artifact_gpu] = []

                         trace = pop_default(stacktraces, 0, "Cannot retrieve error message.")
-                        model_results[model]["failures"][artifact_gpu].append({"line": line, "trace": trace})
+                        matrix_job_results[matrix_name]["failures"][artifact_gpu].append(
+                            {"line": line, "trace": trace}
+                        )

-                        if re.search("test_modeling_tf_", line):
-                            model_results[model]["failed"]["TensorFlow"][artifact_gpu] += 1
+                        # TODO: How to deal wit this
+
+                        if re.search("tests/quantization", line):
+                            matrix_job_results[matrix_name]["failed"]["Quantization"][artifact_gpu] += 1
+
+                        elif re.search("test_modeling_tf_", line):
+                            matrix_job_results[matrix_name]["failed"]["TensorFlow"][artifact_gpu] += 1

                         elif re.search("test_modeling_flax_", line):
-                            model_results[model]["failed"]["Flax"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Flax"][artifact_gpu] += 1

                         elif re.search("test_modeling", line):
-                            model_results[model]["failed"]["PyTorch"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["PyTorch"][artifact_gpu] += 1

                         elif re.search("test_tokenization", line):
-                            model_results[model]["failed"]["Tokenizers"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Tokenizers"][artifact_gpu] += 1

                         elif re.search("test_pipelines", line):
-                            model_results[model]["failed"]["Pipelines"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Pipelines"][artifact_gpu] += 1

                         elif re.search("test_trainer", line):
-                            model_results[model]["failed"]["Trainer"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Trainer"][artifact_gpu] += 1

                         elif re.search("onnx", line):
-                            model_results[model]["failed"]["ONNX"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["ONNX"][artifact_gpu] += 1

                         elif re.search("auto", line):
-                            model_results[model]["failed"]["Auto"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Auto"][artifact_gpu] += 1

                         else:
-                            model_results[model]["failed"]["Unclassified"][artifact_gpu] += 1
+                            matrix_job_results[matrix_name]["failed"]["Unclassified"][artifact_gpu] += 1
                             unclassified_model_failures.append(line)

     # Additional runs
```
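Order matters in the extended classification chain: `tests/quantization` is checked before the `test_modeling*` patterns, so a quantization test whose path also matches a broader pattern is counted once, under Quantization. Condensed to a first-match dispatch:

```python
import re

CATEGORY_PATTERNS = [
    ("Quantization", r"tests/quantization"),
    ("TensorFlow", r"test_modeling_tf_"),
    ("Flax", r"test_modeling_flax_"),
    ("PyTorch", r"test_modeling"),
]


def classify(line: str) -> str:
    for category, pattern in CATEGORY_PATTERNS:
        if re.search(pattern, line):
            return category
    return "Unclassified"


print(classify("tests/quantization/bnb/test_mixed_int8.py::Bnb8bitTest"))  # Quantization
print(classify("tests/models/bert/test_modeling_bert.py::BertModelTest"))  # PyTorch
```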
```diff
@@ -1315,20 +1383,10 @@ if __name__ == "__main__":
     if "workflow_run" in event_payload:
         is_scheduled_ci_run = event_payload["workflow_run"]["event"] == "schedule"

-    # The values are used as the file names where to save the corresponding CI job results.
-    test_to_result_name = {
-        "Models": "model",
-        "Trainer & FSDP": "trainer_and_fsdp",
-        "PyTorch pipelines": "torch_pipeline",
-        "TensorFlow pipelines": "tf_pipeline",
-        "Examples directory": "example",
-        "DeepSpeed": "deepspeed",
-    }
-
     test_name_and_result_pairs = []
-    if len(model_results) > 0:
+    if len(matrix_job_results) > 0:
         test_name = job_to_test_map[job_name]
-        test_name_and_result_pairs.append((test_name, model_results))
+        test_name_and_result_pairs.append((test_name, matrix_job_results))

     for test_name, result in additional_results.items():
         test_name_and_result_pairs.append((test_name, result))
```
```diff
@@ -1346,8 +1404,8 @@ if __name__ == "__main__":
     )

     # Let's create a file contain job --> job link
-    if len(model_results) > 0:
-        target_results = model_results
+    if len(matrix_job_results) > 0:
+        target_results = matrix_job_results
     else:
         target_results = additional_results[job_to_test_map[job_name]]

```
```diff
@@ -1360,6 +1418,8 @@ if __name__ == "__main__":
     for job, job_result in sorted_dict:
         if job.startswith("models_"):
             job = job[len("models_") :]
+        elif job.startswith("quantization_"):
+            job = job[len("quantization_") :]
         job_links[job] = job_result["job_link"]

     with open(f"ci_results_{job_name}/job_links.json", "w", encoding="UTF-8") as fp:
```
```diff
@@ -1424,7 +1484,7 @@ if __name__ == "__main__":
     message = Message(
         title,
         ci_title,
-        model_results,
+        matrix_job_results,
         additional_results,
         selected_warnings=selected_warnings,
         prev_ci_artifacts=prev_ci_artifacts,
```
utils/notification_service_quantization.py @@ -1,294 +0,0 @@ (file deleted; its former contents follow)

```python
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import json
import os
import sys
import time
from typing import Dict

from get_ci_error_statistics import get_jobs
from get_previous_daily_ci import get_last_daily_ci_run
from huggingface_hub import HfApi
from notification_service import (
    Message,
    handle_stacktraces,
    handle_test_results,
    prepare_reports,
    retrieve_artifact,
    retrieve_available_artifacts,
)
from slack_sdk import WebClient


api = HfApi()
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])


class QuantizationMessage(Message):
    def __init__(
        self,
        title: str,
        results: Dict,
    ):
        self.title = title

        # Failures and success of the modeling tests
        self.n_success = sum(r["success"] for r in results.values())
        self.single_gpu_failures = sum(r["failed"]["single"] for r in results.values())
        self.multi_gpu_failures = sum(r["failed"]["multi"] for r in results.values())
        self.n_failures = self.single_gpu_failures + self.multi_gpu_failures

        self.n_tests = self.n_failures + self.n_success
        self.results = results
        self.thread_ts = None

    @property
    def payload(self) -> str:
        blocks = [self.header]

        if self.n_failures > 0:
            blocks.append(self.failures_overwiew)
            blocks.append(self.failures_detailed)

        if self.n_failures == 0:
            blocks.append(self.no_failures)

        return json.dumps(blocks)

    @property
    def time(self) -> str:
        all_results = self.results.values()
        time_spent = []
        for r in all_results:
            if len(r["time_spent"]):
                time_spent.extend([x for x in r["time_spent"].split(", ") if len(x.strip())])
        total_secs = 0

        for time in time_spent:
            time_parts = time.split(":")

            # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute.
            if len(time_parts) == 1:
                time_parts = [0, 0, time_parts[0]]

            hours, minutes, seconds = int(time_parts[0]), int(time_parts[1]), float(time_parts[2])
            total_secs += hours * 3600 + minutes * 60 + seconds

        hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
        return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"

    @property
    def failures_overwiew(self) -> Dict:
        return {
            "type": "section",
            "text": {
                "type": "plain_text",
                "text": (
                    f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
                    f"The suite ran in {self.time}."
                ),
                "emoji": True,
            },
            "accessory": {
                "type": "button",
                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
            },
        }

    @property
    def failures_detailed(self) -> Dict:
        failures = {k: v["failed"] for k, v in self.results.items()}

        individual_reports = []
        for key, value in failures.items():
            device_report = self.get_device_report(value)
            if sum(value.values()):
                report = f"{device_report}{key}"
                individual_reports.append(report)

        header = "Single | Multi | Category\n"
        failures_report = prepare_reports(
            title="The following quantization tests had failures", header=header, reports=individual_reports
        )

        return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}}

    def post(self):
        payload = self.payload
        print("Sending the following payload")
        print(json.dumps({"blocks": json.loads(payload)}))

        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."

        self.thread_ts = client.chat_postMessage(
            channel=SLACK_REPORT_CHANNEL_ID,
            blocks=payload,
            text=text,
        )

    def post_reply(self):
        if self.thread_ts is None:
            raise ValueError("Can only post reply if a post has been made.")

        for job, job_result in self.results.items():
            if len(job_result["failures"]):
                for device, failures in job_result["failures"].items():
                    blocks = self.get_reply_blocks(
                        job,
                        job_result,
                        failures,
                        device,
                        text=f"Number of failures: {job_result['failed'][device]}",
                    )

                    print("Sending the following reply")
                    print(json.dumps({"blocks": blocks}))

                    client.chat_postMessage(
                        channel="#transformers-ci-daily-quantization",
                        text=f"Results for {job}",
                        blocks=blocks,
                        thread_ts=self.thread_ts["ts"],
                    )
                    time.sleep(1)


if __name__ == "__main__":
    setup_status = os.environ.get("SETUP_STATUS")
    SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"]
    setup_failed = True if setup_status is not None and setup_status != "success" else False

    # This env. variable is set in workflow file (under the job `send_results`).
    ci_event = os.environ["CI_EVENT"]

    title = f"🤗 Results of the {ci_event} - {os.getenv('CI_TEST_JOB')}."

    if setup_failed:
        Message.error_out(
            title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=setup_failed
        )
        exit(0)

    arguments = sys.argv[1:][0]
    try:
        quantization_matrix = ast.literal_eval(arguments)
        # Need to change from elements like `quantization/bnb` to `quantization_bnb` (the ones used as artifact names).
        quantization_matrix = [x.replace("quantization/", "quantization_") for x in quantization_matrix]
    except SyntaxError:
        Message.error_out(title, ci_title="")
        raise ValueError("Errored out.")

    available_artifacts = retrieve_available_artifacts()

    quantization_results = {
        quant: {
            "failed": {"single": 0, "multi": 0},
            "success": 0,
            "time_spent": "",
            "failures": {},
            "job_link": {},
        }
        for quant in quantization_matrix
        if f"run_quantization_torch_gpu_{quant}_test_reports" in available_artifacts
    }

    github_actions_jobs = get_jobs(
        workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
    )
    github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs}

    artifact_name_to_job_map = {}
    for job in github_actions_jobs:
        for step in job["steps"]:
            if step["name"].startswith("Test suite reports artifacts: "):
                artifact_name = step["name"][len("Test suite reports artifacts: ") :]
                artifact_name_to_job_map[artifact_name] = job
                break

    for quant in quantization_results.keys():
        for artifact_path in available_artifacts[f"run_quantization_torch_gpu_{quant}_test_reports"].paths:
            artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
            if "stats" in artifact:
                # Link to the GitHub Action job
                job = artifact_name_to_job_map[artifact_path["path"]]
                quantization_results[quant]["job_link"][artifact_path["gpu"]] = job["html_url"]
                failed, success, time_spent = handle_test_results(artifact["stats"])
                quantization_results[quant]["failed"][artifact_path["gpu"]] += failed
                quantization_results[quant]["success"] += success
                quantization_results[quant]["time_spent"] += time_spent[1:-1] + ", "

                stacktraces = handle_stacktraces(artifact["failures_line"])

                for line in artifact["summary_short"].split("\n"):
                    if line.startswith("FAILED "):
                        line = line[len("FAILED ") :]
                        line = line.split()[0].replace("\n", "")

                        if artifact_path["gpu"] not in quantization_results[quant]["failures"]:
                            quantization_results[quant]["failures"][artifact_path["gpu"]] = []

                        quantization_results[quant]["failures"][artifact_path["gpu"]].append(
                            {"line": line, "trace": stacktraces.pop(0)}
                        )

    job_name = os.getenv("CI_TEST_JOB")

    # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder`
    report_repo_subfolder = ""
    if os.getenv("GITHUB_EVENT_NAME") != "schedule":
        report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}"
        report_repo_subfolder = f"runs/{report_repo_subfolder}"

    workflow_run = get_last_daily_ci_run(
        token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID")
    )
    workflow_run_created_time = workflow_run["created_at"]
    workflow_id = workflow_run["workflow_id"]

    report_repo_folder = workflow_run_created_time.split("T")[0]

    if report_repo_subfolder:
        report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}"

    if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
        os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))

    nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml"
    is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow)
    is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule"

    with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp:
        json.dump(quantization_results, fp, indent=4, ensure_ascii=False)

    report_repo_id = os.getenv("REPORT_REPO_ID")

    # upload results to Hub dataset (only for the scheduled daily CI run on `main`)
    api.upload_file(
        path_or_fileobj=f"ci_results_{job_name}/quantization_results.json",
        path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/quantization_results.json",
        repo_id=report_repo_id,
        repo_type="dataset",
        token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None),
    )

    message = QuantizationMessage(
        title,
        results=quantization_results,
    )

    message.post()
    message.post_reply()
```
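The `time` property in the deleted script above normalizes durations that pytest may report as `xx:xx:xx`, `.xx`, or `x.xx` seconds before summing them. The same aggregation, as a standalone function:

```python
def total_time(time_spent: list) -> str:
    total_secs = 0.0
    for entry in time_spent:
        parts = entry.split(":")
        if len(parts) == 1:  # bare seconds, e.g. "45.2" or ".45"
            parts = ["0", "0", parts[0]]
        hours, minutes, seconds = int(parts[0]), int(parts[1]), float(parts[2])
        total_secs += hours * 3600 + minutes * 60 + seconds
    hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
    return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"


print(total_time(["0:02:30", "45.2", "1:00:00"]))  # 1h3m15s
```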