mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-02 03:01:07 +06:00
Send trainer/fsdp/deepspeed CI job reports to a single channel (#37411)
* send trainer/fsdp/deepspeed channel
* update
* change name
* no .
* final

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
a2c2fb0108
commit
4f139f5a50
20
.github/workflows/model_jobs.yml
vendored
20
.github/workflows/model_jobs.yml
vendored
@ -18,6 +18,10 @@ on:
|
|||||||
docker:
|
docker:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
|
report_name_prefix:
|
||||||
|
required: false
|
||||||
|
default: run_models_gpu
|
||||||
|
type: string
|
||||||
|
|
||||||
env:
|
env:
|
||||||
HF_HOME: /mnt/cache
|
HF_HOME: /mnt/cache
|
||||||
@ -116,23 +120,23 @@ jobs:
|
|||||||
|
|
||||||
- name: Run all tests on GPU
|
- name: Run all tests on GPU
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||||
|
|
||||||
- name: Failure short reports
|
- name: Failure short reports
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||||
|
|
||||||
- name: Run test
|
- name: Run test
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||||
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||||
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||||
|
13
.github/workflows/self-scheduled-caller.yml
vendored
13
.github/workflows/self-scheduled-caller.yml
vendored
@ -54,12 +54,23 @@ jobs:
|
|||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
|
trainer-fsdp-ci:
|
||||||
|
name: Trainer/FSDP CI
|
||||||
|
uses: ./.github/workflows/self-scheduled.yml
|
||||||
|
with:
|
||||||
|
job: run_trainer_and_fsdp_gpu
|
||||||
|
slack_report_channel: "#transformers-ci-daily-training"
|
||||||
|
runner: daily-ci
|
||||||
|
docker: huggingface/transformers-all-latest-gpu
|
||||||
|
ci_event: Daily CI
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
deepspeed-ci:
|
deepspeed-ci:
|
||||||
name: DeepSpeed CI
|
name: DeepSpeed CI
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
uses: ./.github/workflows/self-scheduled.yml
|
||||||
with:
|
with:
|
||||||
job: run_torch_cuda_extensions_gpu
|
job: run_torch_cuda_extensions_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-deepspeed"
|
slack_report_channel: "#transformers-ci-daily-training"
|
||||||
runner: daily-ci
|
runner: daily-ci
|
||||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
|
33
.github/workflows/self-scheduled.yml
vendored
33
.github/workflows/self-scheduled.yml
vendored
@ -45,7 +45,7 @@ env:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
setup:
|
setup:
|
||||||
if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
||||||
name: Setup
|
name: Setup
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@ -77,12 +77,17 @@ jobs:
|
|||||||
run: pip freeze
|
run: pip freeze
|
||||||
|
|
||||||
- id: set-matrix
|
- id: set-matrix
|
||||||
if: ${{ inputs.job == 'run_models_gpu' }}
|
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||||
name: Identify models to test
|
name: Identify models to test
|
||||||
working-directory: /transformers/tests
|
working-directory: /transformers/tests
|
||||||
run: |
|
run: |
|
||||||
|
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||||
|
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||||
|
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||||
|
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||||
|
fi
|
||||||
|
|
||||||
- id: set-matrix-quantization
|
- id: set-matrix-quantization
|
||||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
||||||
@ -113,6 +118,25 @@ jobs:
|
|||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
|
run_trainer_and_fsdp_gpu:
|
||||||
|
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
|
||||||
|
name: " "
|
||||||
|
needs: setup
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
|
slice_id: [0, 1]
|
||||||
|
uses: ./.github/workflows/model_jobs.yml
|
||||||
|
with:
|
||||||
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
|
machine_type: ${{ matrix.machine_type }}
|
||||||
|
slice_id: ${{ matrix.slice_id }}
|
||||||
|
runner: ${{ inputs.runner }}
|
||||||
|
docker: ${{ inputs.docker }}
|
||||||
|
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
run_pipelines_torch_gpu:
|
run_pipelines_torch_gpu:
|
||||||
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
|
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
|
||||||
name: PyTorch pipelines
|
name: PyTorch pipelines
|
||||||
@ -336,10 +360,6 @@ jobs:
|
|||||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||||
run: git fetch && git checkout ${{ github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
# TODO: update the docker image instead
|
|
||||||
- name: Reinstall some packages with specific versions
|
|
||||||
run: python3 -m pip install numpy==1.24.3 numba==0.61.0 scipy==1.12.0 scikit-learn==1.6.1
|
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||||
@ -545,6 +565,7 @@ jobs:
|
|||||||
needs: [
|
needs: [
|
||||||
setup,
|
setup,
|
||||||
run_models_gpu,
|
run_models_gpu,
|
||||||
|
run_trainer_and_fsdp_gpu,
|
||||||
run_pipelines_torch_gpu,
|
run_pipelines_torch_gpu,
|
||||||
run_pipelines_tf_gpu,
|
run_pipelines_tf_gpu,
|
||||||
run_examples_gpu,
|
run_examples_gpu,
|
||||||
|
@ -942,7 +942,6 @@ if __name__ == "__main__":
|
|||||||
# To find the PR number in a commit title, for example, `Add AwesomeFormer model (#99999)`
|
# To find the PR number in a commit title, for example, `Add AwesomeFormer model (#99999)`
|
||||||
pr_number_re = re.compile(r"\(#(\d+)\)$")
|
pr_number_re = re.compile(r"\(#(\d+)\)$")
|
||||||
|
|
||||||
title = f"🤗 Results of {ci_event} - {os.getenv('CI_TEST_JOB')}."
|
|
||||||
# Add Commit/PR title with a link for push CI
|
# Add Commit/PR title with a link for push CI
|
||||||
# (check the title in 2 env. variables - depending on the CI is triggered via `push` or `workflow_run` event)
|
# (check the title in 2 env. variables - depending on the CI is triggered via `push` or `workflow_run` event)
|
||||||
ci_title_push = os.environ.get("CI_TITLE_PUSH")
|
ci_title_push = os.environ.get("CI_TITLE_PUSH")
|
||||||
@ -994,6 +993,8 @@ if __name__ == "__main__":
|
|||||||
else:
|
else:
|
||||||
ci_title = ""
|
ci_title = ""
|
||||||
|
|
||||||
|
# `title` will be updated at the end before calling `Message()`.
|
||||||
|
title = f"🤗 Results of {ci_event}"
|
||||||
if runner_not_available or runner_failed or setup_failed:
|
if runner_not_available or runner_failed or setup_failed:
|
||||||
Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed)
|
Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed)
|
||||||
exit(0)
|
exit(0)
|
||||||
@ -1041,6 +1042,11 @@ if __name__ == "__main__":
|
|||||||
"Unclassified",
|
"Unclassified",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
job_name = os.getenv("CI_TEST_JOB")
|
||||||
|
report_name_prefix = "run_models_gpu"
|
||||||
|
if job_name == "run_trainer_and_fsdp_gpu":
|
||||||
|
report_name_prefix = job_name
|
||||||
|
|
||||||
# This dict will contain all the information relative to each model:
|
# This dict will contain all the information relative to each model:
|
||||||
# - Failures: the total, as well as the number of failures per-category defined above
|
# - Failures: the total, as well as the number of failures per-category defined above
|
||||||
# - Success: total
|
# - Success: total
|
||||||
@ -1055,13 +1061,13 @@ if __name__ == "__main__":
|
|||||||
"job_link": {},
|
"job_link": {},
|
||||||
}
|
}
|
||||||
for model in models
|
for model in models
|
||||||
if f"run_models_gpu_{model}_test_reports" in available_artifacts
|
if f"{report_name_prefix}_{model}_test_reports" in available_artifacts
|
||||||
}
|
}
|
||||||
|
|
||||||
unclassified_model_failures = []
|
unclassified_model_failures = []
|
||||||
|
|
||||||
for model in model_results.keys():
|
for model in model_results.keys():
|
||||||
for artifact_path in available_artifacts[f"run_models_gpu_{model}_test_reports"].paths:
|
for artifact_path in available_artifacts[f"{report_name_prefix}_{model}_test_reports"].paths:
|
||||||
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
|
artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"])
|
||||||
if "stats" in artifact:
|
if "stats" in artifact:
|
||||||
# Link to the GitHub Action job
|
# Link to the GitHub Action job
|
||||||
@ -1123,7 +1129,7 @@ if __name__ == "__main__":
|
|||||||
"PyTorch pipelines": "run_pipelines_torch_gpu_test_reports",
|
"PyTorch pipelines": "run_pipelines_torch_gpu_test_reports",
|
||||||
"TensorFlow pipelines": "run_pipelines_tf_gpu_test_reports",
|
"TensorFlow pipelines": "run_pipelines_tf_gpu_test_reports",
|
||||||
"Examples directory": "run_examples_gpu_test_reports",
|
"Examples directory": "run_examples_gpu_test_reports",
|
||||||
"Torch CUDA extension tests": "run_torch_cuda_extensions_gpu_test_reports",
|
"DeepSpeed": "run_torch_cuda_extensions_gpu_test_reports",
|
||||||
}
|
}
|
||||||
|
|
||||||
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
|
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
|
||||||
@ -1132,7 +1138,7 @@ if __name__ == "__main__":
|
|||||||
del additional_files["TensorFlow pipelines"]
|
del additional_files["TensorFlow pipelines"]
|
||||||
elif ci_event.startswith("Scheduled CI (AMD)"):
|
elif ci_event.startswith("Scheduled CI (AMD)"):
|
||||||
del additional_files["TensorFlow pipelines"]
|
del additional_files["TensorFlow pipelines"]
|
||||||
del additional_files["Torch CUDA extension tests"]
|
del additional_files["DeepSpeed"]
|
||||||
elif ci_event.startswith("Push CI (AMD)"):
|
elif ci_event.startswith("Push CI (AMD)"):
|
||||||
additional_files = {}
|
additional_files = {}
|
||||||
|
|
||||||
@ -1143,12 +1149,11 @@ if __name__ == "__main__":
|
|||||||
"run_pipelines_torch_gpu": "PyTorch pipelines",
|
"run_pipelines_torch_gpu": "PyTorch pipelines",
|
||||||
"run_pipelines_tf_gpu": "TensorFlow pipelines",
|
"run_pipelines_tf_gpu": "TensorFlow pipelines",
|
||||||
"run_examples_gpu": "Examples directory",
|
"run_examples_gpu": "Examples directory",
|
||||||
"run_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
|
"run_torch_cuda_extensions_gpu": "DeepSpeed",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Remove some entries in `additional_files` if they are not concerned.
|
# Remove some entries in `additional_files` if they are not concerned.
|
||||||
test_name = None
|
test_name = None
|
||||||
job_name = os.getenv("CI_TEST_JOB")
|
|
||||||
if job_name in job_to_test_map:
|
if job_name in job_to_test_map:
|
||||||
test_name = job_to_test_map[job_name]
|
test_name = job_to_test_map[job_name]
|
||||||
additional_files = {k: v for k, v in additional_files.items() if k == test_name}
|
additional_files = {k: v for k, v in additional_files.items() if k == test_name}
|
||||||
@ -1243,7 +1248,7 @@ if __name__ == "__main__":
|
|||||||
"PyTorch pipelines": "torch_pipeline",
|
"PyTorch pipelines": "torch_pipeline",
|
||||||
"TensorFlow pipelines": "tf_pipeline",
|
"TensorFlow pipelines": "tf_pipeline",
|
||||||
"Examples directory": "example",
|
"Examples directory": "example",
|
||||||
"Torch CUDA extension tests": "deepspeed",
|
"DeepSpeed": "deepspeed",
|
||||||
}
|
}
|
||||||
for job, job_result in additional_results.items():
|
for job, job_result in additional_results.items():
|
||||||
with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp:
|
with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp:
|
||||||
@ -1270,6 +1275,19 @@ if __name__ == "__main__":
|
|||||||
artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
|
artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
job_to_test_map.update(
|
||||||
|
{
|
||||||
|
"run_models_gpu": "Models",
|
||||||
|
"run_trainer_and_fsdp_gpu": "Trainer & FSDP",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
ci_name_in_report = ""
|
||||||
|
if job_name in job_to_test_map:
|
||||||
|
ci_name_in_report = job_to_test_map[job_name]
|
||||||
|
|
||||||
|
title = f"🤗 Results of {ci_event}: {ci_name_in_report}"
|
||||||
|
|
||||||
message = Message(
|
message = Message(
|
||||||
title,
|
title,
|
||||||
ci_title,
|
ci_title,
|
||||||
|
Loading…
Reference in New Issue
Block a user