diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 0997a1112ad..95584176d6c 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -18,6 +18,10 @@ on: docker: required: true type: string + report_name_prefix: + required: false + default: run_models_gpu + type: string env: HF_HOME: /mnt/cache @@ -116,23 +120,23 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt - name: Run test shell: bash run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports" - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 75ea3bb24bc..8589f4a810b 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -54,12 +54,23 @@ jobs: ci_event: Daily CI secrets: inherit + trainer-fsdp-ci: + name: Trainer/FSDP CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_trainer_and_fsdp_gpu + slack_report_channel: "#transformers-ci-daily-training" + runner: daily-ci + docker: huggingface/transformers-all-latest-gpu + ci_event: Daily CI + secrets: inherit + deepspeed-ci: name: DeepSpeed CI uses: ./.github/workflows/self-scheduled.yml with: job: run_torch_cuda_extensions_gpu - slack_report_channel: "#transformers-ci-daily-deepspeed" + slack_report_channel: "#transformers-ci-daily-training" runner: daily-ci docker: huggingface/transformers-pytorch-deepspeed-latest-gpu ci_event: Daily CI diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index dead87b5b6e..7fce6d60800 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -45,7 +45,7 @@ env: jobs: setup: - if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job) + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job) name: Setup strategy: matrix: @@ -77,12 +77,17 @@ jobs: run: pip freeze - id: set-matrix - if: ${{ inputs.job == 'run_models_gpu' }} + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job) name: Identify models to test working-directory: /transformers/tests run: | - echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT - echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + if [ "${{ inputs.job }}" = "run_models_gpu" ]; then + echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT + echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then + echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT + echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT + fi - id: set-matrix-quantization if: ${{ inputs.job == 'run_quantization_torch_gpu' }} @@ -113,6 +118,25 @@ jobs: docker: ${{ inputs.docker }} secrets: inherit + run_trainer_and_fsdp_gpu: + if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }} + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + slice_id: [0, 1] + uses: ./.github/workflows/model_jobs.yml + with: + folder_slices: ${{ needs.setup.outputs.folder_slices }} + machine_type: ${{ matrix.machine_type }} + slice_id: ${{ matrix.slice_id }} + runner: ${{ inputs.runner }} + docker: ${{ inputs.docker }} + report_name_prefix: run_trainer_and_fsdp_gpu + secrets: inherit + run_pipelines_torch_gpu: if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} name: PyTorch pipelines @@ -336,10 +360,6 @@ jobs: working-directory: ${{ inputs.working-directory-prefix }}/transformers run: git fetch && git checkout ${{ github.sha }} - # TODO: update the docker image instead - - name: Reinstall some packages with specific versions - run: python3 -m pip install numpy==1.24.3 numba==0.61.0 scipy==1.12.0 scikit-learn==1.6.1 - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: ${{ inputs.working-directory-prefix }}/transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . @@ -545,6 +565,7 @@ jobs: needs: [ setup, run_models_gpu, + run_trainer_and_fsdp_gpu, run_pipelines_torch_gpu, run_pipelines_tf_gpu, run_examples_gpu, diff --git a/utils/notification_service.py b/utils/notification_service.py index 66db34e00c2..dd01b082f4a 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -942,7 +942,6 @@ if __name__ == "__main__": # To find the PR number in a commit title, for example, `Add AwesomeFormer model (#99999)` pr_number_re = re.compile(r"\(#(\d+)\)$") - title = f"🤗 Results of {ci_event} - {os.getenv('CI_TEST_JOB')}." # Add Commit/PR title with a link for push CI # (check the title in 2 env. variables - depending on the CI is triggered via `push` or `workflow_run` event) ci_title_push = os.environ.get("CI_TITLE_PUSH") @@ -994,6 +993,8 @@ if __name__ == "__main__": else: ci_title = "" + # `title` will be updated at the end before calling `Message()`. + title = f"🤗 Results of {ci_event}" if runner_not_available or runner_failed or setup_failed: Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed) exit(0) @@ -1041,6 +1042,11 @@ if __name__ == "__main__": "Unclassified", ] + job_name = os.getenv("CI_TEST_JOB") + report_name_prefix = "run_models_gpu" + if job_name == "run_trainer_and_fsdp_gpu": + report_name_prefix = job_name + # This dict will contain all the information relative to each model: # - Failures: the total, as well as the number of failures per-category defined above # - Success: total @@ -1055,13 +1061,13 @@ if __name__ == "__main__": "job_link": {}, } for model in models - if f"run_models_gpu_{model}_test_reports" in available_artifacts + if f"{report_name_prefix}_{model}_test_reports" in available_artifacts } unclassified_model_failures = [] for model in model_results.keys(): - for artifact_path in available_artifacts[f"run_models_gpu_{model}_test_reports"].paths: + for artifact_path in available_artifacts[f"{report_name_prefix}_{model}_test_reports"].paths: artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) if "stats" in artifact: # Link to the GitHub Action job @@ -1123,7 +1129,7 @@ if __name__ == "__main__": "PyTorch pipelines": "run_pipelines_torch_gpu_test_reports", "TensorFlow pipelines": "run_pipelines_tf_gpu_test_reports", "Examples directory": "run_examples_gpu_test_reports", - "Torch CUDA extension tests": "run_torch_cuda_extensions_gpu_test_reports", + "DeepSpeed": "run_torch_cuda_extensions_gpu_test_reports", } if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): @@ -1132,7 +1138,7 @@ if __name__ == "__main__": del additional_files["TensorFlow pipelines"] elif ci_event.startswith("Scheduled CI (AMD)"): del additional_files["TensorFlow pipelines"] - del additional_files["Torch CUDA extension tests"] + del additional_files["DeepSpeed"] elif ci_event.startswith("Push CI (AMD)"): additional_files = {} @@ -1143,12 +1149,11 @@ if __name__ == "__main__": "run_pipelines_torch_gpu": "PyTorch pipelines", "run_pipelines_tf_gpu": "TensorFlow pipelines", "run_examples_gpu": "Examples directory", - "run_torch_cuda_extensions_gpu": "Torch CUDA extension tests", + "run_torch_cuda_extensions_gpu": "DeepSpeed", } # Remove some entries in `additional_files` if they are not concerned. test_name = None - job_name = os.getenv("CI_TEST_JOB") if job_name in job_to_test_map: test_name = job_to_test_map[job_name] additional_files = {k: v for k, v in additional_files.items() if k == test_name} @@ -1243,7 +1248,7 @@ if __name__ == "__main__": "PyTorch pipelines": "torch_pipeline", "TensorFlow pipelines": "tf_pipeline", "Examples directory": "example", - "Torch CUDA extension tests": "deepspeed", + "DeepSpeed": "deepspeed", } for job, job_result in additional_results.items(): with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp: @@ -1270,6 +1275,19 @@ if __name__ == "__main__": artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"] ) + job_to_test_map.update( + { + "run_models_gpu": "Models", + "run_trainer_and_fsdp_gpu": "Trainer & FSDP", + } + ) + + ci_name_in_report = "" + if job_name in job_to_test_map: + ci_name_in_report = job_to_test_map[job_name] + + title = f"🤗 Results of {ci_event}: {ci_name_in_report}" + message = Message( title, ci_title,