CI: update to ROCm 6.0.2 and test MI300 (#30266)
* update to ROCm 6.0.2 and test MI300
* add callers for mi300
* update dockerfile
* fix trainer tests
* remove apex
* style
* Update tests/trainer/test_trainer_seq2seq.py
* Update tests/trainer/test_trainer_seq2seq.py
* Update tests/trainer/test_trainer_seq2seq.py
* Update tests/trainer/test_trainer_seq2seq.py
* update to torch 2.3
* add workflow dispatch target
* we may need branches: mi300-ci after all
* nit
* fix docker build
* nit
* add check runner
* remove docker-gpu
* fix issues
* fix

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent 539ed75d50
commit 37bba2a32d
.github/workflows/self-push-amd-mi300-caller.yml (new file, 25 lines)

@@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi300 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi300
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi300
+    secrets: inherit
.github/workflows/self-push-amd.yml (8 changed lines)

@@ -36,7 +36,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -57,7 +57,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -155,7 +155,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -230,7 +230,7 @@ jobs:
       - name: Run all non-slow selected tests on GPU
         working-directory: /transformers
         run: |
-          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
+          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
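The `-m "not not_device_test"` filter added above uses standard pytest marker selection: tests tagged with the `not_device_test` marker are deselected, so the GPU runners only spend time on tests that actually need an accelerator. A minimal, self-contained sketch of the mechanism; the marker name matches the diff, but the sample tests below are illustrative, not from the repository:

import pytest

@pytest.mark.not_device_test
def test_pure_python_helper():
    # Marked as device-independent: deselected by `pytest -m "not not_device_test"`.
    assert "a b".split() == ["a", "b"]

def test_forward_pass_on_accelerator():
    # Unmarked, so it is still collected and run on the GPU runner.
    assert 1 + 1 == 2

Registering the marker (e.g. `markers = not_device_test: ...` under `[pytest]` in `pytest.ini` or via `conftest.py`) keeps pytest from warning about an unknown mark.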
.github/workflows/self-scheduled-amd-mi210-caller.yml

@@ -16,4 +16,5 @@ jobs:
     uses: ./.github/workflows/self-scheduled-amd.yml
     with:
       gpu_flavor: mi210
+      slack_report_channel: "#transformers-ci-daily-amd"
     secrets: inherit
.github/workflows/self-scheduled-amd-mi250-caller.yml

@@ -16,4 +16,5 @@ jobs:
     uses: ./.github/workflows/self-scheduled-amd.yml
     with:
       gpu_flavor: mi250
+      slack_report_channel: "#transformers-ci-daily-amd"
     secrets: inherit
.github/workflows/self-scheduled-amd-mi300-caller.yml (new file, 21 lines)

@@ -0,0 +1,21 @@
+name: Self-hosted runner (AMD mi300 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  run_amd_ci:
+    name: AMD mi300
+    needs: build-docker-containers
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      gpu_flavor: mi300
+      slack_report_channel: "#transformers-ci-daily-amd"
+    secrets: inherit
.github/workflows/self-scheduled-amd.yml (26 changed lines)

@@ -34,7 +34,7 @@ jobs:
           fetch-depth: 2

       - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

   check_runners:
     name: Check Runners
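`hf-amd-mi300-ci-1gpu-1` joins the preflight runner check. `utils/check_self_hosted_runner.py` itself is not part of this diff; as an illustration only, such a check can be built on GitHub's REST endpoint for self-hosted runners (the helper below is hypothetical, not the script's actual code):

import requests

def offline_runners(repo: str, token: str, targets: set[str]) -> list[str]:
    # GET /repos/{owner}/{repo}/actions/runners reports each self-hosted
    # runner with a "status" field of "online" or "offline".
    url = f"https://api.github.com/repos/{repo}/actions/runners"
    resp = requests.get(url, headers={"Authorization": f"Bearer {token}"}, timeout=30)
    resp.raise_for_status()
    runners = resp.json().get("runners", [])
    return [r["name"] for r in runners if r["name"] in targets and r["status"] == "offline"]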
@@ -42,7 +42,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -63,7 +63,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -116,7 +116,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup.outputs.matrix) }}
         machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -162,7 +162,7 @@ jobs:

       - name: Run all tests on GPU
         working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
@@ -184,7 +184,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup.outputs.matrix) }}
         machine_type: [multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -230,7 +230,7 @@ jobs:

       - name: Run all tests on GPU
         working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
@@ -250,7 +250,7 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -287,7 +287,7 @@ jobs:
         working-directory: /transformers
         run: |
           pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
@@ -307,7 +307,7 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu
       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -343,7 +343,7 @@ jobs:
       - name: Run all pipeline tests on GPU
         working-directory: /transformers
         run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
@@ -364,7 +364,7 @@ jobs:
       matrix:
         machine_type: [single-gpu, multi-gpu]

-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-amd-gpu
@@ -400,7 +400,7 @@ jobs:

       - name: Run all tests on GPU
         working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"

       - name: Failure short reports
         if: ${{ failure() }}
docker/transformers-pytorch-amd-gpu/Dockerfile

@@ -1,24 +1,19 @@
-FROM rocm/dev-ubuntu-20.04:5.6
+FROM rocm/dev-ubuntu-22.04:6.0.2
 # rocm/pytorch has no version with 2.1.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG PYTORCH='2.1.0'
-ARG TORCH_VISION='0.16.0'
-ARG TORCH_AUDIO='2.1.0'
-ARG ROCM='5.6'
-
 RUN apt update && \
-    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \
+    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*

-RUN python3 -m pip install --no-cache-dir --upgrade pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip numpy

-RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
+RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0

-RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
+RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

 ARG REF=main
 WORKDIR /

@@ -35,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop

-# Remove nvml as it is not compatible with ROCm
-RUN python3 -m pip uninstall py3nvml pynvml -y
+# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either.
+RUN python3 -m pip uninstall py3nvml pynvml apex -y
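The pinned `torch==2.1.0` + ROCm 5.6 wheels give way to the latest stable wheels from the `rocm6.0` index (torch 2.3 at the time of this commit, per the commit message). A quick sanity check inside the built image, assuming only that the install succeeded; the exact version strings depend on what the index serves:

import torch

print(torch.__version__)          # e.g. "2.3.0+rocm6.0"
print(torch.version.hip)          # HIP/ROCm version string; None on CUDA-only builds
print(torch.cuda.is_available())  # ROCm devices are exposed through the torch.cuda API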
@@ -94,7 +94,7 @@ We strongly suggest referring to the detailed [installation instructions](https:
 </hfoption>
 <hfoption id="AMD">

-FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210** and **Instinct MI250**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.
+FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210**, **Instinct MI250** and **Instinct MI300**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.

 </hfoption>
 </hfoptions>
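Enabling FlashAttention-2 works the same on the newly listed MI300 as on the other supported GPUs. A brief usage sketch; the checkpoint name is illustrative:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",               # illustrative checkpoint
    torch_dtype=torch.float16,                # FlashAttention-2 requires fp16/bf16
    attn_implementation="flash_attention_2",
)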
@@ -1545,6 +1545,11 @@ class CodeCarbonCallback(TrainerCallback):
             raise RuntimeError(
                 "CodeCarbonCallback requires `codecarbon` to be installed. Run `pip install codecarbon`."
             )
+        elif torch.version.hip:
+            raise RuntimeError(
+                "CodeCarbonCallback requires `codecarbon` package, which is not compatible with AMD ROCm (https://github.com/mlco2/codecarbon/pull/490). When using the Trainer, please specify the `report_to` argument (https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.report_to) to disable CodeCarbonCallback."
+            )
+
         import codecarbon

         self._codecarbon = codecarbon
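The new guard keys off `torch.version.hip`, which is a version string on ROCm builds of PyTorch and `None` on CUDA or CPU builds, so plain truthiness distinguishes the two. The check in isolation (a sketch, outside the callback):

import torch

def is_rocm_pytorch() -> bool:
    # True on ROCm builds of PyTorch, where torch.version.hip is a version
    # string; False on CUDA/CPU builds, where it is None.
    return torch.version.hip is not None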
@@ -1735,6 +1735,13 @@ class TrainingArguments:
             from .integrations import get_available_reporting_integrations

             self.report_to = get_available_reporting_integrations()
+
+            if "codecarbon" in self.report_to and torch.version.hip:
+                logger.warning(
+                    "When using the Trainer, CodeCarbonCallback requires the `codecarbon` package, which is not compatible with AMD ROCm (https://github.com/mlco2/codecarbon/pull/490). Automatically disabling the codecarbon callback. Reference: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.report_to."
+                )
+                self.report_to.remove("codecarbon")
+
         elif self.report_to == "none" or self.report_to == ["none"]:
             self.report_to = []
         elif not isinstance(self.report_to, list):
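With this, the default `report_to` resolution drops codecarbon on ROCm with a warning instead of failing later, while `report_to="none"` (as the test updates below use throughout) opts out of integration callbacks entirely. A short usage sketch, assuming `transformers` and `torch` are installed and using an arbitrary output directory:

from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", report_to="none")
print(args.report_to)  # [] -- no integration callbacks will be attached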
@@ -301,6 +301,7 @@ class TestTrainerExt(TestCasePlus):
             --label_smoothing_factor 0.1
             --target_lang ro_RO
             --source_lang en_XX
+            --report_to none
         """.split()

         args_eval = f"""
@@ -607,7 +607,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):

         # Base training. Should have the same results as test_reproducible_training
         model = RegressionModel()
-        args = TrainingArguments("./regression", learning_rate=0.1)
+        args = TrainingArguments("./regression", learning_rate=0.1, report_to="none")
         trainer = Trainer(model, args, train_dataset=train_dataset)
         trainer.train()
         self.check_trained_model(trainer.model)
@@ -629,7 +629,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):

     def test_model_init(self):
         train_dataset = RegressionDataset()
-        args = TrainingArguments("./regression", learning_rate=0.1)
+        args = TrainingArguments("./regression", learning_rate=0.1, report_to="none")
         trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
         trainer.train()
         self.check_trained_model(trainer.model)
|
|||||||
|
|
||||||
def test_custom_optimizer(self):
|
def test_custom_optimizer(self):
|
||||||
train_dataset = RegressionDataset()
|
train_dataset = RegressionDataset()
|
||||||
args = TrainingArguments("./regression")
|
args = TrainingArguments("./regression", report_to="none")
|
||||||
model = RegressionModel()
|
model = RegressionModel()
|
||||||
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
|
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
|
||||||
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
|
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
|
||||||
@@ -716,6 +716,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
             lr_scheduler_kwargs=extra_kwargs,
             learning_rate=0.2,
             warmup_steps=num_warmup_steps,
+            report_to="none",
         )
         trainer = Trainer(model, args, train_dataset=train_dataset)
         trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
@@ -742,6 +743,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
             lr_scheduler_kwargs=extra_kwargs,
             learning_rate=0.2,
             warmup_steps=num_warmup_steps,
+            report_to="none",
         )
         trainer = Trainer(model, args, train_dataset=train_dataset)
         trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
@@ -762,6 +764,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
             "./regression",
             eval_strategy="epoch",
             metric_for_best_model="eval_loss",
+            report_to="none",
         )
         model = RegressionModel()
         optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
@@ -796,6 +799,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
             metric_for_best_model="eval_loss",
             num_train_epochs=10,
             learning_rate=0.2,
+            report_to="none",
         )
         model = RegressionModel()
         trainer = TrainerWithLRLogs(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
@@ -828,7 +832,7 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
         from transformers.optimization import Adafactor, AdafactorSchedule

         train_dataset = RegressionDataset()
-        args = TrainingArguments("./regression")
+        args = TrainingArguments("./regression", report_to="none")
         model = RegressionModel()
         optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
         lr_scheduler = AdafactorSchedule(optimizer)
@@ -879,7 +883,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         train_dataset = RegressionDataset()
         eval_dataset = RegressionDataset()
         model = RegressionDictModel()
-        args = TrainingArguments("./regression")
+        args = TrainingArguments("./regression", report_to="none")
         trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
         trainer.train()
         _ = trainer.evaluate()
@@ -890,7 +894,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         tiny_gpt2 = GPT2LMHeadModel(config)
         x = torch.randint(0, 100, (128,))
         eval_dataset = RepeatDataset(x)
-        args = TrainingArguments("./test")
+        args = TrainingArguments("./test", report_to="none")
         trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
         # By default the past_key_values are removed
         result = trainer.predict(eval_dataset)
@@ -1100,7 +1104,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):

         # Trainer without inf/nan filter
         args = TrainingArguments(
-            "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
+            "./test",
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            neftune_noise_alpha=0.4,
+            report_to="none",
         )
         trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

@@ -1117,7 +1126,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         tiny_gpt2 = GPT2LMHeadModel(config)
         # Trainer without inf/nan filter
         args = TrainingArguments(
-            "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
+            "./test",
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            neftune_noise_alpha=0.4,
+            report_to="none",
         )
         trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

@@ -1143,13 +1157,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         train_dataset = RepeatDataset(x)

         # Trainer without inf/nan filter
-        args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False)
+        args = TrainingArguments(
+            "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none"
+        )
         trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
         trainer.train()
         log_history_no_filter = trainer.state.log_history

         # Trainer with inf/nan filter
-        args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True)
+        args = TrainingArguments(
+            "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none"
+        )
         trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
         trainer.train()
         log_history_filter = trainer.state.log_history
@@ -1196,11 +1214,16 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
     # tests that we do not require dataloader to have a .dataset attribute
     def test_dataloader_without_dataset(self):
         train_dataset = RegressionDataset(length=128)
-        trainer = CustomDataloaderTrainer(
-            model=RegressionModel(), train_dataset=train_dataset, eval_dataset=train_dataset
-        )
-        trainer.train()
-        trainer.evaluate()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = CustomDataloaderTrainer(
+                model=RegressionModel(),
+                train_dataset=train_dataset,
+                eval_dataset=train_dataset,
+                args=TrainingArguments(output_dir=tmp_dir, report_to="none"),
+            )
+
+            trainer.train()
+            trainer.evaluate()

     def test_galore_matched_modules(self):
         regex_patterns = [r".*.attn.*", r".*.mlp.*"]
@@ -1495,7 +1518,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         # Make the Trainer believe it's a parallelized model
         model.is_parallelizable = True
         model.model_parallel = True
-        args = TrainingArguments("./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16)
+        args = TrainingArguments(
+            "./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16, report_to="none"
+        )
         trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset())
         # Check the Trainer was fooled
         self.assertTrue(trainer.is_model_parallel)
@@ -1849,7 +1874,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
     def test_dynamic_shapes(self):
         eval_dataset = DynamicShapesDataset(batch_size=self.batch_size)
         model = RegressionModel(a=2, b=1)
-        args = TrainingArguments("./regression")
+        args = TrainingArguments("./regression", report_to="none")
         trainer = Trainer(model, args, eval_dataset=eval_dataset)

         # Check evaluation can run to completion
@@ -1866,7 +1891,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         self.assertTrue(np.all(seen[expected.shape[0] :] == -100))

         # Same tests with eval accumulation
-        args = TrainingArguments("./regression", eval_accumulation_steps=2)
+        args = TrainingArguments("./regression", eval_accumulation_steps=2, report_to="none")
         trainer = Trainer(model, args, eval_dataset=eval_dataset)

         # Check evaluation can run to completion
@@ -2984,13 +3009,14 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):

     def test_no_wd_param_group(self):
         model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
-        trainer = Trainer(model=model)
-        trainer.create_optimizer_and_scheduler(10)
-        wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight']  # fmt: skip
-        wd_params = [p for n, p in model.named_parameters() if n in wd_names]
-        no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names]
-        self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params)
-        self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none"))
+            trainer.create_optimizer_and_scheduler(10)
+            wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight']  # fmt: skip
+            wd_params = [p for n, p in model.named_parameters() if n in wd_names]
+            no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names]
+            self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params)
+            self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)

     @slow
     @require_torch_multi_accelerator
@@ -4134,32 +4160,35 @@ class OptimizerAndModelInspectionTest(unittest.TestCase):
         # in_features * out_features + bias
         layer_1 = 128 * 64 + 64
         layer_2 = 64 * 32 + 32
-        trainer = Trainer(model=model)
-        self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2)
-        # Freeze the last layer
-        for param in model[-1].parameters():
-            param.requires_grad = False
-        self.assertEqual(trainer.get_num_trainable_parameters(), layer_1)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none"))
+            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2)
+            # Freeze the last layer
+            for param in model[-1].parameters():
+                param.requires_grad = False
+            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1)

     def test_get_learning_rates(self):
         model = nn.Sequential(nn.Linear(128, 64))
-        trainer = Trainer(model=model)
-        with self.assertRaises(ValueError):
-            trainer.get_learning_rates()
-        trainer.create_optimizer()
-        self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05])
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none"))
+            with self.assertRaises(ValueError):
+                trainer.get_learning_rates()
+            trainer.create_optimizer()
+            self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05])

     def test_get_optimizer_group(self):
         model = nn.Sequential(nn.Linear(128, 64))
-        trainer = Trainer(model=model)
-        # ValueError is raised if optimizer is None
-        with self.assertRaises(ValueError):
-            trainer.get_optimizer_group()
-        trainer.create_optimizer()
-        # Get groups
-        num_groups = len(trainer.get_optimizer_group())
-        self.assertEqual(num_groups, 2)
-        # Get group of parameter
-        param = next(model.parameters())
-        group = trainer.get_optimizer_group(param)
-        self.assertIn(param, group["params"])
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none"))
+            # ValueError is raised if optimizer is None
+            with self.assertRaises(ValueError):
+                trainer.get_optimizer_group()
+            trainer.create_optimizer()
+            # Get groups
+            num_groups = len(trainer.get_optimizer_group())
+            self.assertEqual(num_groups, 2)
+            # Get group of parameter
+            param = next(model.parameters())
+            group = trainer.get_optimizer_group(param)
+            self.assertIn(param, group["params"])
@@ -153,7 +153,7 @@ class TestTrainerDistributed(TestCasePlus):
             {self.test_file_dir}/test_trainer_distributed.py
         """.split()
         output_dir = self.get_auto_remove_tmp_dir()
-        args = f"--output_dir {output_dir}".split()
+        args = f"--output_dir {output_dir} --report_to none".split()
         cmd = ["torchrun"] + distributed_args + args
         execute_subprocess_async(cmd, env=self.get_env())
         # successful return here == success - any errors would have caused an error in the sub-call
tests/trainer/test_trainer_seq2seq.py

@@ -119,6 +119,7 @@ class Seq2seqTrainerTester(TestCasePlus):
             warmup_steps=0,
             eval_steps=2,
             logging_steps=2,
+            report_to="none",
         )

         # instantiate trainer
@@ -152,7 +153,7 @@ class Seq2seqTrainerTester(TestCasePlus):
             "google-t5/t5-small", max_length=None, min_length=None, max_new_tokens=256, min_new_tokens=1, num_beams=5
         )

-        training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True)
+        training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, report_to="none")

         trainer = Seq2SeqTrainer(
             model=model,
@@ -160,6 +161,7 @@ class Seq2seqTrainerTester(TestCasePlus):
             tokenizer=tokenizer,
             data_collator=data_collator,
             compute_metrics=lambda x: {"samples": x[0].shape[0]},
+            report_to="none",
         )

         def prepare_data(examples):
@@ -191,7 +193,9 @@ class Seq2seqTrainerTester(TestCasePlus):
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
         gen_config = GenerationConfig(do_sample=False, top_p=0.9)  # bad: top_p is not compatible with do_sample=False

-        training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, generation_config=gen_config)
+        training_args = Seq2SeqTrainingArguments(
+            ".", predict_with_generate=True, generation_config=gen_config, report_to="none"
+        )
         with self.assertRaises(ValueError) as exc:
             _ = Seq2SeqTrainer(
                 model=model,