Merge branch 'main' into device-map-tp-plan

Marc Sun 2025-06-23 11:28:28 +02:00 committed by GitHub
commit 3e2d38efab
86 changed files with 1157 additions and 302 deletions

View File

@ -12,8 +12,8 @@ on:
slice_id:
required: true
type: number
runner:
required: true
runner_map:
required: false
type: string
docker:
required: true
@ -45,7 +45,7 @@ jobs:
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on:
group: '${{ inputs.machine_type }}'
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
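The runner group is now resolved per test folder from a `runner_map` JSON produced by the setup job (see the `utils/get_runner_map.py` call later in this diff) instead of a single `runner` input. A minimal sketch of the shape the expression above requires; the folder names and runner groups below are placeholders, not the script's actual output:

```py
# Illustrative only: the JSON shape that
# `fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type]` needs,
# i.e. test folder -> machine type -> runner group. Names are placeholders.
runner_map = {
    "models/bert": {
        "single-gpu": "aws-g4dn-4xlarge-cache",
        "multi-gpu": "aws-g4dn-12xlarge-cache",
    },
    "models/llama": {
        "single-gpu": "aws-g4dn-4xlarge-cache",
        "multi-gpu": "aws-g4dn-12xlarge-cache",
    },
}

print(runner_map["models/bert"]["single-gpu"])  # aws-g4dn-4xlarge-cache
```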

View File

@ -1,128 +0,0 @@
name: model jobs
on:
workflow_call:
inputs:
folder_slices:
required: true
type: string
machine_type:
required: true
type: string
slice_id:
required: true
type: number
runner:
required: true
type: string
docker:
required: true
type: string
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
# This token is created under the bot `hf-transformers-bot`.
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1
jobs:
run_models_gpu:
name: " "
strategy:
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
fail-fast: false
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
container:
image: ${{ inputs.docker }}
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Update / Install some packages (for Past CI)
if: ${{ contains(inputs.docker, '-past-') }}
working-directory: /transformers
run: |
python3 -m pip install -U datasets
- name: Update / Install some packages (for Past CI)
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
working-directory: /transformers
run: |
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Run test
shell: bash
run: |
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports

View File

@ -0,0 +1,121 @@
name: model jobs
on:
workflow_call:
inputs:
folder_slices:
required: true
type: string
slice_id:
required: true
type: number
runner:
required: true
type: string
machine_type:
required: true
type: string
report_name_prefix:
required: false
default: run_models_gpu
type: string
env:
RUN_SLOW: yes
PT_HPU_LAZY_MODE: 0
TRANSFORMERS_IS_CI: yes
PT_ENABLE_INT64_SUPPORT: 1
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
HF_HOME: /mnt/cache/.cache/huggingface
jobs:
run_models_gpu:
name: " "
strategy:
max-parallel: 8
fail-fast: false
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on:
group: ${{ inputs.runner }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: python3 utils/print_env.py
- name: Show installed libraries and their versions
run: pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ inputs.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all tests on Gaudi
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Run test
shell: bash
run: |
mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
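Putting the pieces together, the report and artifact names are built from the normalized `machine_type`, the `report_name_prefix` input, and the sanitized folder name. An illustrative sketch (the values are examples, not real CI output):

```py
# Illustrative only: how a report/artifact name is assembled for one matrix entry.
machine_type = "single-gpu"              # "1gaudi" is mapped to "single-gpu" above
report_name_prefix = "run_models_gpu"    # default value of the workflow input
matrix_folders = "models_bert"           # "models/bert" with "/" replaced, since "/" is not allowed in names

print(f"{machine_type}_{report_name_prefix}_{matrix_folders}_test_reports")
# single-gpu_run_models_gpu_models_bert_test_reports
```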

View File

@ -22,7 +22,7 @@ on:
default: ""
# Used for `push` to easily modiffy the target workflow runs to compare against
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
@ -51,7 +51,6 @@ jobs:
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-daily-models"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
@ -63,7 +62,6 @@ jobs:
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
@ -75,7 +73,6 @@ jobs:
with:
job: run_examples_gpu
slack_report_channel: "#transformers-ci-daily-examples"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
@ -87,7 +84,6 @@ jobs:
with:
job: run_trainer_and_fsdp_gpu
slack_report_channel: "#transformers-ci-daily-training"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
@ -99,7 +95,6 @@ jobs:
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-training"
runner: daily-ci
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
@ -112,7 +107,6 @@ jobs:
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci

View File

@ -0,0 +1,345 @@
name: Self-hosted runner (scheduled-intel-gaudi)
on:
workflow_call:
inputs:
job:
required: true
type: string
slack_report_channel:
required: true
type: string
runner_scale_set:
required: true
type: string
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
NUM_SLICES: 2
RUN_SLOW: yes
PT_HPU_LAZY_MODE: 0
TRANSFORMERS_IS_CI: yes
PT_ENABLE_INT64_SUPPORT: 1
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
HF_HOME: /mnt/cache/.cache/huggingface
jobs:
setup:
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
name: Setup
runs-on: ubuntu-latest
outputs:
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- id: set-matrix
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
name: Identify models to test
working-directory: tests
run: |
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
fi
- id: set-matrix-quantization
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: Identify quantization method to test
working-directory: tests
run: |
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
run_models_gpu:
if: ${{ inputs.job == 'run_models_gpu' }}
name: " "
needs: setup
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
with:
slice_id: ${{ matrix.slice_id }}
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_models_gpu
secrets: inherit
run_trainer_and_fsdp_gpu:
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
name: " "
needs: setup
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
with:
slice_id: ${{ matrix.slice_id }}
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit
run_pipelines_gpu:
if: ${{ inputs.job == 'run_pipelines_gpu' }}
name: Pipelines
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: python3 utils/print_env.py
- name: Show installed libraries and their versions
run: pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all pipeline tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
run: |
pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run examples tests on Intel Gaudi
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
run_deepspeed_gpu:
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
name: Intel Gaudi deepspeed tests
strategy:
fail-fast: false
matrix:
machine_type: [1gaudi, 2gaudi]
runs-on:
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
container:
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
options: --runtime=habana
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
--env HABANA_VISIBLE_DEVICES
--env HABANA_VISIBLE_MODULES
--cap-add=sys_nice
--shm-size=64G
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
- name: HL-SMI
run: |
hl-smi
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
- name: Environment
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
run: |
pip freeze
- name: Set `machine_type` for report and artifact names
shell: bash
run: |
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all DeepSpeed tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
send_results:
name: Slack Report
needs:
[
setup,
run_models_gpu,
run_examples_gpu,
run_pipelines_gpu,
run_deepspeed_gpu,
run_trainer_and_fsdp_gpu,
]
if: ${{ always() }}
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}
setup_status: ${{ needs.setup.result }}
slack_report_channel: ${{ inputs.slack_report_channel }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
report_repo_id: ${{ inputs.report_repo_id }}
ci_event: ${{ inputs.ci_event }}
secrets: inherit
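For reference, the matrix slicing works the same way as in the CUDA scheduled workflow: `utils/split_model_tests.py` is expected to print `NUM_SLICES` lists of test folders, and each downstream job picks its slice via `fromJson(inputs.folder_slices)[inputs.slice_id]`. A hedged sketch with placeholder folder names:

```py
# Illustrative only: the shapes the setup job is expected to emit with NUM_SLICES = 2.
# Folder names are placeholders, not the real output of utils/split_model_tests.py.
folder_slices = [
    ["models/albert", "models/bert", "models/blip"],
    ["models/llama", "models/mistral", "models/t5"],
]
slice_ids = list(range(2))  # [0, 1]

# Each (machine_type, slice_id) matrix job then tests the folders of its slice:
for slice_id in slice_ids:
    for folder in folder_slices[slice_id]:
        print(f"python3 -m pytest tests/{folder}")
```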

View File

@ -0,0 +1,67 @@
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
on:
repository_dispatch:
workflow_dispatch:
schedule:
- cron: "17 2 * * *"
jobs:
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_models_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
pipeline-ci:
name: Pipeline CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_pipelines_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_examples_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_deepspeed_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit
trainer-fsdp-ci:
name: Trainer/FSDP CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_trainer_and_fsdp_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
secrets: inherit

View File

@ -15,9 +15,6 @@ on:
slack_report_channel:
required: true
type: string
runner:
required: true
type: string
docker:
required: true
type: string
@ -62,6 +59,7 @@ jobs:
outputs:
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
steps:
- name: Update clone
@ -88,6 +86,7 @@ jobs:
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
@ -111,14 +110,14 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [single-gpu, multi-gpu]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner: ${{ inputs.runner }}
runner_map: ${{ needs.setup.outputs.runner_map }}
docker: ${{ inputs.docker }}
secrets: inherit
@ -136,7 +135,6 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner: ${{ inputs.runner }}
docker: ${{ inputs.docker }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit

View File

@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
ARG TORCH_VISION='0.21.0'
ARG TORCH_AUDIO='2.6.0'
RUN apt update && \
apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
apt clean && \
@ -20,6 +23,7 @@ WORKDIR /
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
RUN python3 -m pip uninstall -y tensorflow flax

View File

@ -0,0 +1,93 @@
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
LABEL maintainer="Hugging Face"
SHELL ["/bin/bash", "-c"]
ARG PYTHON_VER=3.11
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get remove -y python3.10 && apt-get autoremove -y
RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository -y ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
ln -sf /usr/bin/python3 /usr/bin/python && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN apt-get update && \
apt-get -y install \
apt-utils \
build-essential \
ca-certificates \
clinfo \
curl \
git \
git-lfs \
vim \
numactl \
gnupg2 \
gpg-agent \
zlib1g-dev \
rsync \
sudo \
libnl-genl-3-200 \
xpu-smi \
unzip \
ffmpeg \
tesseract-ocr \
espeak-ng \
wget \
ncurses-term && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN apt-get update && \
apt-get install -y \
linux-headers-$(uname -r) \
linux-modules-extra-$(uname -r) \
flex bison \
intel-fw-gpu intel-i915-dkms xpu-smi \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip
RUN pip install triton==3.3.0
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
# install bitsandbytes
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV CCL_ROOT=/usr/local
ENV CCL_ATL_TRANSPORT=ofi
ENV I_MPI_ROOT=/usr/local
ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
RUN touch /entrypoint.sh
RUN chmod +x /entrypoint.sh
RUN echo "#!/bin/bash" >> /entrypoint.sh
RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

View File

@ -468,9 +468,17 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
Follow the recommended practices below to ensure your custom decoding method works as expected.
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
- You can add other files in the `custom_generate` folder, and use relative imports.
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
Your custom `generate` method can import code from the `custom_generate` folder using relative imports. For example, if you have a `utils.py` file, you can import it like this:
```py
from .utils import some_function
```
Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom decoding method.
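For instance, assuming a local folder `./my_decoding_method` that contains such a `custom_generate/` directory (the model name and paths below are placeholders), the development loop could look like this:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
inputs = tokenizer("The quick brown fox", return_tensors="pt").to(model.device)

# ./my_decoding_method contains a `custom_generate/` folder with `generate.py`
# (and optionally `utils.py`, `requirements.txt`, ...).
outputs = model.generate(**inputs, custom_generate="./my_decoding_method", trust_remote_code=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```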
#### requirements.txt
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.

View File

@ -14,35 +14,76 @@ rendered properly in your Markdown viewer.
-->
# BLIP
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
</div>
## Overview
# BLIP
The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/papers/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.
BLIP is a model that is able to perform various multi-modal tasks including:
- Visual Question Answering
- Image-Text retrieval (Image-text matching)
- Image Captioning
The abstract from the paper is the following:
You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.
*Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks.
However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. Code, models, and datasets are released.*
> [!TIP]
> This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
>
> Click on the BLIP models in the right sidebar for more examples of how to apply BLIP to different vision language tasks.
![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif)
The example below demonstrates how to perform visual question answering with [`Pipeline`] or the [`AutoModel`] class.
This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
The original code can be found [here](https://github.com/salesforce/BLIP).
<hfoptions id="usage">
<hfoption id="Pipeline">
```python
import torch
from transformers import pipeline
pipeline = pipeline(
task="visual-question-answering",
model="Salesforce/blip-vqa-base",
torch_dtype=torch.float16,
device=0
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
pipeline(question="What is the weather in this image?", image=url)
```
</hfoption>
<hfoption id="AutoModel">
```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = AutoModelForVisualQuestionAnswering.from_pretrained(
"Salesforce/blip-vqa-base",
torch_dtype=torch.float16,
device_map="auto"
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
question = "What is the weather in this image?"
inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)
output = model.generate(**inputs)
processor.batch_decode(output, skip_special_tokens=True)[0]
```
</hfoption>
</hfoptions>
## Resources
- [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) on how to fine-tune BLIP for image captioning on a custom dataset
Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) to learn how to fine-tune BLIP for image captioning on a custom dataset.
## BlipConfig

View File

@ -17,6 +17,7 @@ import json
import logging
import os
import sys
import unittest
from unittest.mock import patch
from transformers import ViTMAEForPreTraining, Wav2Vec2ForPreTraining
@ -414,6 +415,7 @@ class ExamplesTests(TestCasePlus):
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_accuracy"], 0.8)
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_ctc(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
@ -445,6 +447,7 @@ class ExamplesTests(TestCasePlus):
result = get_results(tmp_dir)
self.assertLess(result["eval_loss"], result["train_loss"])
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_ctc_adapter(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
@ -478,6 +481,7 @@ class ExamplesTests(TestCasePlus):
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "./adapter.tur.safetensors")))
self.assertLess(result["eval_loss"], result["train_loss"])
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_seq2seq(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""

View File

@ -23,7 +23,7 @@ from tqdm import tqdm
from ...models.bert.tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import is_tf_available, is_torch_available, logging
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
from .utils import DataProcessor
@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
is_training=not evaluate,
)
```"""
# Defining helper methods
features = []
threads = min(threads, cpu_count())
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
if not is_torch_hpu_available():
threads = min(threads, cpu_count())
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
annotate_ = partial(
squad_convert_example_to_features,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
padding_strategy=padding_strategy,
is_training=is_training,
)
features = list(
tqdm(
p.imap(annotate_, examples, chunksize=32),
total=len(examples),
desc="convert squad examples to features",
disable=not tqdm_enabled,
)
)
else:
# Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
annotate_ = partial(
squad_convert_example_to_features,
max_seq_length=max_seq_length,
@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
)
features = list(
tqdm(
p.imap(annotate_, examples, chunksize=32),
map(annotate_, examples),
total=len(examples),
desc="convert squad examples to features",
disable=not tqdm_enabled,

View File

@ -402,10 +402,11 @@ def get_cached_module_file(
if not (submodule_path / module_file).exists() or not filecmp.cmp(
resolved_module_file, str(submodule_path / module_file)
):
(submodule_path / module_file).parent.mkdir(parents=True, exist_ok=True)
shutil.copy(resolved_module_file, submodule_path / module_file)
importlib.invalidate_caches()
for module_needed in modules_needed:
module_needed = f"{module_needed}.py"
module_needed = Path(module_file).parent / f"{module_needed}.py"
module_needed_file = os.path.join(pretrained_model_name_or_path, module_needed)
if not (submodule_path / module_needed).exists() or not filecmp.cmp(
module_needed_file, str(submodule_path / module_needed)
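The change above resolves a dependency module relative to the directory of the module that declares it, instead of the repository root. A small worked example of the resulting path (file names are hypothetical):

```py
from pathlib import Path

# A custom module "subdir/modeling_custom.py" declares a dependency on "utils".
module_file = "subdir/modeling_custom.py"
module_needed = "utils"

# Before: the dependency was looked up as "utils.py" at the repo root.
# After:  it is resolved next to the module that needs it.
resolved = Path(module_file).parent / f"{module_needed}.py"
print(resolved)  # subdir/utils.py
```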

View File

@ -27,8 +27,6 @@ from ..utils import is_torch_greater_or_equal, logging
from ..utils.generic import GeneralInterface
ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
logger = logging.get_logger(__name__)
# Cache this result as it's a C FFI call which can be pretty time-consuming

View File

@ -172,7 +172,8 @@ _is_quantized = False
_is_ds_init_called = False
_torch_distributed_available = torch.distributed.is_available()
if _torch_distributed_available and is_torch_greater_or_equal("2.5"):
_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
if _is_dtensor_available:
from torch.distributed.tensor import DTensor
@ -3780,7 +3781,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
for shard_file, tensors in filename_to_tensors:
shard = {}
for tensor in tensors:
if isinstance(state_dict[tensor], DTensor):
if _is_dtensor_available and isinstance(state_dict[tensor], DTensor):
full_tensor = state_dict[tensor].full_tensor()
# to get the correctly ordered tensor we need to repack if packed
if _get_parameter_tp_plan(tensor, self._tp_plan) in ("local_packed_rowwise",):

View File

@ -1056,6 +1056,12 @@ class AriaModel(AriaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -1220,10 +1226,10 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,
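The same accessor pattern is added to many multimodal models in this commit: the inner `*Model` exposes `set_decoder`/`get_decoder` on its language model, and the `*ForConditionalGeneration` wrapper delegates to it instead of replacing `self.model`. A minimal, self-contained sketch of the pattern (class names are placeholders; note the getter must call `get_decoder()` rather than return the bound method):

```py
import torch.nn as nn

class ToyMultimodalModel(nn.Module):
    """Stands in for e.g. AriaModel: owns the language model."""
    def __init__(self, language_model: nn.Module):
        super().__init__()
        self.language_model = language_model

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

class ToyForConditionalGeneration(nn.Module):
    """Stands in for e.g. AriaForConditionalGeneration: delegates to the inner model."""
    def __init__(self, model: ToyMultimodalModel):
        super().__init__()
        self.model = model

    def set_decoder(self, decoder):
        # Delegate instead of overwriting self.model with the decoder.
        self.model.set_decoder(decoder)

    def get_decoder(self):
        # Call get_decoder() so the decoder module is returned, not the bound method.
        return self.model.get_decoder()
```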

View File

@ -211,6 +211,12 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -389,10 +395,10 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -30,7 +30,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
LossKwargs,
auto_docstring,
@ -72,9 +71,6 @@ class ChameleonRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm)
# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon
# TODO(joao): add me back asap :)
class ChameleonRotaryEmbedding(nn.Module):

View File

@ -35,7 +35,6 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, logging
from ..llama.modeling_llama import (
LlamaAttention,
@ -69,9 +68,6 @@ class CohereLayerNorm(nn.Module):
return hidden_states.to(input_dtype)
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
class CohereRotaryEmbedding(LlamaRotaryEmbedding):
@torch.no_grad()
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)

View File

@ -34,7 +34,6 @@ from ....modeling_outputs import (
TokenClassifierOutput,
)
from ....modeling_utils import PreTrainedModel
from ....pytorch_utils import ALL_LAYERNORM_LAYERS
from ....utils import (
add_code_sample_docstrings,
add_start_docstrings,
@ -311,10 +310,6 @@ class MegaSequenceNorm(nn.Module):
return self.norm(input)
# add this layernorm class to ALL_LAYERNORM_LAYERS
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)
class MegaMultiDimensionDampedEma(nn.Module):
"""
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of

View File

@ -1451,6 +1451,12 @@ class Emu3Model(Emu3PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.text_model = decoder
def get_decoder(self):
return self.text_model
def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
"""
Tokenizes images into discrete tokens with VQGAN module. Converts
@ -1599,10 +1605,10 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
# Make modules available through the conditional class for BC
@property

View File

@ -938,6 +938,12 @@ class Emu3Model(Emu3PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.text_model = decoder
def get_decoder(self):
return self.text_model
def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
"""
Tokenizes images into discrete tokens with VQGAN module. Converts
@ -1086,10 +1092,10 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
# Make modules available through the conditional class for BC
@property

View File

@ -86,6 +86,12 @@ class FuyuModel(FuyuPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def gather_continuous_embeddings(
self,
word_embeddings: torch.Tensor,

View File

@ -829,6 +829,12 @@ class Gemma3Model(Gemma3PreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
"""
Projects the last hidden state from the vision model into language model space.
@ -1014,10 +1020,10 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(self, pixel_values):
return self.model.get_image_features(pixel_values)

View File

@ -637,6 +637,12 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -757,10 +763,10 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -27,7 +27,6 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_granitemoe import GraniteMoeConfig
@ -145,9 +144,6 @@ class GraniteMoeRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(GraniteMoeRMSNorm)
# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
class GraniteMoeRotaryEmbedding(nn.Module):
def __init__(self, config: GraniteMoeConfig, device=None):

View File

@ -35,7 +35,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PretrainedConfig, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler
@ -386,9 +385,6 @@ class IdeficsRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
# this was adapted from LlamaRotaryEmbedding
class IdeficsEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):

View File

@ -627,6 +627,12 @@ class InternVLModel(InternVLPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -878,10 +884,10 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -40,7 +40,6 @@ from ...modeling_outputs import (
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from .configuration_llama import LlamaConfig
@ -69,9 +68,6 @@ class LlamaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
class LlamaRotaryEmbedding(nn.Module):
def __init__(self, config: LlamaConfig, device=None):
super().__init__()

View File

@ -181,6 +181,12 @@ class LlavaModel(LlavaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -371,10 +377,10 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -294,6 +294,12 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
"""
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
@ -569,10 +575,10 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
return self.model.pack_image_features(

View File

@ -348,6 +348,12 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
"""
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
@ -701,10 +707,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
return self.model.pack_image_features(

View File

@ -350,6 +350,12 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
"""
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
@ -742,10 +748,10 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
return self.model.pack_image_features(

View File

@ -34,7 +34,7 @@ from ...modeling_outputs import (
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
@ -258,8 +258,6 @@ except Exception:
logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5
class LongT5DenseActDense(nn.Module):

View File

@ -248,6 +248,12 @@ class Mistral3Model(Mistral3PreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -407,10 +413,10 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin)
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -1641,6 +1641,12 @@ class MllamaModel(MllamaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
@can_return_tuple
@auto_docstring
def forward(
@ -1792,10 +1798,10 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
# Make modules available through the conditional class for BC
@property

View File

@ -154,7 +154,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
super().__init__(dim=dim, base=base, device=device, interleaved=False)
self.max_seqlen = max_seqlen
if max_seqlen is not None and device is not None and dtype is not None:

View File

@ -417,7 +417,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
super().__init__(dim=dim, base=base, device=device, interleaved=False)
self.max_seqlen = max_seqlen
if max_seqlen is not None and device is not None and dtype is not None:

View File

@ -31,7 +31,6 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput, Seq2SeqLMOutput
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
from ..auto.modeling_auto import AutoModel
from .configuration_moshi import MoshiConfig, MoshiDepthConfig
@ -234,9 +233,6 @@ class MoshiRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
ALL_LAYERNORM_LAYERS.append(MoshiRMSNorm)
class MoshiFlexibleLinear(nn.Module):
def __init__(self, input_size, output_size, num_layers):
super().__init__()

View File

@ -37,7 +37,6 @@ from ...modeling_outputs import (
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_nemotron import NemotronConfig
@ -85,9 +84,6 @@ class NemotronLayerNorm1P(nn.LayerNorm):
return F.layer_norm(*args)
ALL_LAYERNORM_LAYERS.append(NemotronLayerNorm1P)
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
class NemotronRotaryEmbedding(nn.Module):
# Ignore copy

View File

@ -5,7 +5,6 @@ import torch.nn as nn
from ...cache_utils import Cache
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import logging
from ..llama.modeling_llama import LlamaPreTrainedModel, LlamaRMSNorm, eager_attention_forward
from ..olmo.configuration_olmo import OlmoConfig
@ -176,9 +175,6 @@ class Olmo2RMSNorm(LlamaRMSNorm):
return (self.weight * hidden_states).to(input_dtype)
ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm)
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]

View File

@ -27,7 +27,6 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from .configuration_olmoe import OlmoeConfig
@ -142,9 +141,6 @@ class OlmoeRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(OlmoeRMSNorm)
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmoe
class OlmoeRotaryEmbedding(nn.Module):
def __init__(self, config: OlmoeConfig, device=None):

View File

@ -173,6 +173,12 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def _update_causal_mask(
self,
attention_mask,
@ -418,10 +424,10 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(self, pixel_values):
return self.model.get_image_features(pixel_values)

View File

@ -33,7 +33,6 @@ from ...modeling_outputs import (
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
@ -96,8 +95,6 @@ except Exception:
logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(Pix2StructLayerNorm)
class Pix2StructVisionEmbeddings(nn.Module):
r"""

View File

@ -30,7 +30,7 @@ from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torch_fx_proxy, is_torchdynamo_compiling, logging
from .configuration_pop2piano import Pop2PianoConfig
@ -88,8 +88,6 @@ class Pop2PianoLayerNorm(nn.Module):
if not _load_pop2piano_layer_norm:
Pop2PianoLayerNorm = FusedRMSNorm # noqa
ALL_LAYERNORM_LAYERS.append(Pop2PianoLayerNorm)
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Pop2Piano,t5->pop2piano
class Pop2PianoDenseActDense(nn.Module):

View File

@ -1847,6 +1847,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
def set_input_embeddings(self, value):
self.model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def get_video_features(
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
):

View File

@ -2269,6 +2269,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
def set_input_embeddings(self, value):
self.model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def get_video_features(
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
):

View File

@ -1067,6 +1067,12 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_rope_index(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1498,10 +1504,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_video_features(
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None

View File

@ -1033,6 +1033,12 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_rope_index(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1382,10 +1388,10 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_video_features(
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None

View File

@ -27,7 +27,6 @@ from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_torchdynamo_compiling
from .configuration_recurrent_gemma import RecurrentGemmaConfig
@ -58,9 +57,6 @@ class RecurrentGemmaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
ALL_LAYERNORM_LAYERS.append(RecurrentGemmaRMSNorm)
class RecurrentGemmaRotaryEmbedding(nn.Module):
def __init__(self, dim, base=10000, device=None):
super().__init__()

View File

@ -34,7 +34,7 @@ from ...modeling_outputs import (
Seq2SeqMoEOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
@ -240,9 +240,6 @@ class SwitchTransformersLayerNorm(nn.Module):
return self.weight * hidden_states
ALL_LAYERNORM_LAYERS.append(SwitchTransformersLayerNorm)
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->SwitchTransformers
class SwitchTransformersDenseActDense(nn.Module):
def __init__(self, config: SwitchTransformersConfig):

View File

@ -38,7 +38,7 @@ from ...modeling_outputs import (
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
@ -273,8 +273,6 @@ except Exception:
logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
pass
ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
class T5DenseActDense(nn.Module):
def __init__(self, config: T5Config):

View File

@ -15,7 +15,7 @@
"""Configuration for TimmWrapper models"""
from typing import Any
from typing import Any, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import is_timm_available, logging, requires_backends
@ -45,6 +45,9 @@ class TimmWrapperConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
do_pooling (`bool`, *optional*, defaults to `True`):
Whether to do pooling for the last_hidden_state in `TimmWrapperModel` or not.
model_args (`dict[str, Any]`, *optional*):
Additional keyword arguments to pass to the `timm.create_model` function, e.g. `model_args={"depth": 3}`
for `timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k` to create a model with 3 blocks. Defaults to `None`.
Example:
```python
@ -60,9 +63,16 @@ class TimmWrapperConfig(PretrainedConfig):
model_type = "timm_wrapper"
def __init__(self, initializer_range: float = 0.02, do_pooling: bool = True, **kwargs):
def __init__(
self,
initializer_range: float = 0.02,
do_pooling: bool = True,
model_args: Optional[dict[str, Any]] = None,
**kwargs,
):
self.initializer_range = initializer_range
self.do_pooling = do_pooling
self.model_args = model_args # named "model_args" for BC with timm
super().__init__(**kwargs)
@classmethod
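A minimal usage sketch of the new `model_args` option, mirroring the test added further below (it assumes `timm` is installed and the checkpoint is reachable):

```python
from transformers import TimmWrapperConfig, TimmWrapperModel

# extra kwargs are forwarded verbatim to timm.create_model(...)
config = TimmWrapperConfig.from_pretrained(
    "timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k",
    model_args={"depth": 3},
)
model = TimmWrapperModel(config)
assert len(model.timm_model.blocks) == 3  # only 3 transformer blocks are created
```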

View File

@ -116,7 +116,8 @@ class TimmWrapperModel(TimmWrapperPreTrainedModel):
def __init__(self, config: TimmWrapperConfig):
super().__init__(config)
# using num_classes=0 to avoid creating classification head
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=0)
extra_init_kwargs = config.model_args or {}
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=0, **extra_init_kwargs)
self.post_init()
@auto_docstring
@ -233,7 +234,10 @@ class TimmWrapperForImageClassification(TimmWrapperPreTrainedModel):
"or use `TimmWrapperModel` for feature extraction."
)
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=config.num_labels)
extra_init_kwargs = config.model_args or {}
self.timm_model = timm.create_model(
config.architecture, pretrained=False, num_classes=config.num_labels, **extra_init_kwargs
)
self.num_labels = config.num_labels
self.post_init()

View File

@ -202,6 +202,12 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self,
pixel_values_images: torch.FloatTensor,
@ -444,10 +450,10 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self,

View File

@ -182,6 +182,12 @@ class VipLlavaModel(VipLlavaPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def set_decoder(self, decoder):
self.language_model = decoder
def get_decoder(self):
return self.language_model
def get_image_features(
self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
):
@ -327,10 +333,10 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
self.model.set_decoder(decoder)
def get_decoder(self):
return self.model
return self.model.get_decoder()
def get_image_features(
self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None

View File

@ -35,7 +35,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig
@ -81,9 +80,6 @@ class ZambaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
ALL_LAYERNORM_LAYERS.append(ZambaRMSNorm)
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""

View File

@ -82,7 +82,9 @@ class AwqQuantizer(HfQuantizer):
"your model on a GPU device in order to run your model."
)
elif device_map is not None:
if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
if isinstance(device_map, dict) and any(
forbidden in device_map.values() for forbidden in ("cpu", torch.device("cpu"), "disk")
):
raise ValueError(
"You are attempting to load an AWQ model with a device_map that contains a CPU or disk device."
" This is not supported. Please remove the CPU or disk device from the device_map."

View File

@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
if device not in dispatch_table:
if not callable(dispatch_table["default"]):
return dispatch_table["default"]
return dispatch_table["default"](*args, **kwargs)
fn = dispatch_table[device]
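A simplified restatement of the fixed dispatch logic with a toy dispatch table (not the actual helper):

```python
dispatch_table = {
    "cuda": lambda: 8,  # callable entries are invoked
    "default": 1,       # non-callable defaults are now returned as-is
}

def dispatch(device):
    if device not in dispatch_table:
        default = dispatch_table["default"]
        return default if not callable(default) else default()
    return dispatch_table[device]()

assert dispatch("cuda") == 8
assert dispatch("hpu") == 1  # previously this would try to call the int and crash
```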

View File

@ -73,7 +73,6 @@ from .models.auto.modeling_auto import (
from .optimization import Adafactor, get_scheduler
from .processing_utils import ProcessorMixin
from .pytorch_utils import (
ALL_LAYERNORM_LAYERS,
is_torch_greater_or_equal_than_2_3,
)
from .tokenization_utils_base import PreTrainedTokenizerBase
@ -1186,9 +1185,10 @@ class Trainer:
This function filters out parameters in two ways:
1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
2. By parameter name patterns (containing 'bias', 'layernorm', or 'rmsnorm')
2. By parameter name patterns (containing 'bias' or a variation of 'norm')
"""
decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"])
forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
decay_parameters = get_parameter_names(model, [nn.LayerNorm], forbidden_name_patterns)
return decay_parameters
def create_optimizer(self):
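A small sketch of how the new name-based filter behaves; the parameter names below are invented for illustration:

```python
import re

forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
patterns = [re.compile(p) for p in forbidden_name_patterns]

names = [
    "model.layers.0.self_attn.q_proj.weight",  # gets weight decay
    "model.layers.0.self_attn.q_proj.bias",    # excluded: 'bias'
    "model.layers.0.input_layernorm.weight",   # excluded: 'layernorm'
    "model.norm.weight",                       # excluded: '.norm.' boundary match
]
decayed = [n for n in names if not any(p.search(n.lower()) for p in patterns)]
print(decayed)  # ['model.layers.0.self_attn.q_proj.weight']
```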

View File

@ -21,6 +21,7 @@ import io
import json
import math
import os
import re
import sys
import warnings
from collections.abc import Iterator, Mapping
@ -1124,8 +1125,9 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
"""
Returns the names of the model parameters that are not inside a forbidden layer.
"""
if forbidden_layer_names is None:
forbidden_layer_names = []
forbidden_layer_patterns = (
[re.compile(pattern) for pattern in forbidden_layer_names] if forbidden_layer_names is not None else []
)
result = []
for name, child in model.named_children():
child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names)
@ -1133,12 +1135,15 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
f"{name}.{n}"
for n in child_params
if not isinstance(child, tuple(forbidden_layer_types))
and not any(forbidden in f"{name}.{n}".lower() for forbidden in forbidden_layer_names)
and not any(pattern.search(f"{name}.{n}".lower()) for pattern in forbidden_layer_patterns)
]
# Add model specific parameters that are not in any child
result += [
k for k in model._parameters.keys() if not any(forbidden in k.lower() for forbidden in forbidden_layer_names)
k
for k in model._parameters.keys()
if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
]
return result

View File

@ -815,8 +815,8 @@ def is_torch_hpu_available():
):
return False
torch_hpu_min_version = "1.5.0"
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
torch_hpu_min_accelerate_version = "1.5.0"
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
return False
import torch
@ -850,6 +850,24 @@ def is_torch_hpu_available():
torch.Tensor.masked_fill_ = patched_masked_fill_
# IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
# https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
# This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
# https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
original_compile = torch.compile
def hpu_backend_compile(*args, **kwargs):
if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
logger.warning(
f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
"We will override the backend with 'hpu_backend' to avoid errors."
)
kwargs["backend"] = "hpu_backend"
return original_compile(*args, **kwargs)
torch.compile = hpu_backend_compile
return True
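A rough illustration of the patched behavior; it only differs from stock PyTorch when running on a Gaudi (HPU) machine:

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)

compiled_default = torch.compile(model)                 # on HPU: backend is overridden to "hpu_backend" (with a warning)
compiled_eager = torch.compile(model, backend="eager")  # "eager" (and "hpu_backend") are left untouched
```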

View File

@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@require_torch_multi_accelerator
@run_first
def test_basic_distributed(self, stage, dtype):
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
@require_torch_fp16
@run_first
def test_do_eval_no_train(self):
# testing only zero3 since zero2 makes no sense with inference
self.run_and_check(
@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
)
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_fp32_non_distributed(self, stage, dtype):
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
# therefore no quality checks, just basic completion checks are done
@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@require_torch_multi_accelerator
@run_first
def test_fp32_distributed(self, stage, dtype):
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
# therefore no quality checks, just basic completion checks are done
@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
)
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
# the saved model dir
@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
@parameterized.expand(["bf16", "fp16", "fp32"])
@require_torch_multi_accelerator
@run_first
def test_inference(self, dtype):
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
self.skipTest(reason="test requires bfloat16 hardware support")
@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
return output_dir
@parameterized.expand(params, name_func=parameterized_custom_name_func)
@run_first
def test_clm(self, stage, dtype):
# this test exercises model.resize_token_embeddings() which requires param gathering outside
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
execute_subprocess_async(cmd, env=self.get_env())
@require_torch_fp16
@run_first
def test_clm_from_config_zero3_fp16(self):
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called

View File

@ -28,6 +28,7 @@ from transformers.testing_utils import (
get_tests_dir,
require_deepspeed,
require_torch_accelerator,
run_first,
slow,
torch_device,
)
@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
@slow
@run_first
@require_deepspeed
@require_torch_accelerator
class TestDeepSpeedModelZoo(TestCasePlus):

View File

@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
raise AssertionError("CPU offloading failed with FSDP!")
@require_torch_multi_accelerator
@run_first
@slow
@require_fsdp_v2_version
@require_accelerate_fsdp2
@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
@require_torch_multi_accelerator
@run_first
@slow
@require_fsdp
@require_fsdp_v2_version

View File

@ -157,6 +157,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
self.assertEqual(image_processor.do_reduce_labels, True)
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
@ -264,6 +265,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
@ -280,6 +282,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")

View File

@ -475,8 +475,19 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
else:
# See PR #38607 (to avoid flakiness)
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
]
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
((data_to_check.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
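The same trimmed-mean trick as a standalone sketch; the helper name is ours and not part of the test suite:

```python
import torch

def trimmed_mean(param: torch.Tensor, trim_fraction: float = 0.025) -> float:
    # Drop the extreme values on each side before averaging, so outliers produced by
    # nn.init.trunc_normal_ cannot push the mean away from the expected 0.0 or 1.0.
    data = torch.sort(torch.flatten(param)).values
    n_skip = int(data.numel() * trim_fraction)
    if n_skip > 0:
        data = data[n_skip:-n_skip]
    return ((data.mean() * 1e9).round() / 1e9).item()

assert trimmed_mean(torch.zeros(1000)) == 0.0
```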

View File

@ -311,8 +311,19 @@ class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
]
if param.requires_grad:
if any(x in name for x in non_uniform_init_parms):
# See PR #38607 (to avoid flakiness)
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
]
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
((data_to_check.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

View File

@ -252,8 +252,17 @@ class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unitte
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad and "register_tokens" not in name:
# See PR #38607 (to avoid flakiness)
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
((data_to_check.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

View File

@ -187,6 +187,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])
@unittest.skip("temporary to avoid failing on circleci")
# Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_call_segmentation_maps
def test_call_segmentation_maps(self):
for image_processing_class in self.image_processor_list:
@ -295,6 +296,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
@ -317,6 +319,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Compare with non-reduced label to see if it's reduced by 1
self.assertEqual(encoding["labels"][first_non_zero_coords].item(), first_non_zero_value - 1)
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
@ -338,6 +341,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
)
self.assertTrue(torch.allclose(image_encoding_slow.labels, image_encoding_fast.labels, atol=1e-1))
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")

View File

@ -103,6 +103,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@unittest.skip("temporary to avoid failing on circleci")
def test_LayoutLMv3_integration_test(self):
from datasets import load_dataset

View File

@ -135,6 +135,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)

View File

@ -136,6 +136,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@unittest.skip("temporary to avoid failing on circleci")
def test_expected_output(self):
dummy_image = self.image_processor_tester.prepare_dummy_image()
image_processor = self.image_processor
@ -185,6 +186,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image = Image.open(filepath).convert("RGB")
return np.array(image)
@unittest.skip("temporary to avoid failing on circleci")
def test_crop_margin_equality_cv2_python(self):
image = self.prepare_dummy_np_image()
image_processor = self.image_processor

View File

@ -544,8 +544,19 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
else:
# See PR #38607 (to avoid flakiness)
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
]
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
((data_to_check.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

View File

@ -138,6 +138,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
self.assertEqual(image_processor.do_reduce_labels, True)
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
@ -244,6 +245,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)

View File

@ -249,8 +249,17 @@ class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
if "logit_scale" in name:
continue
if param.requires_grad:
# See PR #38607 (to avoid flakiness)
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
((data_to_check.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

View File

@ -237,6 +237,24 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
self.assertEqual(config.id2label, restored_config.id2label)
self.assertEqual(config.label2id, restored_config.label2id)
def test_model_init_args(self):
# test init from config
config = TimmWrapperConfig.from_pretrained(
"timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k",
model_args={"depth": 3},
)
model = TimmWrapperModel(config)
self.assertEqual(len(model.timm_model.blocks), 3)
cls_model = TimmWrapperForImageClassification(config)
self.assertEqual(len(cls_model.timm_model.blocks), 3)
# test save load
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
restored_model = TimmWrapperModel.from_pretrained(tmpdirname)
self.assertEqual(len(restored_model.timm_model.blocks), 3)
# We will verify our results on an image of cute cats
def prepare_img():

View File

@ -84,6 +84,7 @@ from transformers.testing_utils import (
require_bitsandbytes,
require_deepspeed,
require_flash_attn,
require_non_hpu,
require_safetensors,
require_torch,
require_torch_accelerator,
@ -92,6 +93,7 @@ from transformers.testing_utils import (
require_torch_multi_accelerator,
require_torch_multi_gpu,
require_torch_sdpa,
run_first,
run_test_using_subprocess,
set_config_for_less_flaky_test,
set_model_for_less_flaky_test,
@ -2797,6 +2799,7 @@ class ModelTesterMixin:
else:
torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
@require_non_hpu
@require_accelerate
@mark.accelerate_tests
@require_torch_multi_accelerator
@ -3727,6 +3730,9 @@ class ModelTesterMixin:
if torch_device in ["cpu", "cuda"]:
atol = atols[torch_device, enable_kernels, torch_dtype]
rtol = rtols[torch_device, enable_kernels, torch_dtype]
elif torch_device == "hpu":
atol = atols["cuda", enable_kernels, torch_dtype]
rtol = rtols["cuda", enable_kernels, torch_dtype]
elif torch_device == "xpu":
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
# which is implemented on PyTorch level using aten operators and is
@ -3795,6 +3801,10 @@ class ModelTesterMixin:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
if config.model_type in ["modernbert"]:
self.skipTest(
reason="ModernBert currently (transformers==4.52.0) automatically adds an attention_mask input"
)
if config.model_type in ["idefics", "idefics2", "idefics3"]:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
if config.model_type in ["sam"]:
@ -4662,6 +4672,7 @@ class ModelTesterMixin:
# Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
# may bring unwanted consequences on other tests. See PR #37553
@run_first
@run_test_using_subprocess
@require_torch_accelerator
def test_can_load_with_global_device_set(self):

View File

@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# the test slower.
@require_torch_non_multi_accelerator
@run_test_using_subprocess
@run_first
@slow
def test_can_resume_training_lm(self):
# Check if it works for a simple language modeling example
@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
)
@slow
@run_first
def test_trainer_eval_mrpc(self):
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertLess(result["eval_loss"], 0.2)
@slow
@run_first
def test_trainer_eval_multiple(self):
MODEL_ID = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
@slow
@run_first
@require_non_hpu
@require_torch_multi_accelerator
def test_end_to_end_example(self):

View File

@ -22,6 +22,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
from transformers.training_args import ParallelMode
@ -116,6 +117,7 @@ if is_torch_available():
class TestTrainerDistributed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@ -199,8 +201,7 @@ if __name__ == "__main__":
model = RegressionModel()
training_args.per_device_train_batch_size = 1
training_args.max_steps = 1
training_args.accelerator_config = {
"dispatch_batches": False,
}
training_args.accelerator_config.dispatch_batches = False
trainer = Trainer(model, training_args, train_dataset=train_dataset)
trainer.train()

View File

@ -18,11 +18,13 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
class TestTrainerDistributedLoss(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)

View File

@ -18,6 +18,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
@ -57,6 +58,7 @@ class DummyModel(nn.Module):
class TestTrainerDistributedWorkerSeed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)

View File

@ -58,6 +58,7 @@ from transformers.testing_utils import (
is_staging_test,
require_accelerate,
require_flax,
require_non_hpu,
require_read_token,
require_safetensors,
require_tf,
@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
self.assertIsNotNone(model)
@require_non_hpu
@require_accelerate
@mark.accelerate_tests
@require_torch_multi_accelerator

utils/get_runner_map.py (new file, 65 lines)
View File

@ -0,0 +1,65 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script is used to get a map containing the information of runners to use in GitHub Actions workflow files.
This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners.
The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
Currently, in that file, we specify the models whose tests should run on T4 runners, to avoid many test failures showing up in the CI reports.
We will keep working on the tests so that A10 runners can eventually be used for all CI jobs.
"""
import os
import requests
if __name__ == "__main__":
# T4
t4_runners = {
"single-gpu": "aws-g4dn-4xlarge-cache",
"multi-gpu": "aws-g4dn-12xlarge-cache",
}
# A10
a10_runners = {
"single-gpu": "aws-g5-4xlarge-cache",
"multi-gpu": "aws-g5-12xlarge-cache",
}
tests = os.getcwd()
model_tests = os.listdir(os.path.join(tests, "models"))
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
d1.remove("models")
d = d2 + d1
response = requests.get(
"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json"
)
# The models that we want to run with T4 runners
jobs_using_t4 = response.json()
runner_map = {}
for key in d:
modified_key = key
if modified_key.startswith("models/"):
modified_key = key[len("models/") :]
if modified_key in jobs_using_t4:
runner_map[key] = t4_runners
else:
runner_map[key] = a10_runners
print(runner_map)
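For reference, the printed mapping looks roughly like this; the folder names are examples and the actual T4/A10 split depends on the contents of `runner_map.json`:

```python
{
    "models/bert": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
    "models/vit": {"single-gpu": "aws-g4dn-4xlarge-cache", "multi-gpu": "aws-g4dn-12xlarge-cache"},
    "generation": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
}
```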

View File

@ -21,7 +21,7 @@ import os
import sys
import transformers
from transformers import is_torch_xpu_available
from transformers import is_torch_hpu_available, is_torch_xpu_available
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@ -38,6 +38,9 @@ try:
accelerator = "CUDA"
elif is_torch_xpu_available():
accelerator = "XPU"
elif is_torch_hpu_available():
accelerator = "HPU"
print("Torch accelerator:", accelerator)
if accelerator == "CUDA":
@ -48,6 +51,9 @@ try:
elif accelerator == "XPU":
print("SYCL version:", torch.version.xpu)
print("Number of XPUs available:", torch.xpu.device_count())
elif accelerator == "HPU":
print("HPU version:", torch.__version__.split("+")[-1])
print("Number of HPUs available:", torch.hpu.device_count())
except ImportError:
print("Torch version:", None)