Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-03 12:50:06 +06:00

Commit 3e2d38efab: Merge branch 'main' into device-map-tp-plan

Some checks failed: Secret Leaks / trufflehog (push) has been cancelled.

6	.github/workflows/model_jobs.yml (vendored)
@@ -12,8 +12,8 @@ on:
      slice_id:
        required: true
        type: number
      runner:
        required: true
      runner_map:
        required: false
        type: string
      docker:
        required: true
@@ -45,7 +45,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
      runs-on:
        group: '${{ inputs.machine_type }}'
        group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
      container:
        image: ${{ inputs.docker }}
        options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
128	.github/workflows/model_jobs_amd.yml (vendored)
@ -1,128 +0,0 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes
|
||||
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
|
||||
# This token is created under the bot `hf-transformers-bot`.
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install -U datasets
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
|
||||
- name: ROCM-INFO
|
||||
run: |
|
||||
rocminfo | grep "Agent" -A 14
|
||||
|
||||
- name: Show ROCR environment
|
||||
run: |
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
121	.github/workflows/model_jobs_intel_gaudi.yml (vendored, new file)
@ -0,0 +1,121 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
report_name_prefix:
|
||||
required: false
|
||||
default: run_models_gpu
|
||||
type: string
|
||||
|
||||
env:
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 8
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on:
|
||||
group: ${{ inputs.runner }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ inputs.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all tests on Gaudi
|
||||
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
8	.github/workflows/self-scheduled-caller.yml (vendored)

@@ -22,7 +22,7 @@ on:
        default: ""


# Used for `push` to easily modiffy the target workflow runs to compare against
# Used for `push` to easily modify the target workflow runs to compare against
env:
  prev_workflow_run_id: ""
  other_workflow_run_id: ""
@@ -51,7 +51,6 @@ jobs:
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-models"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@@ -63,7 +62,6 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@@ -75,7 +73,6 @@ jobs:
    with:
      job: run_examples_gpu
      slack_report_channel: "#transformers-ci-daily-examples"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@@ -87,7 +84,6 @@ jobs:
    with:
      job: run_trainer_and_fsdp_gpu
      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
@@ -99,7 +95,6 @@ jobs:
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
@@ -112,7 +107,6 @@ jobs:
    with:
      job: run_quantization_torch_gpu
      slack_report_channel: "#transformers-ci-daily-quantization"
      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
345	.github/workflows/self-scheduled-intel-gaudi.yml (vendored, new file)
@ -0,0 +1,345 @@
|
||||
name: Self-hosted runner (scheduled-intel-gaudi)
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
job:
|
||||
required: true
|
||||
type: string
|
||||
slack_report_channel:
|
||||
required: true
|
||||
type: string
|
||||
runner_scale_set:
|
||||
required: true
|
||||
type: string
|
||||
ci_event:
|
||||
required: true
|
||||
type: string
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
NUM_SLICES: 2
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- id: set-matrix
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Identify models to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- id: set-matrix-quantization
|
||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
||||
name: Identify quantization method to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
run_models_gpu:
|
||||
if: ${{ inputs.job == 'run_models_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_models_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_pipelines_gpu:
|
||||
if: ${{ inputs.job == 'run_pipelines_gpu' }}
|
||||
name: Pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all pipeline tests on Intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
|
||||
run_examples_gpu:
|
||||
if: ${{ inputs.job == 'run_examples_gpu' }}
|
||||
name: Examples directory
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run examples tests on Intel Gaudi
|
||||
run: |
|
||||
pip install -r examples/pytorch/_tests_requirements.txt
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
|
||||
run_deepspeed_gpu:
|
||||
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
|
||||
name: Intel Gaudi deepspeed tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all deepspeed tests on intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
|
||||
send_results:
|
||||
name: Slack Report
|
||||
needs:
|
||||
[
|
||||
setup,
|
||||
run_models_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_gpu,
|
||||
run_deepspeed_gpu,
|
||||
run_trainer_and_fsdp_gpu,
|
||||
]
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/workflows/slack-report.yml
|
||||
with:
|
||||
job: ${{ inputs.job }}
|
||||
setup_status: ${{ needs.setup.result }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
|
||||
secrets: inherit
|
67	.github/workflows/self-scheduled-intel-gaudi3-caller.yml (vendored, new file)
@ -0,0 +1,67 @@
|
||||
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
|
||||
jobs:
|
||||
model-ci:
|
||||
name: Model CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
pipeline-ci:
|
||||
name: Pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_pipelines_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_deepspeed_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
name: Trainer/FSDP CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
secrets: inherit
|
10	.github/workflows/self-scheduled.yml (vendored)

@@ -15,9 +15,6 @@ on:
      slack_report_channel:
        required: true
        type: string
      runner:
        required: true
        type: string
      docker:
        required: true
        type: string
@@ -62,6 +59,7 @@ jobs:
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
      runner_map: ${{ steps.set-matrix.outputs.runner_map }}
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Update clone
@@ -88,6 +86,7 @@ jobs:
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
            echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
@@ -111,14 +110,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
    secrets: inherit

@@ -136,7 +135,6 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
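The setup job above publishes three outputs that the model jobs consume: `folder_slices` (lists of test folders, one list per slice), `slice_ids` (indexes into those lists), and the new `runner_map` (which runner group serves each folder and machine type, looked up in model_jobs.yml as `fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type]`). The sketch below only illustrates the shapes involved; the folder names and runner-group labels are invented and are not the output of the real `utils/split_model_tests.py` or `utils/get_runner_map.py`.

```python
# Hypothetical shapes of the three setup outputs consumed by model_jobs.yml.
# Folder names and runner-group labels are invented for illustration.
import json

num_slices = 2
folders = ["models/bert", "models/gpt2", "models/llama", "models/t5"]

# folder_slices: one list of test folders per slice_id
folder_slices = [folders[i::num_slices] for i in range(num_slices)]
slice_ids = list(range(num_slices))

# runner_map: for each test folder, which runner group serves each machine type
runner_map = {
    folder: {"single-gpu": "aws-g4dn-4xlarge-cache", "multi-gpu": "aws-g4dn-12xlarge-cache"}
    for folder in folders
}

print(json.dumps(folder_slices))                # what `folder_slices=...` would carry
print(json.dumps(slice_ids))                    # what `slice_ids=...` would carry
print(runner_map["models/bert"]["single-gpu"])  # fromJson(runner_map)[matrix.folders][machine_type]
```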
@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

ARG TORCH_VISION='0.21.0'
ARG TORCH_AUDIO='2.6.0'

RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
    apt clean && \
@@ -20,6 +23,7 @@ WORKDIR /
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

RUN python3 -m pip uninstall -y tensorflow flax
93	docker/transformers-pytorch-xpu/Dockerfile (new file)
@ -0,0 +1,93 @@
|
||||
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG PYTHON_VER=3.11
|
||||
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get remove -y python3.10 && apt-get autoremove -y
|
||||
RUN apt-get update && \
|
||||
apt-get install -y software-properties-common && \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
|
||||
ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
|
||||
ln -sf /usr/bin/python3 /usr/bin/python && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get -y install \
|
||||
apt-utils \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
clinfo \
|
||||
curl \
|
||||
git \
|
||||
git-lfs \
|
||||
vim \
|
||||
numactl \
|
||||
gnupg2 \
|
||||
gpg-agent \
|
||||
zlib1g-dev \
|
||||
rsync \
|
||||
sudo \
|
||||
libnl-genl-3-200 \
|
||||
xpu-smi \
|
||||
unzip \
|
||||
ffmpeg \
|
||||
tesseract-ocr \
|
||||
espeak-ng \
|
||||
wget \
|
||||
ncurses-term && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
linux-headers-$(uname -r) \
|
||||
linux-modules-extra-$(uname -r) \
|
||||
flex bison \
|
||||
intel-fw-gpu intel-i915-dkms xpu-smi \
|
||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
||||
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
|
||||
libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install triton==3.3.0
|
||||
|
||||
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
|
||||
|
||||
RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
|
||||
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
|
||||
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
|
||||
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
|
||||
|
||||
RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
|
||||
|
||||
# install bitsandbytes
|
||||
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
|
||||
|
||||
ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
|
||||
ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
|
||||
ENV CCL_ROOT=/usr/local
|
||||
ENV CCL_ATL_TRANSPORT=ofi
|
||||
ENV I_MPI_ROOT=/usr/local
|
||||
ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
|
||||
ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
|
||||
ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN touch /entrypoint.sh
|
||||
RUN chmod +x /entrypoint.sh
|
||||
RUN echo "#!/bin/bash" >> /entrypoint.sh
|
||||
RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
@@ -468,9 +468,17 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
Follow the recommended practices below to ensure your custom decoding method works as expected.
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
- You can add other files in the `custom_generate` folder, and use relative imports.
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.

Your custom `generate` method can import code from the `custom_generate` folder with relative imports. For example, if you have a `utils.py` file, you can import it like this:

```py
from .utils import some_function
```

Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom decoding method.


#### requirements.txt

You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
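As a concrete illustration of the local workflow described above, the sketch below assumes a local folder `my_decoding/` containing a `custom_generate/` directory with `generate.py`, `utils.py`, and `requirements.txt`; the folder name, checkpoint, and file layout are illustrative assumptions rather than part of the original docs.

```py
# Hypothetical local layout (not from the original docs):
#   my_decoding/
#   └── custom_generate/
#       ├── generate.py        # defines generate(model, input_ids, ...)
#       ├── utils.py           # imported via `from .utils import some_function`
#       └── requirements.txt   # extra pip requirements, checked at runtime
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

inputs = tokenizer("Hello", return_tensors="pt")
# `custom_generate` also accepts a local directory that contains a `custom_generate/` folder.
outputs = model.generate(**inputs, custom_generate="./my_decoding", trust_remote_code=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```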
@@ -14,35 +14,76 @@ rendered properly in your Markdown viewer.

-->

# BLIP

<div class="flex flex-wrap space-x-1">
    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
    <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
    </div>
</div>

## Overview
# BLIP

The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/papers/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.

BLIP is a model that is able to perform various multi-modal tasks including:
- Visual Question Answering
- Image-Text retrieval (Image-text matching)
- Image Captioning

The abstract from the paper is the following:
You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.

*Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks.
However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. Code, models, and datasets are released.*
> [!TIP]
> This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
>
> Click on the BLIP models in the right sidebar for more examples of how to apply BLIP to different vision language tasks.



The example below demonstrates visual question answering with [`Pipeline`] or the [`AutoModel`] class.

This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
The original code can be found [here](https://github.com/salesforce/BLIP).
<hfoptions id="usage">
<hfoption id="Pipeline">

```python
import torch
from transformers import pipeline

pipeline = pipeline(
    task="visual-question-answering",
    model="Salesforce/blip-vqa-base",
    torch_dtype=torch.float16,
    device=0
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
pipeline(question="What is the weather in this image?", image=url)
```

</hfoption>
<hfoption id="AutoModel">

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = AutoModelForVisualQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    torch_dtype=torch.float16,
    device_map="auto"
)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the weather in this image?"
inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)

output = model.generate(**inputs)
processor.batch_decode(output, skip_special_tokens=True)[0]
```

</hfoption>
</hfoptions>
|
||||
|
||||
- [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) on how to fine-tune BLIP for image captioning on a custom dataset
|
||||
Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) to learn how to fine-tune BLIP for image captioning on a custom dataset.
|
||||
|
||||
## BlipConfig
|
||||
|
||||
|
@@ -17,6 +17,7 @@ import json
import logging
import os
import sys
import unittest
from unittest.mock import patch

from transformers import ViTMAEForPreTraining, Wav2Vec2ForPreTraining
@@ -414,6 +415,7 @@ class ExamplesTests(TestCasePlus):
        result = get_results(tmp_dir)
        self.assertGreaterEqual(result["eval_accuracy"], 0.8)

    @unittest.skip("temporary to avoid failing on circleci")
    def test_run_speech_recognition_ctc(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@@ -445,6 +447,7 @@ class ExamplesTests(TestCasePlus):
        result = get_results(tmp_dir)
        self.assertLess(result["eval_loss"], result["train_loss"])

    @unittest.skip("temporary to avoid failing on circleci")
    def test_run_speech_recognition_ctc_adapter(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@@ -478,6 +481,7 @@ class ExamplesTests(TestCasePlus):
        self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "./adapter.tur.safetensors")))
        self.assertLess(result["eval_loss"], result["train_loss"])

    @unittest.skip("temporary to avoid failing on circleci")
    def test_run_speech_recognition_seq2seq(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
@@ -23,7 +23,7 @@ from tqdm import tqdm

from ...models.bert.tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import is_tf_available, is_torch_available, logging
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
from .utils import DataProcessor


@@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
            is_training=not evaluate,
        )
    ```"""
    # Defining helper methods
    features = []

    threads = min(threads, cpu_count())
    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
    if not is_torch_hpu_available():
        threads = min(threads, cpu_count())
        with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
            annotate_ = partial(
                squad_convert_example_to_features,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                padding_strategy=padding_strategy,
                is_training=is_training,
            )
            features = list(
                tqdm(
                    p.imap(annotate_, examples, chunksize=32),
                    total=len(examples),
                    desc="convert squad examples to features",
                    disable=not tqdm_enabled,
                )
            )
    else:
        # Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
        squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
        annotate_ = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
@@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
        )
        features = list(
            tqdm(
                p.imap(annotate_, examples, chunksize=32),
                map(annotate_, examples),
                total=len(examples),
                desc="convert squad examples to features",
                disable=not tqdm_enabled,
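The hunk above keeps the existing multiprocessing path but routes HPU machines through a plain sequential `map`, since the worker-pool path is skipped there (see the linked PR discussion). A minimal, hedged sketch of that pattern, with stand-in names rather than the real transformers helpers:

```python
# Minimal sketch of the pattern above: convert examples with a process pool unless the
# backend (here, HPU) should avoid fork-based multiprocessing, in which case fall back
# to a plain sequential map. `convert_one` and `is_hpu` are illustrative stand-ins,
# not transformers APIs.
from functools import partial
from multiprocessing import Pool, cpu_count

def convert_all(examples, convert_one, is_hpu=False, threads=4):
    annotate = partial(convert_one, max_seq_length=384)
    if not is_hpu:
        threads = min(threads, cpu_count())
        with Pool(threads) as pool:
            return list(pool.imap(annotate, examples, chunksize=32))
    # Sequential fallback: same work, no worker processes.
    return list(map(annotate, examples))
```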
@@ -402,10 +402,11 @@ def get_cached_module_file(
    if not (submodule_path / module_file).exists() or not filecmp.cmp(
        resolved_module_file, str(submodule_path / module_file)
    ):
        (submodule_path / module_file).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(resolved_module_file, submodule_path / module_file)
        importlib.invalidate_caches()
    for module_needed in modules_needed:
        module_needed = f"{module_needed}.py"
        module_needed = Path(module_file).parent / f"{module_needed}.py"
        module_needed_file = os.path.join(pretrained_model_name_or_path, module_needed)
        if not (submodule_path / module_needed).exists() or not filecmp.cmp(
            module_needed_file, str(submodule_path / module_needed)
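The change above alters how a dependency of a dynamically fetched module is located: the `.py` filename is now resolved relative to the directory of the referencing `module_file` instead of the repository root. A small illustrative sketch (the paths are made up):

```python
# Sketch of the path change above: resolve a dependency of a remote-code module relative
# to the referencing file's directory rather than the repo root. Values are made up.
from pathlib import Path

module_file = "subfolder/modeling_custom.py"
modules_needed = ["configuration_custom"]

for module_needed in modules_needed:
    # Old behaviour: f"{module_needed}.py" -> "configuration_custom.py" (repo root)
    # New behaviour: next to module_file   -> "subfolder/configuration_custom.py"
    resolved = Path(module_file).parent / f"{module_needed}.py"
    print(resolved)
```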
@@ -27,8 +27,6 @@ from ..utils import is_torch_greater_or_equal, logging
from ..utils.generic import GeneralInterface


ALL_LAYERNORM_LAYERS = [nn.LayerNorm]

logger = logging.get_logger(__name__)

# Cache this result has it's a C FFI call which can be pretty time-consuming
@@ -172,7 +172,8 @@ _is_quantized = False
_is_ds_init_called = False
_torch_distributed_available = torch.distributed.is_available()

if _torch_distributed_available and is_torch_greater_or_equal("2.5"):
_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
if _is_dtensor_available:
    from torch.distributed.tensor import DTensor


@@ -3780,7 +3781,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
        for shard_file, tensors in filename_to_tensors:
            shard = {}
            for tensor in tensors:
                if isinstance(state_dict[tensor], DTensor):
                if _is_dtensor_available and isinstance(state_dict[tensor], DTensor):
                    full_tensor = state_dict[tensor].full_tensor()
                    # to get the correctly ordered tensor we need to repack if packed
                    if _get_parameter_tp_plan(tensor, self._tp_plan) in ("local_packed_rowwise",):
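The pattern above caches DTensor availability in a module-level flag and reuses it when materializing sharded tensors for saving, so the `isinstance` check never references a missing name on older torch builds. A minimal sketch of the same idea, with an illustrative version check in place of transformers' internal helper:

```python
# Sketch of the availability-guard pattern above: only reference DTensor when the torch
# build provides it, and gather the full tensor before writing a checkpoint shard.
# The version check stands in for transformers' internal is_torch_greater_or_equal("2.5").
import torch
from packaging import version

_torch_distributed_available = torch.distributed.is_available()
_is_dtensor_available = _torch_distributed_available and version.parse(
    torch.__version__.split("+")[0]
) >= version.parse("2.5")
if _is_dtensor_available:
    from torch.distributed.tensor import DTensor

def materialize_for_save(value):
    # On builds without DTensor this condition short-circuits, so the DTensor name
    # is never evaluated when it was not imported.
    if _is_dtensor_available and isinstance(value, DTensor):
        return value.full_tensor()
    return value
```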
@ -1056,6 +1056,12 @@ class AriaModel(AriaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -1220,10 +1226,10 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -211,6 +211,12 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -389,10 +395,10 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -30,7 +30,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import (
|
||||
LossKwargs,
|
||||
auto_docstring,
|
||||
@ -72,9 +71,6 @@ class ChameleonRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm)
|
||||
|
||||
|
||||
# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon
|
||||
# TODO(joao): add me back asap :)
|
||||
class ChameleonRotaryEmbedding(nn.Module):
|
||||
|
@ -35,7 +35,6 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||
from ...modeling_rope_utils import dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import LossKwargs, logging
|
||||
from ..llama.modeling_llama import (
|
||||
LlamaAttention,
|
||||
@ -69,9 +68,6 @@ class CohereLayerNorm(nn.Module):
|
||||
return hidden_states.to(input_dtype)
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)
|
||||
|
||||
|
||||
class CohereRotaryEmbedding(LlamaRotaryEmbedding):
|
||||
@torch.no_grad()
|
||||
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
|
||||
|
@ -34,7 +34,6 @@ from ....modeling_outputs import (
|
||||
TokenClassifierOutput,
|
||||
)
|
||||
from ....modeling_utils import PreTrainedModel
|
||||
from ....pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ....utils import (
|
||||
add_code_sample_docstrings,
|
||||
add_start_docstrings,
|
||||
@ -311,10 +310,6 @@ class MegaSequenceNorm(nn.Module):
|
||||
return self.norm(input)
|
||||
|
||||
|
||||
# add this layernorm class to ALL_LAYERNORM_LAYERS
|
||||
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)
|
||||
|
||||
|
||||
class MegaMultiDimensionDampedEma(nn.Module):
|
||||
"""
|
||||
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
|
||||
|
@ -1451,6 +1451,12 @@ class Emu3Model(Emu3PreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.text_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.text_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.text_model
|
||||
|
||||
def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
|
||||
"""
|
||||
Tokenizes images into discrete tokens with VQGAN module. Converts
|
||||
@ -1599,10 +1605,10 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
# Make modules available throught conditional class for BC
|
||||
@property
|
||||
|
@ -938,6 +938,12 @@ class Emu3Model(Emu3PreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.text_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.text_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.text_model
|
||||
|
||||
def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor):
|
||||
"""
|
||||
Tokenizes images into discrete tokens with VQGAN module. Converts
|
||||
@ -1086,10 +1092,10 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
# Make modules available throught conditional class for BC
|
||||
@property
|
||||
|
@ -86,6 +86,12 @@ class FuyuModel(FuyuPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def gather_continuous_embeddings(
|
||||
self,
|
||||
word_embeddings: torch.Tensor,
|
||||
|
@ -829,6 +829,12 @@ class Gemma3Model(Gemma3PreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Projects the last hidden state from the vision model into language model space.
|
||||
@ -1014,10 +1020,10 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(self, pixel_values):
|
||||
return self.model.get_image_features(pixel_values)
|
||||
|
@ -637,6 +637,12 @@ class GotOcr2Model(GotOcr2PreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -757,10 +763,10 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -27,7 +27,6 @@ from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
|
||||
from .configuration_granitemoe import GraniteMoeConfig
|
||||
|
||||
@ -145,9 +144,6 @@ class GraniteMoeRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(GraniteMoeRMSNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
|
||||
class GraniteMoeRotaryEmbedding(nn.Module):
|
||||
def __init__(self, config: GraniteMoeConfig, device=None):
|
||||
|
@ -35,7 +35,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_outputs import ModelOutput
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PretrainedConfig, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
|
||||
from .configuration_idefics import IdeficsConfig
|
||||
from .perceiver import IdeficsPerceiverResampler
|
||||
@ -386,9 +385,6 @@ class IdeficsRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)
|
||||
|
||||
|
||||
# this was adapted from LlamaRotaryEmbedding
|
||||
class IdeficsEmbedding(torch.nn.Module):
|
||||
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
|
||||
|
@ -627,6 +627,12 @@ class InternVLModel(InternVLPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -878,10 +884,10 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -40,7 +40,6 @@ from ...modeling_outputs import (
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
|
||||
from .configuration_llama import LlamaConfig
|
||||
|
||||
@ -69,9 +68,6 @@ class LlamaRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
|
||||
|
||||
|
||||
class LlamaRotaryEmbedding(nn.Module):
|
||||
def __init__(self, config: LlamaConfig, device=None):
|
||||
super().__init__()
|
||||
|
@ -181,6 +181,12 @@ class LlavaModel(LlavaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -371,10 +377,10 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -294,6 +294,12 @@ class LlavaNextModel(LlavaNextPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
"""
|
||||
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
|
||||
@ -569,10 +575,10 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
return self.model.pack_image_features(
|
||||
|
@ -348,6 +348,12 @@ class LlavaNextVideoModel(LlavaNextVideoPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
"""
|
||||
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
|
||||
@ -701,10 +707,10 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
return self.model.pack_image_features(
|
||||
|
@ -350,6 +350,12 @@ class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
|
||||
"""
|
||||
Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
|
||||
@ -742,10 +748,10 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
|
||||
return self.model.pack_image_features(
|
||||
|
@ -34,7 +34,7 @@ from ...modeling_outputs import (
|
||||
Seq2SeqModelOutput,
|
||||
)
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import (
|
||||
DUMMY_INPUTS,
|
||||
DUMMY_MASK,
|
||||
@ -258,8 +258,6 @@ except Exception:
|
||||
logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
|
||||
pass
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5
|
||||
class LongT5DenseActDense(nn.Module):
|
||||
|
@ -248,6 +248,12 @@ class Mistral3Model(Mistral3PreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -407,10 +413,10 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin)
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -1641,6 +1641,12 @@ class MllamaModel(MllamaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
@ -1792,10 +1798,10 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
# Make modules available through conditional class for BC
|
||||
@property
|
||||
|
@ -154,7 +154,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
|
||||
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
|
||||
the cos_sin_cache will be recomputed during the forward pass.
|
||||
"""
|
||||
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
|
||||
super().__init__(dim=dim, base=base, device=device, interleaved=False)
|
||||
self.max_seqlen = max_seqlen
|
||||
|
||||
if max_seqlen is not None and device is not None and dtype is not None:
|
||||
|
@ -417,7 +417,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
|
||||
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
|
||||
the cos_sin_cache will be recomputed during the forward pass.
|
||||
"""
|
||||
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
|
||||
super().__init__(dim=dim, base=base, device=device, interleaved=False)
|
||||
self.max_seqlen = max_seqlen
|
||||
|
||||
if max_seqlen is not None and device is not None and dtype is not None:
|
||||
|
@ -31,7 +31,6 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput, Seq2SeqLMOutput
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
|
||||
from ..auto.modeling_auto import AutoModel
|
||||
from .configuration_moshi import MoshiConfig, MoshiDepthConfig
|
||||
@ -234,9 +233,6 @@ class MoshiRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.eps}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(MoshiRMSNorm)
|
||||
|
||||
|
||||
class MoshiFlexibleLinear(nn.Module):
|
||||
def __init__(self, input_size, output_size, num_layers):
|
||||
super().__init__()
|
||||
|
@ -37,7 +37,6 @@ from ...modeling_outputs import (
|
||||
)
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
|
||||
from .configuration_nemotron import NemotronConfig
|
||||
|
||||
@ -85,9 +84,6 @@ class NemotronLayerNorm1P(nn.LayerNorm):
|
||||
return F.layer_norm(*args)
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(NemotronLayerNorm1P)
|
||||
|
||||
|
||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
|
||||
class NemotronRotaryEmbedding(nn.Module):
|
||||
# Ignore copy
|
||||
|
@ -5,7 +5,6 @@ import torch.nn as nn
|
||||
|
||||
from ...cache_utils import Cache
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import logging
|
||||
from ..llama.modeling_llama import LlamaPreTrainedModel, LlamaRMSNorm, eager_attention_forward
|
||||
from ..olmo.configuration_olmo import OlmoConfig
|
||||
@ -176,9 +175,6 @@ class Olmo2RMSNorm(LlamaRMSNorm):
|
||||
return (self.weight * hidden_states).to(input_dtype)
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm)
|
||||
|
||||
|
||||
def rotate_half(x):
|
||||
"""Rotates half the hidden dims of the input."""
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
|
@ -27,7 +27,6 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
|
||||
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, logging
|
||||
from .configuration_olmoe import OlmoeConfig
|
||||
|
||||
@ -142,9 +141,6 @@ class OlmoeRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(OlmoeRMSNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmoe
|
||||
class OlmoeRotaryEmbedding(nn.Module):
|
||||
def __init__(self, config: OlmoeConfig, device=None):
|
||||
|
@ -173,6 +173,12 @@ class PaliGemmaModel(PaliGemmaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask,
|
||||
@ -418,10 +424,10 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(self, pixel_values):
|
||||
return self.model.get_image_features(pixel_values)
|
||||
|
@ -33,7 +33,6 @@ from ...modeling_outputs import (
|
||||
Seq2SeqModelOutput,
|
||||
)
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import (
|
||||
DUMMY_INPUTS,
|
||||
DUMMY_MASK,
|
||||
@ -96,8 +95,6 @@ except Exception:
|
||||
logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
|
||||
pass
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(Pix2StructLayerNorm)
|
||||
|
||||
|
||||
class Pix2StructVisionEmbeddings(nn.Module):
|
||||
r"""
|
||||
|
@ -30,7 +30,7 @@ from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torch_fx_proxy, is_torchdynamo_compiling, logging
|
||||
from .configuration_pop2piano import Pop2PianoConfig
|
||||
|
||||
@ -88,8 +88,6 @@ class Pop2PianoLayerNorm(nn.Module):
|
||||
if not _load_pop2piano_layer_norm:
|
||||
Pop2PianoLayerNorm = FusedRMSNorm # noqa
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(Pop2PianoLayerNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Pop2Piano,t5->pop2piano
|
||||
class Pop2PianoDenseActDense(nn.Module):
|
||||
|
@ -1847,6 +1847,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
|
||||
def get_video_features(
|
||||
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
|
||||
):
|
||||
|
@ -2269,6 +2269,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
|
||||
def get_video_features(
|
||||
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
|
||||
):
|
||||
|
@ -1067,6 +1067,12 @@ class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_rope_index(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@ -1498,10 +1504,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_video_features(
|
||||
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
|
||||
|
@ -1033,6 +1033,12 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_rope_index(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@ -1382,10 +1388,10 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_video_features(
|
||||
self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
|
||||
|
@ -27,7 +27,6 @@ from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, logging
|
||||
from ...utils.import_utils import is_torchdynamo_compiling
|
||||
from .configuration_recurrent_gemma import RecurrentGemmaConfig
|
||||
@ -58,9 +57,6 @@ class RecurrentGemmaRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.eps}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(RecurrentGemmaRMSNorm)
|
||||
|
||||
|
||||
class RecurrentGemmaRotaryEmbedding(nn.Module):
|
||||
def __init__(self, dim, base=10000, device=None):
|
||||
super().__init__()
|
||||
|
@ -34,7 +34,7 @@ from ...modeling_outputs import (
|
||||
Seq2SeqMoEOutput,
|
||||
)
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import (
|
||||
DUMMY_INPUTS,
|
||||
DUMMY_MASK,
|
||||
@ -240,9 +240,6 @@ class SwitchTransformersLayerNorm(nn.Module):
|
||||
return self.weight * hidden_states
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(SwitchTransformersLayerNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->SwitchTransformers
|
||||
class SwitchTransformersDenseActDense(nn.Module):
|
||||
def __init__(self, config: SwitchTransformersConfig):
|
||||
|
@ -38,7 +38,7 @@ from ...modeling_outputs import (
|
||||
TokenClassifierOutput,
|
||||
)
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import (
|
||||
DUMMY_INPUTS,
|
||||
DUMMY_MASK,
|
||||
@ -273,8 +273,6 @@ except Exception:
|
||||
logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
|
||||
pass
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
|
||||
|
||||
|
||||
class T5DenseActDense(nn.Module):
|
||||
def __init__(self, config: T5Config):
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
"""Configuration for TimmWrapper models"""
|
||||
|
||||
from typing import Any
|
||||
from typing import Any, Optional
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import is_timm_available, logging, requires_backends
|
||||
@ -45,6 +45,9 @@ class TimmWrapperConfig(PretrainedConfig):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
do_pooling (`bool`, *optional*, defaults to `True`):
|
||||
Whether to do pooling for the last_hidden_state in `TimmWrapperModel` or not.
|
||||
model_args (`dict[str, Any]`, *optional*):
|
||||
Additional keyword arguments to pass to the `timm.create_model` function. e.g. `model_args={"depth": 3}`
|
||||
for `timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k` to create a model with 3 blocks. Defaults to `None`.
|
||||
|
||||
Example:
|
||||
```python
|
||||
@ -60,9 +63,16 @@ class TimmWrapperConfig(PretrainedConfig):
|
||||
|
||||
model_type = "timm_wrapper"
|
||||
|
||||
def __init__(self, initializer_range: float = 0.02, do_pooling: bool = True, **kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
initializer_range: float = 0.02,
|
||||
do_pooling: bool = True,
|
||||
model_args: Optional[dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.initializer_range = initializer_range
|
||||
self.do_pooling = do_pooling
|
||||
self.model_args = model_args # named "model_args" for BC with timm
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
|
@ -116,7 +116,8 @@ class TimmWrapperModel(TimmWrapperPreTrainedModel):
|
||||
def __init__(self, config: TimmWrapperConfig):
|
||||
super().__init__(config)
|
||||
# using num_classes=0 to avoid creating classification head
|
||||
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=0)
|
||||
extra_init_kwargs = config.model_args or {}
|
||||
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=0, **extra_init_kwargs)
|
||||
self.post_init()
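Note: with the change above, TimmWrapperConfig.model_args is forwarded verbatim to timm.create_model. A short usage sketch, based on the checkpoint named in the docstring and the new test further below (requires timm and Hub access):

from transformers import TimmWrapperConfig, TimmWrapperModel

config = TimmWrapperConfig.from_pretrained(
    "timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k",
    model_args={"depth": 3},  # passed through as **kwargs to timm.create_model
)
model = TimmWrapperModel(config)
print(len(model.timm_model.blocks))  # 3 transformer blocks instead of the default 12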
|
||||
|
||||
@auto_docstring
|
||||
@ -233,7 +234,10 @@ class TimmWrapperForImageClassification(TimmWrapperPreTrainedModel):
|
||||
"or use `TimmWrapperModel` for feature extraction."
|
||||
)
|
||||
|
||||
self.timm_model = timm.create_model(config.architecture, pretrained=False, num_classes=config.num_labels)
|
||||
extra_init_kwargs = config.model_args or {}
|
||||
self.timm_model = timm.create_model(
|
||||
config.architecture, pretrained=False, num_classes=config.num_labels, **extra_init_kwargs
|
||||
)
|
||||
self.num_labels = config.num_labels
|
||||
self.post_init()
|
||||
|
||||
|
@ -202,6 +202,12 @@ class VideoLlavaModel(VideoLlavaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values_images: torch.FloatTensor,
|
||||
@ -444,10 +450,10 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -182,6 +182,12 @@ class VipLlavaModel(VipLlavaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
|
||||
):
|
||||
@ -327,10 +333,10 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(
|
||||
self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
|
||||
|
@ -35,7 +35,6 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from ...utils import auto_docstring, logging
|
||||
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
|
||||
from .configuration_zamba import ZambaConfig
|
||||
@ -81,9 +80,6 @@ class ZambaRMSNorm(nn.Module):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
ALL_LAYERNORM_LAYERS.append(ZambaRMSNorm)
|
||||
|
||||
|
||||
# Copied from transformers.models.llama.modeling_llama.repeat_kv
|
||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||
"""
|
||||
|
@ -82,7 +82,9 @@ class AwqQuantizer(HfQuantizer):
|
||||
"your model on a GPU device in order to run your model."
|
||||
)
|
||||
elif device_map is not None:
|
||||
if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
|
||||
if isinstance(device_map, dict) and any(
|
||||
forbidden in device_map.values() for forbidden in ("cpu", torch.device("cpu"), "disk")
|
||||
):
|
||||
raise ValueError(
|
||||
"You are attempting to load an AWQ model with a device_map that contains a CPU or disk device."
|
||||
" This is not supported. Please remove the CPU or disk device from the device_map."
|
||||
|
@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
|
||||
|
||||
def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
|
||||
if device not in dispatch_table:
|
||||
if not callable(dispatch_table["default"]):
|
||||
return dispatch_table["default"]
|
||||
|
||||
return dispatch_table["default"](*args, **kwargs)
|
||||
|
||||
fn = dispatch_table[device]
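Note: the extra callable() check above lets constant "default" entries work again when the device key is missing. A stand-alone sketch of the fixed fallback (hypothetical dispatch tables, not testing_utils itself):

def device_agnostic_dispatch(device, dispatch_table, *args, **kwargs):
    if device not in dispatch_table:
        default = dispatch_table["default"]
        if not callable(default):
            return default                   # plain value: return it as-is
        return default(*args, **kwargs)      # callable: call it
    return dispatch_table[device](*args, **kwargs)

print(device_agnostic_dispatch("xpu", {"cuda": lambda: 123, "default": 7}))          # 7
print(device_agnostic_dispatch("xpu", {"cuda": lambda: 123, "default": lambda: 0}))  # 0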
|
||||
|
@ -73,7 +73,6 @@ from .models.auto.modeling_auto import (
|
||||
from .optimization import Adafactor, get_scheduler
|
||||
from .processing_utils import ProcessorMixin
|
||||
from .pytorch_utils import (
|
||||
ALL_LAYERNORM_LAYERS,
|
||||
is_torch_greater_or_equal_than_2_3,
|
||||
)
|
||||
from .tokenization_utils_base import PreTrainedTokenizerBase
|
||||
@ -1186,9 +1185,10 @@ class Trainer:
|
||||
|
||||
This function filters out parameters in two ways:
|
||||
1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
|
||||
2. By parameter name patterns (containing 'bias', 'layernorm', or 'rmsnorm')
|
||||
2. By parameter name patterns (containing 'bias' or a variation of 'norm')
|
||||
"""
|
||||
decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"])
|
||||
forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm], forbidden_name_patterns)
|
||||
return decay_parameters
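Note: the regex patterns above are meant to exclude biases and any *norm* parameters from weight decay while leaving ordinary projection weights alone. A small sketch of which names they catch (the parameter names are illustrative):

import re

forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
patterns = [re.compile(p) for p in forbidden_name_patterns]

for name in [
    "model.layers.0.self_attn.q_proj.weight",      # decayed
    "model.layers.0.self_attn.q_proj.bias",        # excluded by "bias"
    "model.norm.weight",                           # excluded by "(?:^|\.)norm(?:$|\.)"
    "model.layers.0.input_layernorm.weight",       # excluded by "layernorm"
    "model.layers.0.post_attention_norm.weight",   # excluded by "_norm(?:$|\.)"
]:
    excluded = any(p.search(name.lower()) for p in patterns)
    print(f"{name}: {'excluded' if excluded else 'decayed'}")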
|
||||
|
||||
def create_optimizer(self):
|
||||
|
@ -21,6 +21,7 @@ import io
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from collections.abc import Iterator, Mapping
|
||||
@ -1124,8 +1125,9 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
|
||||
"""
|
||||
Returns the names of the model parameters that are not inside a forbidden layer.
|
||||
"""
|
||||
if forbidden_layer_names is None:
|
||||
forbidden_layer_names = []
|
||||
forbidden_layer_patterns = (
|
||||
[re.compile(pattern) for pattern in forbidden_layer_names] if forbidden_layer_names is not None else []
|
||||
)
|
||||
result = []
|
||||
for name, child in model.named_children():
|
||||
child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names)
|
||||
@ -1133,12 +1135,15 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
|
||||
f"{name}.{n}"
|
||||
for n in child_params
|
||||
if not isinstance(child, tuple(forbidden_layer_types))
|
||||
and not any(forbidden in f"{name}.{n}".lower() for forbidden in forbidden_layer_names)
|
||||
and not any(pattern.search(f"{name}.{n}".lower()) for pattern in forbidden_layer_patterns)
|
||||
]
|
||||
# Add model specific parameters that are not in any child
|
||||
result += [
|
||||
k for k in model._parameters.keys() if not any(forbidden in k.lower() for forbidden in forbidden_layer_names)
|
||||
k
|
||||
for k in model._parameters.keys()
|
||||
if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
|
||||
]
|
||||
|
||||
return result
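Note: get_parameter_names now treats forbidden_layer_names as regular expressions searched against the lowercased full parameter name, rather than plain substrings. A toy end-to-end example, assuming this branch is installed:

import torch.nn as nn
from transformers.trainer_pt_utils import get_parameter_names

model = nn.Sequential()
model.add_module("proj", nn.Linear(4, 4))
model.add_module("norm", nn.LayerNorm(4))

decay = get_parameter_names(model, [nn.LayerNorm], [r"bias", r"(?:^|\.)norm(?:$|\.)"])
print(decay)  # ['proj.weight'] -- the bias and all LayerNorm parameters are filtered out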
|
||||
|
||||
|
||||
|
@ -815,8 +815,8 @@ def is_torch_hpu_available():
|
||||
):
|
||||
return False
|
||||
|
||||
torch_hpu_min_version = "1.5.0"
|
||||
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
|
||||
torch_hpu_min_accelerate_version = "1.5.0"
|
||||
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
|
||||
return False
|
||||
|
||||
import torch
|
||||
@ -850,6 +850,24 @@ def is_torch_hpu_available():
|
||||
|
||||
torch.Tensor.masked_fill_ = patched_masked_fill_
|
||||
|
||||
# IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
|
||||
# https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
|
||||
# This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
|
||||
# https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
|
||||
original_compile = torch.compile
|
||||
|
||||
def hpu_backend_compile(*args, **kwargs):
|
||||
if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
|
||||
logger.warning(
|
||||
f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
|
||||
"We will override the backend with 'hpu_backend' to avoid errors."
|
||||
)
|
||||
kwargs["backend"] = "hpu_backend"
|
||||
|
||||
return original_compile(*args, **kwargs)
|
||||
|
||||
torch.compile = hpu_backend_compile
|
||||
|
||||
return True
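Note: the patch above swaps torch.compile for a wrapper that forces the HPU backend unless "hpu_backend" or "eager" was requested explicitly. A stand-alone sketch of the wrapper's behaviour using a fake compile function (illustrative only):

def fake_compile(fn=None, backend="inductor", **kwargs):
    print(f"compiling with backend={backend}")
    return fn

original_compile = fake_compile

def hpu_backend_compile(*args, **kwargs):
    if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
        kwargs["backend"] = "hpu_backend"  # the real patch also logs a warning here
    return original_compile(*args, **kwargs)

hpu_backend_compile(lambda x: x)                   # compiling with backend=hpu_backend
hpu_backend_compile(lambda x: x, backend="eager")  # compiling with backend=eager (kept)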
|
||||
|
||||
|
||||
|
@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_basic_distributed(self, stage, dtype):
|
||||
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
||||
|
||||
@require_torch_fp16
|
||||
@run_first
|
||||
def test_do_eval_no_train(self):
|
||||
# testing only zero3 since zero2 makes no sense with inference
|
||||
self.run_and_check(
|
||||
@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
)
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_fp32_non_distributed(self, stage, dtype):
|
||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||
# therefore no quality checks, just basic completion checks are done
|
||||
@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_fp32_distributed(self, stage, dtype):
|
||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||
# therefore no quality checks, just basic completion checks are done
|
||||
@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
)
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
|
||||
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
||||
# the saved model dir
|
||||
@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(["bf16", "fp16", "fp32"])
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_inference(self, dtype):
|
||||
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires bfloat16 hardware support")
|
||||
@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
return output_dir
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_clm(self, stage, dtype):
|
||||
# this test exercises model.resize_token_embeddings() which requires param gathering outside
|
||||
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
|
||||
@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
@require_torch_fp16
|
||||
@run_first
|
||||
def test_clm_from_config_zero3_fp16(self):
|
||||
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
||||
|
||||
|
@ -28,6 +28,7 @@ from transformers.testing_utils import (
|
||||
get_tests_dir,
|
||||
require_deepspeed,
|
||||
require_torch_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
|
||||
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_deepspeed
|
||||
@require_torch_accelerator
|
||||
class TestDeepSpeedModelZoo(TestCasePlus):
|
||||
|
@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
raise AssertionError("CPU offloading failed with FSDP!")
|
||||
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
@require_fsdp_v2_version
|
||||
@require_accelerate_fsdp2
|
||||
@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
|
||||
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
@require_fsdp
|
||||
@require_fsdp_v2_version
|
||||
|
@ -157,6 +157,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
|
||||
self.assertEqual(image_processor.do_reduce_labels, True)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_call_segmentation_maps(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
# Initialize image_processing
|
||||
@ -264,6 +265,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertTrue(encoding["labels"].min().item() >= 0)
|
||||
self.assertTrue(encoding["labels"].max().item() <= 255)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_reduce_labels(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
# Initialize image_processing
|
||||
@ -280,6 +282,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertTrue(encoding["labels"].min().item() >= 0)
|
||||
self.assertTrue(encoding["labels"].max().item() <= 255)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_slow_fast_equivalence(self):
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
|
@ -475,8 +475,19 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
else:
|
||||
# See PR #38607 (to avoid flakiness)
|
||||
data = torch.flatten(param.data)
|
||||
n_elements = torch.numel(data)
|
||||
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
|
||||
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
|
||||
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
|
||||
data_to_check = torch.sort(data).values
|
||||
if n_elements_to_skip_on_each_side > 0:
|
||||
data_to_check = data_to_check[
|
||||
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
|
||||
]
|
||||
self.assertIn(
|
||||
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||
((data_to_check.mean() * 1e9).round() / 1e9).item(),
|
||||
[0.0, 1.0],
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
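Note: the test now drops the smallest and largest 2.5% of the initialized values before averaging, to avoid flakiness from nn.init.trunc_normal_. A stand-alone sketch of the same computation on a synthetic sample (the std value is an arbitrary small choice for illustration, not the test's config):

import torch

param = torch.empty(10_000)
torch.nn.init.trunc_normal_(param, mean=0.0, std=1e-9)

data = torch.flatten(param.data)
n_elements = torch.numel(data)
n_skip = int(n_elements * 0.025)            # 2.5% of elements on each side
data_to_check = torch.sort(data).values
if n_skip > 0:
    data_to_check = data_to_check[n_skip:-n_skip]

trimmed_mean = ((data_to_check.mean() * 1e9).round() / 1e9).item()
print(trimmed_mean in [0.0, 1.0])           # True: the trimmed mean rounds to 0.0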
|
||||
|
@ -311,8 +311,19 @@ class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
||||
]
|
||||
if param.requires_grad:
|
||||
if any(x in name for x in non_uniform_init_parms):
|
||||
# See PR #38607 (to avoid flakiness)
|
||||
data = torch.flatten(param.data)
|
||||
n_elements = torch.numel(data)
|
||||
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
|
||||
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
|
||||
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
|
||||
data_to_check = torch.sort(data).values
|
||||
if n_elements_to_skip_on_each_side > 0:
|
||||
data_to_check = data_to_check[
|
||||
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
|
||||
]
|
||||
self.assertIn(
|
||||
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||
((data_to_check.mean() * 1e9).round() / 1e9).item(),
|
||||
[0.0, 1.0],
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
@ -252,8 +252,17 @@ class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unitte
|
||||
model = model_class(config=configs_no_init)
|
||||
for name, param in model.named_parameters():
|
||||
if param.requires_grad and "register_tokens" not in name:
|
||||
# See PR #38607 (to avoid flakiness)
|
||||
data = torch.flatten(param.data)
|
||||
n_elements = torch.numel(data)
|
||||
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
|
||||
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
|
||||
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
|
||||
data_to_check = torch.sort(data).values
|
||||
if n_elements_to_skip_on_each_side > 0:
|
||||
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
|
||||
self.assertIn(
|
||||
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||
((data_to_check.mean() * 1e9).round() / 1e9).item(),
|
||||
[0.0, 1.0],
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
@ -187,6 +187,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
# Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_call_segmentation_maps
|
||||
def test_call_segmentation_maps(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
@ -295,6 +296,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertTrue(encoding["labels"].min().item() >= 0)
|
||||
self.assertTrue(encoding["labels"].max().item() <= 255)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_reduce_labels(self):
|
||||
for image_processing_class in self.image_processor_list:
|
||||
image_processor = image_processing_class(**self.image_processor_dict)
|
||||
@ -317,6 +319,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
# Compare with non-reduced label to see if it's reduced by 1
|
||||
self.assertEqual(encoding["labels"][first_non_zero_coords].item(), first_non_zero_value - 1)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_slow_fast_equivalence(self):
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
@ -338,6 +341,7 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertTrue(torch.allclose(image_encoding_slow.labels, image_encoding_fast.labels, atol=1e-1))
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_slow_fast_equivalence_batched(self):
|
||||
if not self.test_slow_image_processor or not self.test_fast_image_processor:
|
||||
self.skipTest(reason="Skipping slow/fast equivalence test")
|
||||
|
@ -103,6 +103,7 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
||||
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_LayoutLMv3_integration_test(self):
|
||||
from datasets import load_dataset
|
||||
|
||||
|
@ -135,6 +135,7 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(image_processor.size, {"shortest_edge": 42})
|
||||
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_call_segmentation_maps(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
|
@ -136,6 +136,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_expected_output(self):
|
||||
dummy_image = self.image_processor_tester.prepare_dummy_image()
|
||||
image_processor = self.image_processor
|
||||
@ -185,6 +186,7 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
image = Image.open(filepath).convert("RGB")
|
||||
return np.array(image)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_crop_margin_equality_cv2_python(self):
|
||||
image = self.prepare_dummy_np_image()
|
||||
image_processor = self.image_processor
|
||||
|
@ -544,8 +544,19 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
else:
|
||||
# See PR #38607 (to avoid flakiness)
|
||||
data = torch.flatten(param.data)
|
||||
n_elements = torch.numel(data)
|
||||
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
|
||||
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
|
||||
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
|
||||
data_to_check = torch.sort(data).values
|
||||
if n_elements_to_skip_on_each_side > 0:
|
||||
data_to_check = data_to_check[
|
||||
n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side
|
||||
]
|
||||
self.assertIn(
|
||||
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||
((data_to_check.mean() * 1e9).round() / 1e9).item(),
|
||||
[0.0, 1.0],
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
@ -138,6 +138,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
|
||||
self.assertEqual(image_processor.do_reduce_labels, True)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_call_segmentation_maps(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
@ -244,6 +245,7 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
||||
self.assertTrue(encoding["labels"].min().item() >= 0)
|
||||
self.assertTrue(encoding["labels"].max().item() <= 255)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_reduce_labels(self):
|
||||
# Initialize image_processing
|
||||
image_processing = self.image_processing_class(**self.image_processor_dict)
|
||||
|
@ -249,8 +249,17 @@ class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
||||
if "logit_scale" in name:
|
||||
continue
|
||||
if param.requires_grad:
|
||||
# See PR #38607 (to avoid flakiness)
|
||||
data = torch.flatten(param.data)
|
||||
n_elements = torch.numel(data)
|
||||
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
|
||||
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
|
||||
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
|
||||
data_to_check = torch.sort(data).values
|
||||
if n_elements_to_skip_on_each_side > 0:
|
||||
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
|
||||
self.assertIn(
|
||||
((param.data.mean() * 1e9).round() / 1e9).item(),
|
||||
((data_to_check.mean() * 1e9).round() / 1e9).item(),
|
||||
[0.0, 1.0],
|
||||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
@ -237,6 +237,24 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
|
||||
self.assertEqual(config.id2label, restored_config.id2label)
|
||||
self.assertEqual(config.label2id, restored_config.label2id)
|
||||
|
||||
def test_model_init_args(self):
|
||||
# test init from config
|
||||
config = TimmWrapperConfig.from_pretrained(
|
||||
"timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k",
|
||||
model_args={"depth": 3},
|
||||
)
|
||||
model = TimmWrapperModel(config)
|
||||
self.assertEqual(len(model.timm_model.blocks), 3)
|
||||
|
||||
cls_model = TimmWrapperForImageClassification(config)
|
||||
self.assertEqual(len(cls_model.timm_model.blocks), 3)
|
||||
|
||||
# test save load
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
restored_model = TimmWrapperModel.from_pretrained(tmpdirname)
|
||||
self.assertEqual(len(restored_model.timm_model.blocks), 3)
|
||||
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
def prepare_img():
|
||||
|
@ -84,6 +84,7 @@ from transformers.testing_utils import (
|
||||
require_bitsandbytes,
|
||||
require_deepspeed,
|
||||
require_flash_attn,
|
||||
require_non_hpu,
|
||||
require_safetensors,
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
@ -92,6 +93,7 @@ from transformers.testing_utils import (
|
||||
require_torch_multi_accelerator,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_sdpa,
|
||||
run_first,
|
||||
run_test_using_subprocess,
|
||||
set_config_for_less_flaky_test,
|
||||
set_model_for_less_flaky_test,
|
||||
@ -2797,6 +2799,7 @@ class ModelTesterMixin:
|
||||
else:
|
||||
torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
|
||||
|
||||
@require_non_hpu
|
||||
@require_accelerate
|
||||
@mark.accelerate_tests
|
||||
@require_torch_multi_accelerator
|
||||
@ -3727,6 +3730,9 @@ class ModelTesterMixin:
|
||||
if torch_device in ["cpu", "cuda"]:
|
||||
atol = atols[torch_device, enable_kernels, torch_dtype]
|
||||
rtol = rtols[torch_device, enable_kernels, torch_dtype]
|
||||
elif torch_device == "hpu":
|
||||
atol = atols["cuda", enable_kernels, torch_dtype]
|
||||
rtol = rtols["cuda", enable_kernels, torch_dtype]
|
||||
elif torch_device == "xpu":
|
||||
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
|
||||
# which is implemented on PyTorch level using aten operators and is
|
||||
@ -3795,6 +3801,10 @@ class ModelTesterMixin:
|
||||
self.skipTest(
|
||||
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
|
||||
)
|
||||
if config.model_type in ["modernbert"]:
|
||||
self.skipTest(
|
||||
reason="ModernBert currently (transformers==4.52.0) automatically adds an attention_mask input"
|
||||
)
|
||||
if config.model_type in ["idefics", "idefics2", "idefics3"]:
|
||||
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
|
||||
if config.model_type in ["sam"]:
|
||||
@ -4662,6 +4672,7 @@ class ModelTesterMixin:
|
||||
|
||||
# Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
|
||||
# may bring unwanted consequences on other tests. See PR #37553
|
||||
@run_first
|
||||
@run_test_using_subprocess
|
||||
@require_torch_accelerator
|
||||
def test_can_load_with_global_device_set(self):
|
||||
|
@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
# the test slower.
|
||||
@require_torch_non_multi_accelerator
|
||||
@run_test_using_subprocess
|
||||
@run_first
|
||||
@slow
|
||||
def test_can_resume_training_lm(self):
|
||||
# Check if it works for a simple language modeling example
|
||||
@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_mrpc(self):
|
||||
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertLess(result["eval_loss"], 0.2)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_multiple(self):
|
||||
MODEL_ID = "openai-community/gpt2"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_non_hpu
|
||||
@require_torch_multi_accelerator
|
||||
def test_end_to_end_example(self):
|
||||
|
@ -22,6 +22,7 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.training_args import ParallelMode
|
||||
@ -116,6 +117,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
class TestTrainerDistributed(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
|
||||
@ -199,8 +201,7 @@ if __name__ == "__main__":
|
||||
model = RegressionModel()
|
||||
training_args.per_device_train_batch_size = 1
|
||||
training_args.max_steps = 1
|
||||
training_args.accelerator_config = {
|
||||
"dispatch_batches": False,
|
||||
}
|
||||
training_args.accelerator_config.dispatch_batches = False
|
||||
|
||||
trainer = Trainer(model, training_args, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
|
@ -18,11 +18,13 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
class TestTrainerDistributedLoss(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
device_count = backend_device_count(torch_device)
|
||||
|
@ -18,6 +18,7 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
@ -57,6 +58,7 @@ class DummyModel(nn.Module):
|
||||
|
||||
|
||||
class TestTrainerDistributedWorkerSeed(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
device_count = backend_device_count(torch_device)
|
||||
|
@ -58,6 +58,7 @@ from transformers.testing_utils import (
|
||||
is_staging_test,
|
||||
require_accelerate,
|
||||
require_flax,
|
||||
require_non_hpu,
|
||||
require_read_token,
|
||||
require_safetensors,
|
||||
require_tf,
|
||||
@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
|
||||
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
@require_non_hpu
|
||||
@require_accelerate
|
||||
@mark.accelerate_tests
|
||||
@require_torch_multi_accelerator
|
||||
|
65
utils/get_runner_map.py
Normal file
65
utils/get_runner_map.py
Normal file
@ -0,0 +1,65 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
This script is used to get a map containing the information of runners to use in GitHub Actions workflow files.
|
||||
This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners.
|
||||
|
||||
The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
|
||||
Currently, in that file, we specify the models for which we want to run the tests with T4 runners to avoid many test failures showing on the CI reports.
|
||||
We will work on the tests so that A10 runners can be used for all CI jobs.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# T4
|
||||
t4_runners = {
|
||||
"single-gpu": "aws-g4dn-4xlarge-cache",
|
||||
"multi-gpu": "aws-g4dn-12xlarge-cache",
|
||||
}
|
||||
|
||||
# A10
|
||||
a10_runners = {
|
||||
"single-gpu": "aws-g5-4xlarge-cache",
|
||||
"multi-gpu": "aws-g5-12xlarge-cache",
|
||||
}
|
||||
|
||||
tests = os.getcwd()
|
||||
model_tests = os.listdir(os.path.join(tests, "models"))
|
||||
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
|
||||
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
|
||||
d1.remove("models")
|
||||
d = d2 + d1
|
||||
|
||||
response = requests.get(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json"
|
||||
)
|
||||
# The models that we want to run with T4 runners
|
||||
jobs_using_t4 = response.json()
|
||||
|
||||
runner_map = {}
|
||||
for key in d:
|
||||
modified_key = key
|
||||
if modified_key.startswith("models/"):
|
||||
modified_key = key[len("models/") :]
|
||||
if modified_key in jobs_using_t4:
|
||||
runner_map[key] = t4_runners
|
||||
else:
|
||||
runner_map[key] = a10_runners
|
||||
|
||||
print(runner_map)
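Note: the script prints a mapping from test folder to the runner group to use per machine type. An illustrative sketch of the expected shape of that map (folder names and the T4/A10 split here are examples; the real split comes from the Hub-hosted runner_map.json):

example_runner_map = {
    "models/bert": {"single-gpu": "aws-g4dn-4xlarge-cache", "multi-gpu": "aws-g4dn-12xlarge-cache"},  # kept on T4
    "models/llama": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},     # moved to A10
    "generation": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
}
# a workflow can then look up the runner group per folder and machine type:
print(example_runner_map["models/llama"]["single-gpu"])  # aws-g5-4xlarge-cache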
|
@ -21,7 +21,7 @@ import os
|
||||
import sys
|
||||
|
||||
import transformers
|
||||
from transformers import is_torch_xpu_available
|
||||
from transformers import is_torch_hpu_available, is_torch_xpu_available
|
||||
|
||||
|
||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||
@ -38,6 +38,9 @@ try:
|
||||
accelerator = "CUDA"
|
||||
elif is_torch_xpu_available():
|
||||
accelerator = "XPU"
|
||||
elif is_torch_hpu_available():
|
||||
accelerator = "HPU"
|
||||
|
||||
print("Torch accelerator:", accelerator)
|
||||
|
||||
if accelerator == "CUDA":
|
||||
@ -48,6 +51,9 @@ try:
|
||||
elif accelerator == "XPU":
|
||||
print("SYCL version:", torch.version.xpu)
|
||||
print("Number of XPUs available:", torch.xpu.device_count())
|
||||
elif accelerator == "HPU":
|
||||
print("HPU version:", torch.__version__.split("+")[-1])
|
||||
print("Number of HPUs available:", torch.hpu.device_count())
|
||||
except ImportError:
|
||||
print("Torch version:", None)
|
||||
|
||||
|