mirror of https://github.com/huggingface/transformers.git
synced 2025-07-03 12:50:06 +06:00

Merge branch 'main' into integrate_xlstm_clean
This commit is contained in: commit bd28805865

.github/workflows/get-pr-info.yml (vendored, new file, 157 lines)
@@ -0,0 +1,157 @@
name: Get PR commit SHA

on:
  workflow_call:
    inputs:
      pr_number:
        required: true
        type: string
    outputs:
      PR_HEAD_REPO_FULL_NAME:
        description: "The full name of the repository from which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
      PR_BASE_REPO_FULL_NAME:
        description: "The full name of the repository to which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
      PR_HEAD_REPO_OWNER:
        description: "The owner of the repository from which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
      PR_BASE_REPO_OWNER:
        description: "The owner of the repository to which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
      PR_HEAD_REPO_NAME:
        description: "The name of the repository from which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
      PR_BASE_REPO_NAME:
        description: "The name of the repository to which the pull request is created"
        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
      PR_HEAD_REF:
        description: "The branch name of the pull request in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
      PR_BASE_REF:
        description: "The branch name in the base repository (to merge into)"
        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
      PR_HEAD_SHA:
        description: "The head sha of the pull request branch in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
      PR_BASE_SHA:
        description: "The head sha of the target branch in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
      PR_MERGE_COMMIT_SHA:
        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
      PR_HEAD_COMMIT_DATE:
        description: "The date of the head sha of the pull request branch in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
      PR_MERGE_COMMIT_DATE:
        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
      PR_HEAD_COMMIT_TIMESTAMP:
        description: "The timestamp of the head sha of the pull request branch in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
      PR_MERGE_COMMIT_TIMESTAMP:
        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
      PR:
        description: "The PR"
        value: ${{ jobs.get-pr-info.outputs.PR }}
      PR_FILES:
        description: "The files touched in the PR"
        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}


jobs:
  get-pr-info:
    runs-on: ubuntu-22.04
    name: Get PR commit SHA better
    outputs:
      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
      PR: ${{ steps.pr_info.outputs.pr }}
      PR_FILES: ${{ steps.pr_info.outputs.files }}
    if: ${{ inputs.pr_number != '' }}
    steps:
      - name: Extract PR details
        id: pr_info
        uses: actions/github-script@v6
        with:
          script: |
            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: ${{ inputs.pr_number }}
            });

            const { data: head_commit } = await github.rest.repos.getCommit({
              owner: pr.head.repo.owner.login,
              repo: pr.head.repo.name,
              ref: pr.head.ref
            });

            const { data: merge_commit } = await github.rest.repos.getCommit({
              owner: pr.base.repo.owner.login,
              repo: pr.base.repo.name,
              ref: pr.merge_commit_sha,
            });

            const { data: files } = await github.rest.pulls.listFiles({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: ${{ inputs.pr_number }}
            });

            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
            core.setOutput('head_repo_name', pr.head.repo.name);
            core.setOutput('base_repo_name', pr.base.repo.name);
            core.setOutput('head_ref', pr.head.ref);
            core.setOutput('base_ref', pr.base.ref);
            core.setOutput('head_sha', pr.head.sha);
            core.setOutput('base_sha', pr.base.sha);
            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
            core.setOutput('pr', pr);

            core.setOutput('head_commit_date', head_commit.commit.committer.date);
            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);

            core.setOutput('files', files);

            console.log('PR head commit:', {
              head_commit: head_commit,
              commit: head_commit.commit,
              date: head_commit.commit.committer.date
            });

            console.log('PR merge commit:', {
              merge_commit: merge_commit,
              commit: merge_commit.commit,
              date: merge_commit.commit.committer.date
            });

      - name: Convert dates to timestamps
        id: get_timestamps
        run: |
          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
          echo $head_commit_date
          echo $merge_commit_date
          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
          echo $head_commit_timestamp
          echo $merge_commit_timestamp
          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
.github/workflows/get-pr-number.yml (vendored, new file, 36 lines)
@@ -0,0 +1,36 @@
name: Get PR number

on:
  workflow_call:
    outputs:
      PR_NUMBER:
        description: "The extracted PR number"
        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}

jobs:
  get-pr-number:
    runs-on: ubuntu-22.04
    name: Get PR number
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
      - name: Get PR number
        shell: bash
        run: |
          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
          else
            echo "PR_NUMBER=" >> $GITHUB_ENV
          fi

      - name: Check PR number
        shell: bash
        run: |
          echo "${{ env.PR_NUMBER }}"

      - name: Set PR number
        id: set_pr_number
        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
.github/workflows/pr_run_slow_ci.yml (vendored, new file, 163 lines)
@@ -0,0 +1,163 @@
name: PR slow CI

on:
  pull_request_target:
    types: [opened, synchronize, reopened]

jobs:
  get-pr-number:
    name: Get PR number
    uses: ./.github/workflows/get-pr-number.yml

  get-pr-info:
    name: Get PR commit SHA
    needs: get-pr-number
    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
    uses: ./.github/workflows/get-pr-info.yml
    with:
      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}

  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
  verity_pr_commit:
    name: Verify PR commit corresponds to a specific event by comparing timestamps
    if: ${{ github.event.comment.created_at != '' }}
    runs-on: ubuntu-22.04
    needs: get-pr-info
    env:
      COMMENT_DATE: ${{ github.event.comment.created_at }}
      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
    steps:
      - run: |
          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
          echo "COMMENT_DATE: $COMMENT_DATE"
          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
            exit -1;
          fi

  get-jobs:
    name: Get test files to run
    runs-on: ubuntu-22.04
    needs: [get-pr-number, get-pr-info]
    outputs:
      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
    steps:
      - name: Get repository content
        id: repo_content
        uses: actions/github-script@v6
        with:
          script: |
            const { data: tests_dir } = await github.rest.repos.getContent({
              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
              path: 'tests',
              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
            });

            const { data: tests_models_dir } = await github.rest.repos.getContent({
              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
              path: 'tests/models',
              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
            });

            const { data: tests_quantization_dir } = await github.rest.repos.getContent({
              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
              path: 'tests/quantization',
              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
            });

            core.setOutput('tests_dir', tests_dir);
            core.setOutput('tests_models_dir', tests_models_dir);
            core.setOutput('tests_quantization_dir', tests_quantization_dir);

      # This checks out the main branch
      - uses: actions/checkout@v4
        with:
          fetch-depth: "0"

      - name: Write pr_files file
        run: |
          cat > pr_files.txt << 'EOF'
          ${{ needs.get-pr-info.outputs.PR_FILES }}
          EOF

      - name: Write tests_dir file
        run: |
          cat > tests_dir.txt << 'EOF'
          ${{ steps.repo_content.outputs.tests_dir }}
          EOF

      - name: Write tests_models_dir file
        run: |
          cat > tests_models_dir.txt << 'EOF'
          ${{ steps.repo_content.outputs.tests_models_dir }}
          EOF

      - name: Write tests_quantization_dir file
        run: |
          cat > tests_quantization_dir.txt << 'EOF'
          ${{ steps.repo_content.outputs.tests_quantization_dir }}
          EOF

      - name: Run script to get jobs to run
        id: get_jobs
        run: |
          python utils/get_pr_run_slow_jobs.py | tee output.txt
          echo "jobs_to_run: $(tail -n 1 output.txt)"
          echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT

  send_comment:
    name: Send a comment to suggest jobs to run
    if: ${{ needs.get-jobs.outputs.jobs != '' }}
    needs: [get-pr-number, get-jobs]
    permissions:
      pull-requests: write
    runs-on: ubuntu-22.04
    steps:
      - name: Delete existing comment and send new one
        uses: actions/github-script@v7
        env:
          BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
        with:
          script: |
            const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
            const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";

            // Get all comments on the PR
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber
            });

            // Find existing comment(s) that start with our prefix
            const existingComments = comments.filter(comment =>
              comment.user.login === 'github-actions[bot]' &&
              comment.body.startsWith(commentPrefix)
            );

            // Delete existing comment(s)
            for (const comment of existingComments) {
              console.log(`Deleting existing comment #${comment.id}`);
              await github.rest.issues.deleteComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: comment.id
              });
            }

            // Create new comment
            const newBody = `${commentPrefix}${process.env.BODY}`;
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: newBody
            });

            console.log('✅ Comment updated successfully');
.github/workflows/self-scheduled.yml (vendored, 1 addition)
@@ -135,6 +135,7 @@ jobs:
       folder_slices: ${{ needs.setup.outputs.folder_slices }}
       machine_type: ${{ matrix.machine_type }}
       slice_id: ${{ matrix.slice_id }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
       docker: ${{ inputs.docker }}
       report_name_prefix: run_trainer_and_fsdp_gpu
     secrets: inherit
@@ -74,20 +74,16 @@ inputs = processor(
     return_tensors="pt",
 )
 
-# Remove Patch Offsets from inputs — only used later for post-processing.
-patch_offsets = inputs.pop("patch_offsets")
-
 with torch.inference_mode():
     outputs = model(**inputs)
 
 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]
 
 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_semantic_segmentation(
     outputs,
-    patch_offsets=patch_offsets,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )
 
 # Visualize the segmentation mask
@@ -130,12 +126,12 @@ with torch.inference_mode():
     outputs = model(**inputs)
 
 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]
 
 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_instance_segmentation(
     outputs,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )
 
 # Visualize the segmentation mask
@@ -173,12 +169,12 @@ with torch.inference_mode():
     outputs = model(**inputs)
 
 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]
 
 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_panoptic_segmentation(
     outputs,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )
 
 # Visualize the panoptic segmentation mask
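The three hunks above all make the same API change: the `patch_offsets`/`original_image_sizes` arguments are replaced by a single `target_sizes` argument, with patch offsets now carried on the model outputs. As a reference, here is a minimal end-to-end sketch of the updated semantic-segmentation call; the checkpoint name and the Auto classes are illustrative assumptions, not taken from this diff.

# Minimal sketch of the new post-processing flow (assumed checkpoint name).
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForUniversalSegmentation

model_id = "tue-mps/ade20k_semantic_eomt_large_512"  # assumed EoMT checkpoint
processor = AutoImageProcessor.from_pretrained(model_id)
model = AutoModelForUniversalSegmentation.from_pretrained(model_id)

image = Image.open("scene.jpg").convert("RGB")  # any local RGB image
inputs = processor(images=image, return_tensors="pt")

with torch.inference_mode():
    outputs = model(**inputs)

# Patch offsets now travel on `outputs`; only the target sizes are passed explicitly.
target_sizes = [(image.height, image.width)]
preds = processor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes)
print(preds[0].shape)  # (height, width) label map for the first image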
@@ -29,7 +29,7 @@ rendered properly in your Markdown viewer.
 Gemma3n is a multimodal model with pretrained and instruction-tuned variants, available in E4B and E2B sizes. While
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
-[MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses
+[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
 a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
 [MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
@@ -201,4 +201,5 @@ echo -e "Plants create energy through a process known as" | transformers run --t
 [gemma3n-collection]: https://huggingface.co/collections/google/gemma-3n
 [laurel]: https://arxiv.org/abs/2411.07501
 [matformer]: https://arxiv.org/abs/2310.07707
+[spark-transformer]: https://arxiv.org/abs/2506.06644
 [usm]: https://arxiv.org/abs/2303.01037
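Since the surrounding section describes Gemma 3n only at the architecture level, a minimal text-generation sketch may help; it uses the standard pipeline API, and the `google/gemma-3n-E2B-it` checkpoint name is an assumption, not taken from this diff.

# Minimal text-only sketch (assumed checkpoint name).
from transformers import pipeline

generator = pipeline("text-generation", model="google/gemma-3n-E2B-it")
output = generator("Plants create energy through a process known as", max_new_tokens=30)
print(output[0]["generated_text"])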
@@ -34,6 +34,10 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 import numpy as np
 import packaging.version
 
+
+if os.getenv("WANDB_MODE") == "offline":
+    print("⚙️ Running in WANDB offline mode")
+
 from .. import PreTrainedModel, TFPreTrainedModel, TrainingArguments
 from .. import __version__ as version
 from ..utils import (
@@ -860,7 +864,7 @@ class WandbCallback(TrainerCallback):
                 **init_args,
             )
             # add config parameters (run may have been created manually)
-            self._wandb.config.update(combined_dict, allow_val_change=True)
+            self._wandb.config.update(combined_dict or {}, allow_val_change=True)
 
             # define default x-axis (for latest wandb versions)
             if getattr(self._wandb, "define_metric", None):
|
@ -415,6 +415,7 @@ class Blip2PreTrainedModel(PreTrainedModel):
|
|||||||
_no_split_modules = [
|
_no_split_modules = [
|
||||||
"Blip2Attention",
|
"Blip2Attention",
|
||||||
"Blip2QFormerMultiHeadAttention",
|
"Blip2QFormerMultiHeadAttention",
|
||||||
|
"Blip2EncoderLayer",
|
||||||
"Blip2TextEmbeddings",
|
"Blip2TextEmbeddings",
|
||||||
"T5Block",
|
"T5Block",
|
||||||
"OPTDecoderLayer",
|
"OPTDecoderLayer",
|
||||||
@ -1262,6 +1263,7 @@ class Blip2Model(Blip2PreTrainedModel):
|
|||||||
config_class = Blip2Config
|
config_class = Blip2Config
|
||||||
main_input_name = "pixel_values"
|
main_input_name = "pixel_values"
|
||||||
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
||||||
|
_supports_flash_attn_2 = False # because self.qformer does not support FA2
|
||||||
|
|
||||||
def __init__(self, config: Blip2Config):
|
def __init__(self, config: Blip2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@ -1646,6 +1648,7 @@ class Blip2Model(Blip2PreTrainedModel):
|
|||||||
class Blip2TextModelWithProjection(Blip2PreTrainedModel):
|
class Blip2TextModelWithProjection(Blip2PreTrainedModel):
|
||||||
supports_gradient_checkpointing = False
|
supports_gradient_checkpointing = False
|
||||||
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
||||||
|
_supports_flash_attn_2 = False # because self.qformer does not support FA2
|
||||||
|
|
||||||
def __init__(self, config: Blip2Config):
|
def __init__(self, config: Blip2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@ -1738,6 +1741,7 @@ class Blip2TextModelWithProjection(Blip2PreTrainedModel):
|
|||||||
class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
|
class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
|
||||||
main_input_name = "pixel_values"
|
main_input_name = "pixel_values"
|
||||||
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
||||||
|
_supports_flash_attn_2 = False # because self.qformer does not support FA2
|
||||||
|
|
||||||
def __init__(self, config: Blip2Config):
|
def __init__(self, config: Blip2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@ -1857,6 +1861,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
|
|||||||
_supports_quantized_cache = False # not all LM bacbones support (e.g. T5)
|
_supports_quantized_cache = False # not all LM bacbones support (e.g. T5)
|
||||||
|
|
||||||
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
||||||
|
_supports_flash_attn_2 = False # because self.qformer does not support FA2
|
||||||
|
|
||||||
def __init__(self, config: Blip2Config):
|
def __init__(self, config: Blip2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@ -2086,9 +2091,13 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
|
|||||||
else:
|
else:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
|
|
||||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
special_image_mask = (
|
||||||
language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
|
special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device)
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
|
)
|
||||||
|
language_model_inputs = language_model_inputs.to(inputs_embeds.dtype)
|
||||||
|
inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter(
|
||||||
|
special_image_mask, language_model_inputs
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
|
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
|
||||||
@ -2234,9 +2243,15 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
|
|||||||
else:
|
else:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
|
|
||||||
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
|
special_image_mask = (
|
||||||
language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
|
special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(language_model_inputs.device)
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
|
)
|
||||||
|
language_model_inputs = language_model_inputs.to(inputs_embeds.dtype)
|
||||||
|
inputs_embeds = inputs_embeds.to(language_model_inputs.device).masked_scatter(
|
||||||
|
special_image_mask, language_model_inputs
|
||||||
|
)
|
||||||
|
|
||||||
|
attention_mask = attention_mask.to(language_attention_mask.device)
|
||||||
else:
|
else:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
|
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
|
||||||
@ -2259,6 +2274,8 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
|
|||||||
|
|
||||||
inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
|
inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
|
||||||
if not self.language_model.config.is_encoder_decoder:
|
if not self.language_model.config.is_encoder_decoder:
|
||||||
|
if input_ids is not None:
|
||||||
|
input_ids = input_ids.to(language_model_inputs.device)
|
||||||
inputs["input_ids"] = input_ids
|
inputs["input_ids"] = input_ids
|
||||||
|
|
||||||
outputs = self.language_model.generate(**inputs, **generate_kwargs)
|
outputs = self.language_model.generate(**inputs, **generate_kwargs)
|
||||||
@ -2275,6 +2292,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
|
|||||||
class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
|
class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
|
||||||
main_input_name = "pixel_values"
|
main_input_name = "pixel_values"
|
||||||
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
_keep_in_fp32_modules = ["query_tokens", "qformer"]
|
||||||
|
_supports_flash_attn_2 = False # because self.qformer does not support FA2
|
||||||
|
|
||||||
def __init__(self, config: Blip2Config):
|
def __init__(self, config: Blip2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
@@ -829,6 +829,9 @@ class DabDetrPreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
@@ -841,6 +844,8 @@ class DabDetrPreTrainedModel(PreTrainedModel):
             prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
             bias_value = -math.log((1 - prior_prob) / prior_prob)
             module.class_embed.bias.data.fill_(bias_value)
+        elif isinstance(module, nn.PReLU):
+            module.reset_parameters()
 
 
 # Modified from transformers.models.detr.modeling_detr.DetrEncoder with Detr->DabDetr,DETR->ConditionalDETR
@@ -480,6 +480,12 @@ class DacPreTrainedModel(PreTrainedAudioTokenizerBase):
         if isinstance(module, nn.Conv1d):
             nn.init.trunc_normal_(module.weight, std=0.02)
             nn.init.constant_(module.bias, 0)
+        elif isinstance(module, Snake1d):
+            module.alpha.data.fill_(1.0)
+        elif isinstance(module, nn.ConvTranspose1d):
+            module.reset_parameters()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=0.02)
 
     def apply_weight_norm(self):
         weight_norm = nn.utils.weight_norm
@@ -235,7 +235,7 @@ class EncodecLSTM(nn.Module):
     LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
     """
 
-    def __init__(self, config, dimension):
+    def __init__(self, config: EncodecConfig, dimension: int):
         super().__init__()
         self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)
 
@@ -452,11 +452,7 @@ class EncodecPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+        if isinstance(module, nn.GroupNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         elif isinstance(module, nn.Conv1d):
@@ -464,10 +460,8 @@ class EncodecPreTrainedModel(PreTrainedModel):
             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 nn.init.uniform_(module.bias, a=-k, b=k)
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.ConvTranspose1d):
+            module.reset_parameters()
         elif isinstance(module, nn.LSTM):
             for name, param in module.named_parameters():
                 if "weight" in name:
@@ -659,7 +653,7 @@ class EncodecModel(EncodecPreTrainedModel):
 
     def decode(
         self,
-        audio_codes: torch.Tensor,
+        audio_codes: torch.LongTensor,
         audio_scales: torch.Tensor,
         padding_mask: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
@@ -708,10 +702,10 @@ class EncodecModel(EncodecPreTrainedModel):
     @auto_docstring
     def forward(
         self,
-        input_values: torch.Tensor,
-        padding_mask: Optional[torch.Tensor] = None,
+        input_values: torch.FloatTensor,
+        padding_mask: Optional[torch.BoolTensor] = None,
         bandwidth: Optional[float] = None,
-        audio_codes: Optional[torch.Tensor] = None,
+        audio_codes: Optional[torch.LongTensor] = None,
         audio_scales: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[tuple[torch.Tensor, torch.Tensor], EncodecOutput]:
@@ -97,7 +97,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in
     Computes the output image size given the input image size and the desired output size.
 
     Args:
-        image_size (`Tuple[int, int]`):
+        image_size (`tuple[int, int]`):
             The input image size.
         size (`int`):
             The desired output size.
@@ -531,13 +531,13 @@ class EomtImageProcessor(BaseImageProcessor):
                 Image or batch of images to preprocess.
             segmentation_maps (`ImageInput`, *optional*):
                 The corresponding semantic segmentation maps with the pixel-wise annotations.
-            instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
+            instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
                 A mapping between object instance ids and class ids.
             do_split_image (`bool`, *optional*, defaults to `self.do_split_image`):
                 Whether to split the input images into overlapping patches for semantic segmentation.
             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                 Whether to resize the input images.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                 Target size as a dictionary with `"shortest_edge"` and `"longest_edge"` keys.
             resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                 Resampling filter to use when resizing.
@@ -550,9 +550,9 @@ class EomtImageProcessor(BaseImageProcessor):
             do_pad (`bool`, *optional*, defaults to `False`):
                 Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
                 number of patches in the batch. Padding will be applied to the bottom and right with zeros.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                 Mean for normalization. Single value or list for each channel.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                 Standard deviation for normalization. Single value or list for each channel.
             ignore_index (`int`, *optional*):
                 Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
@@ -640,7 +640,7 @@ class EomtImageProcessor(BaseImageProcessor):
             )
 
         if do_split_image and patch_offsets:
-            encoded_inputs["patch_offsets"] = patch_offsets
+            encoded_inputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]
 
         return encoded_inputs
 
@@ -663,8 +663,8 @@ class EomtImageProcessor(BaseImageProcessor):
         each mask.
 
         Args:
-            pixel_values_list (`List[ImageInput]`):
-                List of images (pixel values) to be padded. Each image should be a tensor of shape `(channels, height,
+            pixel_values_list (`list[ImageInput]`):
+                list of images (pixel values) to be padded. Each image should be a tensor of shape `(channels, height,
                 width)`.
 
             segmentation_maps (`ImageInput`, *optional*):
@@ -678,7 +678,7 @@ class EomtImageProcessor(BaseImageProcessor):
                 - 1 for pixels that are real (i.e. **not masked**),
                 - 0 for pixels that are padding (i.e. **masked**).
 
-            instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
+            instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
                 A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an
                 instance segmentation map where each pixel represents an instance id. Can be provided as a single
                 dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map
@@ -740,7 +740,7 @@ class EomtImageProcessor(BaseImageProcessor):
         self,
         segmentation_logits: torch.Tensor,
         patch_offsets: list[tuple[int, int, int]],
-        original_image_sizes: list[tuple[int, int]],
+        target_sizes: list[tuple[int, int]],
         size: dict[str, int],
     ) -> list[torch.Tensor]:
         """
@@ -750,28 +750,28 @@ class EomtImageProcessor(BaseImageProcessor):
             segmentation_logits (`torch.Tensor`):
                 A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits
                 for each image patch.
-            patch_offsets (`List[Tuple[int, int, int]]`):
+            patch_offsets (`list[tuple[int, int, int]]`):
                 A list of tuples where each tuple contains:
                 - `image_index` (int): Index of the original image this patch belongs to.
                 - `start` (int): Start pixel index of the patch along the long dimension (height or width).
                 - `end` (int): End pixel index of the patch along the long dimension.
-            original_image_sizes (`List[Tuple[int, int]]`):
-                List of original (height, width) dimensions for each image before preprocessing.
-            size (`Dict[str, int]`):
+            target_sizes (`list[tuple[int, int]]`):
+                list of original (height, width) dimensions for each image before preprocessing.
+            size (`dict[str, int]`):
                 A size dict which was used to resize.
         """
         num_classes = segmentation_logits.shape[1]
         aggregated_logits = []
         patch_counts = []
 
-        for image_size in original_image_sizes:
+        for image_size in target_sizes:
             height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"])
             aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
             patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
 
         # Stitch patches back into full-sized logit maps
         for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets):
-            if original_image_sizes[image_idx][0] > original_image_sizes[image_idx][1]:
+            if target_sizes[image_idx][0] > target_sizes[image_idx][1]:
                 aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx]
                 patch_counts[image_idx][:, patch_start:patch_end, :] += 1
             else:
@@ -784,7 +784,7 @@ class EomtImageProcessor(BaseImageProcessor):
             averaged_logits = logit_sum / count.clamp(min=1)
             resized_logits = F.interpolate(
                 averaged_logits[None, ...],
-                size=original_image_sizes[idx],
+                size=target_sizes[idx],
                 mode="bilinear",
                 align_corners=False,
             )[0]
@@ -796,14 +796,14 @@ class EomtImageProcessor(BaseImageProcessor):
     def unpad_image(
         self,
         segmentation_logits: torch.Tensor,
-        original_image_sizes: list[tuple[int, int]],
+        target_sizes: list[tuple[int, int]],
         size: dict[str, int],
     ) -> list[torch.Tensor]:
         """Restores panoptic segmentation logits to their original image resolutions."""
 
         resized_logits = []
 
-        for idx, original_size in enumerate(original_image_sizes):
+        for idx, original_size in enumerate(target_sizes):
             target_height, target_width = get_size_with_aspect_ratio(
                 original_size, size["shortest_edge"], size["longest_edge"]
             )
@@ -817,8 +817,7 @@ class EomtImageProcessor(BaseImageProcessor):
     def post_process_semantic_segmentation(
         self,
         outputs,
-        patch_offsets: list[tuple[int, int, int]],
-        original_image_sizes: list[tuple[int, int]],
+        target_sizes: list[tuple[int, int]],
         size: Optional[dict[str, int]] = None,
     ) -> np.ndarray:
         """Post-processes model outputs into final semantic segmentation prediction."""
@@ -827,6 +826,7 @@ class EomtImageProcessor(BaseImageProcessor):
 
         masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
         class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+        patch_offsets = outputs.patch_offsets
 
         output_size = get_target_size(size)
         masks_queries_logits = F.interpolate(
@@ -841,15 +841,15 @@ class EomtImageProcessor(BaseImageProcessor):
 
         segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
 
-        output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, original_image_sizes, size)
+        output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size)
 
-        preds = torch.stack(output_logits).argmax(dim=1)
+        preds = [logit.argmax(dim=0) for logit in output_logits]
         return preds
 
     def post_process_panoptic_segmentation(
         self,
         outputs,
-        original_image_sizes: list[tuple[int, int]],
+        target_sizes: list[tuple[int, int]],
         threshold: float = 0.8,
         mask_threshold: float = 0.5,
         overlap_mask_area_threshold: float = 0.8,
@@ -873,7 +873,7 @@ class EomtImageProcessor(BaseImageProcessor):
             mode="bilinear",
         )
 
-        mask_probs_batch = self.unpad_image(masks_queries_logits, original_image_sizes, size)
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
         pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1)
 
         results: list = []
@@ -885,7 +885,7 @@ class EomtImageProcessor(BaseImageProcessor):
 
             # No mask found
             if mask_probs.shape[0] <= 0:
-                height, width = original_image_sizes[i] if original_image_sizes is not None else mask_probs.shape[1:]
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:]
                 segmentation = torch.zeros((height, width)) - 1
                 results.append({"segmentation": segmentation, "segments_info": []})
                 continue
@@ -897,16 +897,17 @@ class EomtImageProcessor(BaseImageProcessor):
                 stuff_classes=stuff_classes,
                 mask_threshold=mask_threshold,
                 overlap_mask_area_threshold=overlap_mask_area_threshold,
-                target_size=original_image_sizes[i] if original_image_sizes is not None else None,
+                target_size=target_sizes[i] if target_sizes is not None else None,
             )
 
             results.append({"segmentation": segmentation, "segments_info": segments})
         return results
 
+    @filter_out_non_signature_kwargs()
     def post_process_instance_segmentation(
         self,
         outputs,
-        original_image_sizes: list[tuple[int, int]],
+        target_sizes: list[tuple[int, int]],
         threshold: float = 0.5,
         size: Optional[dict[str, int]] = None,
     ):
@@ -924,7 +925,7 @@ class EomtImageProcessor(BaseImageProcessor):
             mode="bilinear",
         )
 
-        mask_probs_batch = self.unpad_image(masks_queries_logits, original_image_sizes, size)
+        mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
 
         device = masks_queries_logits.device
         batch_size = class_queries_logits.shape[0]
@@ -946,7 +947,7 @@ class EomtImageProcessor(BaseImageProcessor):
             )
             pred_scores = scores * mask_scores
 
-            segmentation = torch.zeros(original_image_sizes[i], device=device) - 1
+            segmentation = torch.zeros(target_sizes[i], device=device) - 1
 
             instance_maps, segments = [], []
             current_segment_id = 0
@@ -41,6 +41,7 @@ from ...processing_utils import Unpack
 from ...utils import (
 TensorType,
 auto_docstring,
+filter_out_non_signature_kwargs,
 is_torch_available,
 is_torchvision_available,
 is_torchvision_v2_available,
@@ -268,7 +269,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 r"""
 segmentation_maps (`ImageInput`, *optional*):
 The segmentation maps to preprocess for corresponding images.
-instance_id_to_semantic_id (`List[Dict[int, int]]` or `Dict[int, int]`, *optional*):
+instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*):
 A mapping between object instance ids and class ids.
 """
 # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
@@ -340,7 +341,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 outputs["class_labels"] = class_labels

 if patch_offsets:
-outputs["patch_offsets"] = patch_offsets
+outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]

 return outputs

@@ -348,7 +349,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 self,
 segmentation_logits: torch.Tensor,
 patch_offsets: list[tuple[int, int, int]],
-original_image_sizes: list[tuple[int, int]],
+target_sizes: list[tuple[int, int]],
 size: dict[str, int],
 ) -> list[torch.Tensor]:
 """
@@ -358,28 +359,28 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 segmentation_logits (`torch.Tensor`):
 A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits
 for each image patch.
-patch_offsets (`List[Tuple[int, int, int]]`):
+patch_offsets (`list[tuple[int, int, int]]`):
 A list of tuples where each tuple contains:
 - `image_index` (int): Index of the original image this patch belongs to.
 - `start` (int): Start pixel index of the patch along the long dimension (height or width).
 - `end` (int): End pixel index of the patch along the long dimension.
-original_image_sizes (`List[Tuple[int, int]]`):
-List of original (height, width) dimensions for each image before preprocessing.
-size (`Dict[str, int]`):
+target_sizes (`list[tuple[int, int]]`):
+list of original (height, width) dimensions for each image before preprocessing.
+size (`dict[str, int]`):
 A size dict which was used to resize.
 """
 num_classes = segmentation_logits.shape[1]
 aggregated_logits = []
 patch_counts = []

-for image_size in original_image_sizes:
+for image_size in target_sizes:
 height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"])
 aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))
 patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device))

 # Stitch patches back into full-sized logit maps
 for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets):
-if original_image_sizes[image_idx][0] > original_image_sizes[image_idx][1]:
+if target_sizes[image_idx][0] > target_sizes[image_idx][1]:
 aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx]
 patch_counts[image_idx][:, patch_start:patch_end, :] += 1
 else:
@@ -392,7 +393,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 averaged_logits = logit_sum / count.clamp(min=1)
 resized_logits = torch.nn.functional.interpolate(
 averaged_logits[None, ...],
-size=original_image_sizes[idx],
+size=target_sizes[idx],
 mode="bilinear",
 align_corners=False,
 )[0]
@@ -404,14 +405,14 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 def unpad_image(
 self,
 segmentation_logits: torch.Tensor,
-original_image_sizes: list[tuple[int, int]],
+target_sizes: list[tuple[int, int]],
 size: dict[str, int],
 ) -> list[torch.Tensor]:
 """Restores panoptic segmentation logits to their original image resolutions."""

 resized_logits = []

-for idx, original_size in enumerate(original_image_sizes):
+for idx, original_size in enumerate(target_sizes):
 target_height, target_width = get_size_with_aspect_ratio(
 original_size, size["shortest_edge"], size["longest_edge"]
 )
@@ -425,8 +426,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 def post_process_semantic_segmentation(
 self,
 outputs,
-patch_offsets: list[tuple[int, int, int]],
-original_image_sizes: list[tuple[int, int]],
+target_sizes: list[tuple[int, int]],
 size: Optional[dict[str, int]] = None,
 ) -> np.ndarray:
 """Post-processes model outputs into final semantic segmentation prediction."""
@@ -435,6 +435,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):

 masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
 class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
+patch_offsets = outputs.patch_offsets

 output_size = get_target_size(size)
 masks_queries_logits = torch.nn.functional.interpolate(
@@ -449,15 +450,15 @@ class EomtImageProcessorFast(BaseImageProcessorFast):

 segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)

-output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, original_image_sizes, size)
+output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size)

-preds = torch.stack(output_logits).argmax(dim=1)
+preds = [logit.argmax(dim=0) for logit in output_logits]
 return preds

 def post_process_panoptic_segmentation(
 self,
 outputs,
-original_image_sizes: list[tuple[int, int]],
+target_sizes: list[tuple[int, int]],
 threshold: float = 0.8,
 mask_threshold: float = 0.5,
 overlap_mask_area_threshold: float = 0.8,
@@ -481,7 +482,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 mode="bilinear",
 )

-mask_probs_batch = self.unpad_image(masks_queries_logits, original_image_sizes, size)
+mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)
 pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1)

 results: list = []
@@ -493,7 +494,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):

 # No mask found
 if mask_probs.shape[0] <= 0:
-height, width = original_image_sizes[i] if original_image_sizes is not None else mask_probs.shape[1:]
+height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:]
 segmentation = torch.zeros((height, width)) - 1
 results.append({"segmentation": segmentation, "segments_info": []})
 continue
@@ -505,16 +506,17 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 stuff_classes=stuff_classes,
 mask_threshold=mask_threshold,
 overlap_mask_area_threshold=overlap_mask_area_threshold,
-target_size=original_image_sizes[i] if original_image_sizes is not None else None,
+target_size=target_sizes[i] if target_sizes is not None else None,
 )

 results.append({"segmentation": segmentation, "segments_info": segments})
 return results

+@filter_out_non_signature_kwargs()
 def post_process_instance_segmentation(
 self,
 outputs,
-original_image_sizes: list[tuple[int, int]],
+target_sizes: list[tuple[int, int]],
 threshold: float = 0.8,
 size: Optional[dict[str, int]] = None,
 ):
@@ -532,7 +534,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 mode="bilinear",
 )

-mask_probs_batch = self.unpad_image(masks_queries_logits, original_image_sizes, size)
+mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size)

 device = masks_queries_logits.device
 batch_size = class_queries_logits.shape[0]
@@ -554,7 +556,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
 )
 pred_scores = scores * mask_scores

-segmentation = torch.zeros(original_image_sizes[i], device=device) - 1
+segmentation = torch.zeros(target_sizes[i], device=device) - 1

 instance_maps, segments = [], []
 current_segment_id = 0
@@ -74,6 +74,8 @@ class EomtForUniversalSegmentationOutput(ModelOutput):
 attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
 Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
 sequence_length)`. Self and Cross Attentions weights from transformer decoder.
+patch_offsets (`list[torch.Tensor]`, *optional*):
+list of tuples indicating the image index and start and end positions of patches for semantic segementation.
 """

 loss: Optional[torch.FloatTensor] = None
@@ -82,6 +84,7 @@ class EomtForUniversalSegmentationOutput(ModelOutput):
 last_hidden_state: Optional[torch.FloatTensor] = None
 hidden_states: Optional[tuple[torch.FloatTensor]] = None
 attentions: Optional[tuple[torch.FloatTensor]] = None
+patch_offsets: Optional[list[torch.Tensor]] = None


 # Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py
@@ -996,7 +999,7 @@ class EomtPreTrainedModel(PreTrainedModel):
 base_model_prefix = "eomt"
 main_input_name = "pixel_values"
 supports_gradient_checkpointing = False
-_no_split_modules = ["EomtMLP"]
+_no_split_modules = ["EomtLayer"]
 _supports_sdpa = True
 _supports_flash_attn_2 = True

@@ -1097,13 +1100,16 @@ class EomtForUniversalSegmentation(EomtPreTrainedModel):
 class_labels: Optional[list[Tensor]] = None,
 output_hidden_states: Optional[bool] = None,
 output_attentions: Optional[bool] = None,
+patch_offsets: Optional[list[Tensor]] = None,
 ) -> EomtForUniversalSegmentationOutput:
 r"""
-mask_labels (`List[torch.Tensor]`, *optional*):
-List of mask labels of shape `(num_labels, height, width)` to be fed to a model
-class_labels (`List[torch.LongTensor]`, *optional*):
+mask_labels (`list[torch.Tensor]`, *optional*):
+list of mask labels of shape `(num_labels, height, width)` to be fed to a model
+class_labels (`list[torch.LongTensor]`, *optional*):
 list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
 labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
+patch_offsets (`list[torch.Tensor]`, *optional*):
+list of tuples indicating the image index and start and end positions of patches for semantic segementation.
 """
 output_hidden_states = (
 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1126,7 +1132,7 @@ class EomtForUniversalSegmentation(EomtPreTrainedModel):
 all_hidden_states += (hidden_states,)

 if idx == self.num_hidden_layers - self.config.num_blocks:
-query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1)
+query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
 hidden_states = torch.cat((query, hidden_states), dim=1)

 if idx >= self.num_hidden_layers - self.config.num_blocks and (
@@ -1206,6 +1212,7 @@ class EomtForUniversalSegmentation(EomtPreTrainedModel):
 last_hidden_state=sequence_output,
 hidden_states=all_hidden_states,
 attentions=all_attentions,
+patch_offsets=patch_offsets,
 )

 def get_input_embeddings(self):
@@ -226,6 +226,8 @@ class EomtForUniversalSegmentationOutput(ModelOutput):
 attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
 Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
 sequence_length)`. Self and Cross Attentions weights from transformer decoder.
+patch_offsets (`list[torch.Tensor]`, *optional*):
+list of tuples indicating the image index and start and end positions of patches for semantic segementation.
 """

 loss: Optional[torch.FloatTensor] = None
@@ -234,6 +236,7 @@ class EomtForUniversalSegmentationOutput(ModelOutput):
 last_hidden_state: Optional[torch.FloatTensor] = None
 hidden_states: Optional[tuple[torch.FloatTensor]] = None
 attentions: Optional[tuple[torch.FloatTensor]] = None
+patch_offsets: Optional[list[torch.Tensor]] = None


 class EomtLoss(Mask2FormerLoss):
@@ -368,7 +371,7 @@ class EomtPreTrainedModel(PreTrainedModel):
 base_model_prefix = "eomt"
 main_input_name = "pixel_values"
 supports_gradient_checkpointing = False
-_no_split_modules = ["EomtMLP"]
+_no_split_modules = ["EomtLayer"]
 _supports_sdpa = True
 _supports_flash_attn_2 = True

@@ -473,13 +476,16 @@ class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation, nn.Modul
 class_labels: Optional[list[Tensor]] = None,
 output_hidden_states: Optional[bool] = None,
 output_attentions: Optional[bool] = None,
+patch_offsets: Optional[list[Tensor]] = None,
 ):
 r"""
-mask_labels (`List[torch.Tensor]`, *optional*):
-List of mask labels of shape `(num_labels, height, width)` to be fed to a model
-class_labels (`List[torch.LongTensor]`, *optional*):
+mask_labels (`list[torch.Tensor]`, *optional*):
+list of mask labels of shape `(num_labels, height, width)` to be fed to a model
+class_labels (`list[torch.LongTensor]`, *optional*):
 list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
 labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
+patch_offsets (`list[torch.Tensor]`, *optional*):
+list of tuples indicating the image index and start and end positions of patches for semantic segementation.
 """
 output_hidden_states = (
 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -502,7 +508,7 @@ class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation, nn.Modul
 all_hidden_states += (hidden_states,)

 if idx == self.num_hidden_layers - self.config.num_blocks:
-query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1)
+query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
 hidden_states = torch.cat((query, hidden_states), dim=1)

 if idx >= self.num_hidden_layers - self.config.num_blocks and (
@@ -582,6 +588,7 @@ class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation, nn.Modul
 last_hidden_state=sequence_output,
 hidden_states=all_hidden_states,
 attentions=all_attentions,
+patch_offsets=patch_offsets,
 )

@@ -445,9 +445,16 @@ class FalconMambaPreTrainedModel(PreTrainedModel):

 def _init_weights(self, module):
 """Initialize the weights."""
+std = self.config.initializer_range
 if isinstance(module, FalconMambaMixer):
+# S4D real initialization. These are not discretized!
+# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
+A = A.expand(module.intermediate_size, -1).contiguous()
+module.A_log.copy_(torch.log(A))
 module.A_log._no_weight_decay = True
 module.D._no_weight_decay = True
+module.D.data.fill_(1.0)

 dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
 if self.config.time_step_init_scheme == "constant":
@@ -462,33 +469,39 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
 ).clamp(min=self.config.time_step_floor)
 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
 inv_dt = dt + torch.log(-torch.expm1(-dt))
-with torch.no_grad():
-module.dt_proj.bias.copy_(inv_dt)
+module.dt_proj.bias.copy_(inv_dt)
 module.dt_proj.bias._no_reinit = True

+nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
+if module.conv1d.bias is not None:
+if not getattr(module.conv1d.bias, "_no_reinit", False):
+nn.init.zeros_(module.conv1d.bias)
+nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))

+if self.config.rescale_prenorm_residual:
+# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+# > -- GPT-2 :: https://openai.com/blog/better-language-models/
+#
+# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+# We need to reinit p since this code could be called multiple times
+# Having just p *= scale would repeatedly scale it down
+p = module.out_proj.weight
+p /= math.sqrt(self.config.num_hidden_layers)

 if isinstance(module, nn.Linear):
+if not getattr(module.weight, "_no_reinit", False):
+nn.init.normal_(module.weight, std=std)
 if module.bias is not None:
 if not getattr(module.bias, "_no_reinit", False):
 nn.init.zeros_(module.bias)
+elif isinstance(module, FalconMambaRMSNorm):
+module.weight.data.fill_(1.0)
 elif isinstance(module, nn.Embedding):
-nn.init.normal_(module.weight, std=self.config.initializer_range)
+nn.init.normal_(module.weight, std=std)

-if self.config.rescale_prenorm_residual:
-# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-# > -- GPT-2 :: https://openai.com/blog/better-language-models/
-#
-# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-for name, p in module.named_parameters():
-if name in ["out_proj.weight"]:
-# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-# We need to reinit p since this code could be called multiple times
-# Having just p *= scale would repeatedly scale it down
-nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-with torch.no_grad():
-p /= math.sqrt(self.config.num_hidden_layers)


 @dataclass
@@ -1414,16 +1414,18 @@ class GroundingDinoPreTrainedModel(PreTrainedModel):
 module.out_vision_proj.bias.data.fill_(0)
 nn.init.xavier_uniform_(module.out_text_proj.weight)
 module.out_text_proj.bias.data.fill_(0)
-elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)):
-for p in module.parameters():
-if p.dim() > 1:
-nn.init.normal_(p, mean=0.0, std=std)
+elif isinstance(module, GroundingDinoFusionLayer):
+module.vision_param.data.fill_(1e-4)
+module.text_param.data.fill_(1e-4)
 elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
 # Slightly different from the TF version which uses truncated_normal for initialization
 # cf https://github.com/pytorch/pytorch/pull/5617
 module.weight.data.normal_(mean=0.0, std=std)
 if module.bias is not None:
 module.bias.data.zero_()
+elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+module.weight.data.fill_(1.0)
+module.bias.data.zero_()
 elif isinstance(module, nn.Embedding):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.padding_idx is not None:
@@ -176,7 +176,7 @@ class LlavaOnevisionConfig(PretrainedConfig):
 patch_size=14,
 image_size=384,
 num_hidden_layers=26,
-num_attention_heads=14,
+num_attention_heads=16,
 vision_use_head=False,
 )

@@ -382,9 +382,16 @@ class MambaPreTrainedModel(PreTrainedModel):

 def _init_weights(self, module):
 """Initialize the weights."""
+std = self.config.initializer_range
 if isinstance(module, MambaMixer):
+# S4D real initialization. These are not discretized!
+# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
+A = A.expand(module.intermediate_size, -1).contiguous()
+module.A_log.copy_(torch.log(A))
 module.A_log._no_weight_decay = True
 module.D._no_weight_decay = True
+module.D.data.fill_(1.0)

 dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
 if self.config.time_step_init_scheme == "constant":
@@ -399,33 +406,39 @@ class MambaPreTrainedModel(PreTrainedModel):
 ).clamp(min=self.config.time_step_floor)
 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
 inv_dt = dt + torch.log(-torch.expm1(-dt))
-with torch.no_grad():
-module.dt_proj.bias.copy_(inv_dt)
+module.dt_proj.bias.copy_(inv_dt)
 module.dt_proj.bias._no_reinit = True

+nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
+if module.conv1d.bias is not None:
+if not getattr(module.conv1d.bias, "_no_reinit", False):
+nn.init.zeros_(module.conv1d.bias)
+nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))

+if self.config.rescale_prenorm_residual:
+# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+# > -- GPT-2 :: https://openai.com/blog/better-language-models/
+#
+# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+# We need to reinit p since this code could be called multiple times
+# Having just p *= scale would repeatedly scale it down
+p = module.out_proj.weight
+p /= math.sqrt(self.config.num_hidden_layers)

 if isinstance(module, nn.Linear):
+if not getattr(module.weight, "_no_reinit", False):
+nn.init.normal_(module.weight, std=std)
 if module.bias is not None:
 if not getattr(module.bias, "_no_reinit", False):
 nn.init.zeros_(module.bias)
+elif isinstance(module, MambaRMSNorm):
+module.weight.data.fill_(1.0)
 elif isinstance(module, nn.Embedding):
-nn.init.normal_(module.weight, std=self.config.initializer_range)
+nn.init.normal_(module.weight, std=std)

-if self.config.rescale_prenorm_residual:
-# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-# > -- GPT-2 :: https://openai.com/blog/better-language-models/
-#
-# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-for name, p in module.named_parameters():
-if name in ["out_proj.weight"]:
-# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-# We need to reinit p since this code could be called multiple times
-# Having just p *= scale would repeatedly scale it down
-nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-with torch.no_grad():
-p /= math.sqrt(self.config.num_hidden_layers)


 @dataclass
@@ -721,9 +721,15 @@ class Mamba2PreTrainedModel(PreTrainedModel):

 def _init_weights(self, module):
 """Initialize the weights."""
+std = self.config.initializer_range
 if isinstance(module, Mamba2Mixer):
+# S4D real initialization. These are not discretized!
+# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+A = torch.arange(1, self.config.num_heads + 1)
+module.A_log.copy_(torch.log(A))
 module.A_log._no_weight_decay = True
 module.D._no_weight_decay = True
+module.D.data.fill_(1.0)

 dt = torch.exp(
 torch.rand(self.config.num_heads)
@@ -733,33 +739,39 @@ class Mamba2PreTrainedModel(PreTrainedModel):

 # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
 inv_dt = dt + torch.log(-torch.expm1(-dt))
-with torch.no_grad():
-module.dt_bias.copy_(inv_dt)
+module.dt_bias.copy_(inv_dt)
 module.dt_bias._no_reinit = True

+nn.init.kaiming_uniform_(module.conv1d.weight, a=math.sqrt(5))
+if module.conv1d.bias is not None:
+if not getattr(module.conv1d.bias, "_no_reinit", False):
+nn.init.zeros_(module.conv1d.bias)
+nn.init.kaiming_uniform_(module.out_proj.weight, a=math.sqrt(5))

+if self.config.rescale_prenorm_residual:
+# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+# > -- GPT-2 :: https://openai.com/blog/better-language-models/
+#
+# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+# We need to reinit p since this code could be called multiple times
+# Having just p *= scale would repeatedly scale it down
+p = module.out_proj.weight
+p /= math.sqrt(self.config.num_hidden_layers)

 if isinstance(module, nn.Linear):
+if not getattr(module.weight, "_no_reinit", False):
+nn.init.normal_(module.weight, std=std)
 if module.bias is not None:
 if not getattr(module.bias, "_no_reinit", False):
 nn.init.zeros_(module.bias)
+elif isinstance(module, (Mamba2RMSNorm, MambaRMSNormGated)):
+module.weight.data.fill_(1.0)
 elif isinstance(module, nn.Embedding):
-nn.init.normal_(module.weight, std=self.config.initializer_range)
+nn.init.normal_(module.weight, std=std)

-if self.config.rescale_prenorm_residual:
-# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-# > -- GPT-2 :: https://openai.com/blog/better-language-models/
-#
-# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-for name, p in module.named_parameters():
-if name in ["out_proj.weight"]:
-# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-# We need to reinit p since this code could be called multiple times
-# Having just p *= scale would repeatedly scale it down
-nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-with torch.no_grad():
-p /= math.sqrt(self.config.num_hidden_layers)


 @dataclass
@@ -147,7 +147,7 @@ class MusicgenSinusoidalPositionalEmbedding(nn.Module):
 position_ids = (torch.arange(seq_len) + past_key_values_length).to(input_ids.device)
 # expand embeddings if needed
 if seq_len > self.weights.size(0):
-self.make_weights(seq_len + self.offset, self.embedding_dim)
+self.make_weights(seq_len, self.embedding_dim)
 return self.weights.index_select(0, position_ids.view(-1)).detach()


@@ -440,10 +440,13 @@ class MusicgenPreTrainedModel(PreTrainedModel):

 def _init_weights(self, module):
 std = self.config.initializer_factor
-if isinstance(module, (nn.Linear, nn.Conv1d)):
+if isinstance(module, nn.Linear):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.bias is not None:
 module.bias.data.zero_()
+elif isinstance(module, nn.LayerNorm):
+module.weight.data.fill_(1.0)
+module.bias.data.zero_()
 elif isinstance(module, nn.Embedding):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.padding_idx is not None:
@@ -154,7 +154,7 @@ class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
 position_ids = (torch.arange(seq_len) + past_key_values_length).to(inputs_embeds.device)
 # expand embeddings if needed
 if seq_len > self.weights.size(0):
-self.make_weights(seq_len + self.offset, self.embedding_dim)
+self.make_weights(seq_len, self.embedding_dim)
 return self.weights.index_select(0, position_ids.view(-1)).detach()


@@ -406,10 +406,13 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):

 def _init_weights(self, module):
 std = self.config.initializer_factor
-if isinstance(module, (nn.Linear, nn.Conv1d)):
+if isinstance(module, nn.Linear):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.bias is not None:
 module.bias.data.zero_()
+elif isinstance(module, nn.LayerNorm):
+module.weight.data.fill_(1.0)
+module.bias.data.zero_()
 elif isinstance(module, nn.Embedding):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.padding_idx is not None:
@@ -1286,7 +1289,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
 The text encoder model that encodes text into hidden states for conditioning.
 audio_encoder (`PreTrainedModel`, *optional*):
 The audio encoder model that encodes audio into hidden states for conditioning.
-decoder (`MusicgenForCausalLM`, *optional*):
+decoder (`MusicgenMelodyForCausalLM`, *optional*):
 The decoder model that generates audio tokens based on conditioning signals.
 """
 if config is None and None in (text_encoder, audio_encoder, decoder):
@@ -1006,10 +1006,15 @@ class OmDetTurboPreTrainedModel(PreTrainedModel):
 nn.init.xavier_uniform_(module.query_position_head.layers[1].weight)
 for layer in module.channel_projection_layers:
 nn.init.xavier_uniform_(layer[0].weight)
+elif isinstance(module, OmDetTurboLanguageBackbone):
+nn.init.normal_(module.text_projection, std=self.config.text_projection_in_dim**-0.5)
 elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
 module.weight.data.normal_(mean=0.0, std=self.config.init_std)
 if module.bias is not None:
 module.bias.data.zero_()
+elif isinstance(module, nn.LayerNorm):
+module.weight.data.fill_(1.0)
+module.bias.data.zero_()

 def _set_gradient_checkpointing(self, module, value=False):
 if isinstance(module, OmDetTurboDecoder):
@@ -283,6 +283,9 @@ class Qwen2AudioPreTrainedModel(PreTrainedModel):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.bias is not None:
 module.bias.data.zero_()
+elif isinstance(module, nn.LayerNorm):
+module.weight.data.fill_(1.0)
+module.bias.data.zero_()
 elif isinstance(module, nn.Embedding):
 module.weight.data.normal_(mean=0.0, std=std)
 if module.padding_idx is not None:
@@ -604,7 +604,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
 supports_gradient_checkpointing = True
 _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]

-def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+def _init_weights(self, module: nn.Module) -> None:
 """Initialize the weights"""
 std = self.config.initializer_range
 if isinstance(module, (nn.Linear, nn.Conv2d)):
@@ -615,7 +615,7 @@ class SegGptPreTrainedModel(PreTrainedModel):
 )
 if module.bias is not None:
 module.bias.data.zero_()
-elif isinstance(module, nn.LayerNorm):
+elif isinstance(module, (nn.LayerNorm, SegGptLayerNorm)):
 module.bias.data.zero_()
 module.weight.data.fill_(1.0)
 elif isinstance(module, SegGptAttention):
@@ -434,6 +434,10 @@ class SmolVLMProcessor(ProcessorMixin):
 if chat_template is None and has_video:
 # re-assign to the correct default template for BC, if user is not requesting their own template
 chat_template = DEFAULT_CHAT_TEMPLATE

+kwargs.setdefault("num_frames", self.video_processor.num_frames)
+kwargs.setdefault("fps", self.video_processor.fps)

 return super().apply_chat_template(conversation, chat_template, **kwargs)

@@ -551,17 +551,18 @@ class SuperGluePreTrainedModel(PreTrainedModel):

 def _init_weights(self, module: nn.Module) -> None:
 """Initialize the weights"""
-if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d)):
+if isinstance(module, (nn.Linear, nn.Conv2d)):
 # Slightly different from the TF version which uses truncated_normal for initialization
 # cf https://github.com/pytorch/pytorch/pull/5617
 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
 if module.bias is not None:
 module.bias.data.zero_()
-elif isinstance(module, nn.LayerNorm):
+elif isinstance(module, nn.BatchNorm1d):
 module.bias.data.zero_()
 module.weight.data.fill_(1.0)
-elif isinstance(module, SuperGlueMultiLayerPerceptron):
-nn.init.constant_(module.linear.bias, 0.0)
+if hasattr(module, "bin_score"):
+module.bin_score.data.fill_(1.0)


 @auto_docstring(
@@ -1097,9 +1097,13 @@ class ProcessorMixin(PushToHubMixin):
 processor_config=processor_dict, valid_kwargs=accepted_args_and_kwargs
 )

-# remove args that are in processor_dict to avoid duplicate arguments
-args_to_remove = [i for i, arg in enumerate(accepted_args_and_kwargs) if arg in processor_dict]
-args = [arg for i, arg in enumerate(args) if i not in args_to_remove]
+# update args that are already in processor_dict to avoid duplicate arguments
+args_to_update = {
+i: valid_kwargs.pop(arg)
+for i, arg in enumerate(accepted_args_and_kwargs)
+if (arg in valid_kwargs and i < len(args))
+}
+args = [arg if i not in args_to_update else args_to_update[i] for i, arg in enumerate(args)]

 # instantiate processor with used (and valid) kwargs only
 processor = cls(*args, **valid_kwargs)
@@ -1163,7 +1163,7 @@ def is_flash_attn_2_available():
 return False


-@lru_cache()
+@lru_cache
 def is_flash_attn_3_available():
 if not is_torch_available():
 return False
@@ -1786,7 +1786,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
 generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

 # Test output
-self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118])
+expected_ids = [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]
+self.assertEqual(predictions[0].tolist(), [50265] * 32 + expected_ids)  # 50265 is the img token id
 self.assertEqual("a woman sitting on the beach with a dog", generated_text)

 # image and context
@@ -1797,10 +1798,8 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
 generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

 # Test output
-self.assertEqual(
-predictions[0].tolist(),
-[2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118],
-)
+expected_ids = [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118]
+self.assertEqual(predictions[0].tolist(), [50265] * 32 + expected_ids)  # 50265 is the img token id
 self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach")

 @require_torch_multi_accelerator
@@ -1826,8 +1825,17 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
 generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

 # Test output
-self.assertEqual(predictions[0].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1])
-self.assertEqual("woman playing with dog on the beach", generated_text)
+expected_ids_and_text = Expectations(
+{
+("cuda", None): ([0, 2335, 1556, 28, 1782, 30, 8, 2608, 1], "woman playing with dog on the beach"),
+("rocm", (9, 5)): (
+[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
+"a woman is playing with her dog on the beach",
+),
+}
+).get_expectation()
+self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
+self.assertEqual(generated_text, expected_ids_and_text[1])

 # image and context
 prompt = "Question: which city is this? Answer:"
@@ -1837,11 +1845,17 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
 generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

 # Test output
-self.assertEqual(
-predictions[0].tolist(),
-[0, 3, 7, 152, 67, 839, 1],
-)
-self.assertEqual(generated_text, "san diego")
+expected_ids_and_text = Expectations(
+{
+("cuda", None): ([0, 3, 7, 152, 67, 839, 1], "san diego"),
+("rocm", (9, 5)): (
+[0, 3, 7, 152, 2515, 11389, 3523, 1],
+"san francisco",  # TODO: check if this is ok
+),
+}
+).get_expectation()
+self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
+self.assertEqual(generated_text, expected_ids_and_text[1])

 def test_expansion_in_processing(self):
 processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
@@ -570,9 +570,14 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
 expected_shape = torch.Size((1, 300, 256))
 self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
 expected_slice = torch.tensor(
-[[0.4222, 0.7471, 0.8760], [0.6395, -0.2729, 0.7127], [-0.3090, 0.7642, 0.9529]]
+[
+[0.4223, 0.7474, 0.8760],
+[0.6397, -0.2727, 0.7126],
+[-0.3089, 0.7643, 0.9529],
+]
 ).to(torch_device)
-torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
+torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=2e-4, atol=2e-4)

 def test_inference_object_detection_head(self):
 model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to(
@@ -592,26 +597,34 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
 expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels))
 self.assertEqual(outputs.logits.shape, expected_shape_logits)
 expected_slice_logits = torch.tensor(
-[[-10.4372, -5.7558, -8.6764], [-10.5410, -5.8704, -8.0590], [-10.6827, -6.3469, -8.3923]]
+[
+[-10.4371, -5.7565, -8.6765],
+[-10.5413, -5.8700, -8.0589],
+[-10.6824, -6.3477, -8.3927],
+]
 ).to(torch_device)
-torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4)
+torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=2e-4, atol=2e-4)

 expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
 self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
 expected_slice_boxes = torch.tensor(
-[[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]]
+[
+[0.7733, 0.6576, 0.4496],
+[0.5171, 0.1184, 0.9095],
+[0.8846, 0.5647, 0.2486],
+]
 ).to(torch_device)
-torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4)
+torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=2e-4, atol=2e-4)

 # verify postprocessing
 results = image_processor.post_process_object_detection(
 outputs, threshold=0.3, target_sizes=[image.size[::-1]]
 )[0]
-expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355]).to(torch_device)
+expected_scores = torch.tensor([0.8330, 0.8315, 0.8039, 0.6829, 0.5354]).to(torch_device)
 expected_labels = [75, 17, 17, 75, 63]
-expected_slice_boxes = torch.tensor([38.3089, 72.1022, 177.6293, 118.4512]).to(torch_device)
+expected_slice_boxes = torch.tensor([38.3109, 72.1002, 177.6301, 118.4511]).to(torch_device)

 self.assertEqual(len(results["scores"]), 5)
-torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4)
+torch.testing.assert_close(results["scores"], expected_scores, rtol=2e-4, atol=2e-4)
 self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
 torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes)
@@ -286,9 +286,9 @@ class ConvNextModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)

-        expected_slice = torch.tensor([-0.0260, -0.4739, 0.1911]).to(torch_device)
+        expected_slice = torch.tensor([-0.0261, -0.4739, 0.1910]).to(torch_device)

-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)


 @require_torch
@@ -185,6 +185,10 @@ class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_model_get_set_embeddings(self):
         pass

+    # Larger differences on A10 than T4
+    def test_batching_equivalence(self, atol=2e-4, rtol=2e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)

@@ -265,6 +269,6 @@ class CvtModelIntegrationTest(unittest.TestCase):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)

-        expected_slice = torch.tensor([0.9285, 0.9015, -0.3150]).to(torch_device)
+        expected_slice = torch.tensor([0.9287, 0.9016, -0.3152]).to(torch_device)

-        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
@@ -758,6 +758,7 @@ def prepare_img():

 @require_torch
 @require_vision
+@slow
 class DFineModelIntegrationTest(unittest.TestCase):
     @cached_property
     def default_image_processor(self):

@@ -778,37 +779,38 @@ class DFineModelIntegrationTest(unittest.TestCase):

         expected_logits = torch.tensor(
             [
-                [-3.8097816, -4.7724586, -5.994499],
-                [-5.2974715, -9.499067, -6.1653666],
-                [-5.3502765, -3.9530406, -6.3630295],
+                [-3.8221, -4.7679, -6.0063],
+                [-5.2994, -9.5009, -6.1697],
+                [-5.3103, -3.8005, -6.2972],
             ]
         ).to(torch_device)
         expected_boxes = torch.tensor(
             [
-                [0.7677696, 0.41479152, 0.46441072],
-                [0.16912134, 0.19869131, 0.2123824],
-                [0.2581653, 0.54818195, 0.47512347],
+                [0.7678, 0.4148, 0.4644],
+                [0.1691, 0.1987, 0.2124],
+                [0.2582, 0.5482, 0.4751],
             ]
         ).to(torch_device)

-        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, atol=1e-4, rtol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, atol=2e-4, rtol=2e-4)

         expected_shape_boxes = torch.Size((1, 300, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4, rtol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=2e-4, rtol=2e-4)

         # verify postprocessing
         results = image_processor.post_process_object_detection(
             outputs, threshold=0.0, target_sizes=[image.size[::-1]]
         )[0]
-        expected_scores = torch.tensor([0.9642, 0.9542, 0.9536, 0.8548], device=torch_device)
+        expected_scores = torch.tensor([0.9616, 0.9541, 0.9541, 0.8551], device=torch_device)
         expected_labels = [15, 65, 15, 57]
         expected_slice_boxes = torch.tensor(
             [
-                [1.3186283e01, 5.4130211e01, 3.1726535e02, 4.7212445e02],
-                [4.0275269e01, 7.2975174e01, 1.7620003e02, 1.1776848e02],
-                [3.4276117e02, 2.3427944e01, 6.3998401e02, 3.7477191e02],
-                [5.8418274e-01, 1.1794567e00, 6.3933154e02, 4.7485995e02],
+                [1.3358e01, 5.4123e01, 3.1726e02, 4.7222e02],
+                [4.0274e01, 7.2972e01, 1.7620e02, 1.1777e02],
+                [3.4270e02, 2.3427e01, 6.3998e02, 3.7476e02],
+                [5.7796e-01, 1.1773e00, 6.3933e02, 4.7486e02],
             ],
             device=torch_device,
         )
@@ -787,7 +787,11 @@ class DabDetrModelIntegrationTests(unittest.TestCase):
         expected_shape = torch.Size((1, 300, 256))
         self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
         expected_slice = torch.tensor(
-            [[-0.4879, -0.2594, 0.4524], [-0.4997, -0.4258, 0.4329], [-0.8220, -0.4996, 0.0577]]
+            [
+                [-0.4878, -0.2593, 0.4521],
+                [-0.4999, -0.4257, 0.4326],
+                [-0.8220, -0.4997, 0.0578],
+            ]
         ).to(torch_device)
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=2e-4, rtol=2e-4)

@@ -806,26 +810,34 @@ class DabDetrModelIntegrationTests(unittest.TestCase):
         expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels))
         self.assertEqual(outputs.logits.shape, expected_shape_logits)
         expected_slice_logits = torch.tensor(
-            [[-10.1765, -5.5243, -8.9324], [-9.8138, -5.6721, -7.5161], [-10.3054, -5.6081, -8.5931]]
+            [
+                [-10.1764, -5.5247, -8.9324],
+                [-9.8137, -5.6730, -7.5163],
+                [-10.3056, -5.6075, -8.5935],
+            ]
         ).to(torch_device)
         torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, atol=3e-4, rtol=3e-4)

         expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
         expected_slice_boxes = torch.tensor(
-            [[0.3708, 0.3000, 0.2753], [0.5211, 0.6125, 0.9495], [0.2897, 0.6730, 0.5459]]
+            [
+                [0.3708, 0.3000, 0.2754],
+                [0.5211, 0.6126, 0.9494],
+                [0.2897, 0.6731, 0.5460],
+            ]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4, rtol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=3e-4, rtol=3e-4)

         # verify postprocessing
         results = image_processor.post_process_object_detection(
             outputs, threshold=0.3, target_sizes=[image.size[::-1]]
         )[0]
-        expected_scores = torch.tensor([0.8732, 0.8563, 0.8554, 0.6079, 0.5896]).to(torch_device)
+        expected_scores = torch.tensor([0.8732, 0.8563, 0.8554, 0.6080, 0.5895]).to(torch_device)
         expected_labels = [17, 75, 17, 75, 63]
-        expected_boxes = torch.tensor([14.6970, 49.3892, 320.5165, 469.2765]).to(torch_device)
+        expected_boxes = torch.tensor([14.6931, 49.3886, 320.5176, 469.2762]).to(torch_device)

         self.assertEqual(len(results["scores"]), 5)
-        torch.testing.assert_close(results["scores"], expected_scores, atol=1e-4, rtol=1e-4)
+        torch.testing.assert_close(results["scores"], expected_scores, atol=3e-4, rtol=3e-4)
         self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        torch.testing.assert_close(results["boxes"][0, :], expected_boxes, atol=1e-4, rtol=1e-4)
+        torch.testing.assert_close(results["boxes"][0, :], expected_boxes, atol=3e-4, rtol=3e-4)
@@ -677,30 +677,38 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
         self.assertEqual(outputs.logits.shape, expected_shape_logits)

         expected_logits = torch.tensor(
-            [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]]
+            [
+                [-9.6644, -4.3434, -5.8707],
+                [-9.7035, -3.8503, -5.0721],
+                [-10.5633, -5.3387, -7.5119],
+            ]
         ).to(torch_device)
         expected_boxes = torch.tensor(
-            [[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]
+            [
+                [0.8693, 0.2290, 0.2492],
+                [0.3150, 0.5489, 0.5845],
+                [0.5563, 0.7580, 0.8518],
+            ]
         ).to(torch_device)

-        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=2e-4, atol=2e-4)

         expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=2e-4, atol=2e-4)

         # verify postprocessing
         results = image_processor.post_process_object_detection(
             outputs, threshold=0.3, target_sizes=[image.size[::-1]]
         )[0]
-        expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device)
+        expected_scores = torch.tensor([0.7999, 0.7895, 0.6332, 0.4719, 0.4382]).to(torch_device)
         expected_labels = [17, 17, 75, 75, 63]
-        expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device)
+        expected_slice_boxes = torch.tensor([16.4960, 52.8387, 318.2565, 470.7831]).to(torch_device)

         self.assertEqual(len(results["scores"]), 5)
-        torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(results["scores"], expected_scores, rtol=2e-4, atol=2e-4)
         self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes)
+        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes, rtol=2e-4, atol=2e-4)

     def test_inference_object_detection_head_with_box_refine_two_stage(self):
         model = DeformableDetrForObjectDetection.from_pretrained(

@@ -720,17 +728,25 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
         self.assertEqual(outputs.logits.shape, expected_shape_logits)

         expected_logits = torch.tensor(
-            [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]]
+            [
+                [-6.7112, -4.3216, -6.3781],
+                [-8.9035, -6.1738, -6.7249],
+                [-6.9314, -4.4736, -6.2303],
+            ]
         ).to(torch_device)
         expected_boxes = torch.tensor(
-            [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]
+            [
+                [0.2582, 0.5499, 0.4683],
+                [0.7652, 0.9084, 0.4884],
+                [0.5490, 0.2763, 0.0564],
+            ]
         ).to(torch_device)

-        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=2e-4, atol=2e-4)

         expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=2e-4, atol=2e-4)

     @require_torch_accelerator
     def test_inference_object_detection_head_equivalence_cpu_accelerator(self):

@@ -753,10 +769,15 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
         gpu_outputs = model(pixel_values.to(torch_device), pixel_mask.to(torch_device))

         # 3. assert equivalence
+        # (on A10, the differences get larger than on T4)
         for key in cpu_outputs.keys():
-            assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4)
+            torch.testing.assert_close(cpu_outputs[key], gpu_outputs[key].cpu(), atol=2e-2, rtol=2e-2)

         expected_logits = torch.tensor(
-            [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]]
+            [
+                [-9.9051, -4.2541, -6.4852],
+                [-9.6947, -4.0854, -6.8033],
+                [-10.0665, -5.8470, -7.7003],
+            ]
         )
-        assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4)
+        assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=2e-4)
@@ -586,9 +586,13 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
         expected_shape = torch.Size((1, 100, 256))
         assert outputs.last_hidden_state.shape == expected_shape
         expected_slice = torch.tensor(
-            [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]]
+            [
+                [0.0622, -0.5142, -0.4034],
+                [-0.7628, -0.4935, -1.7153],
+                [-0.4751, -0.6386, -0.7818],
+            ]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=2e-4, atol=2e-4)

     def test_inference_object_detection_head(self):
         model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device)

@@ -606,16 +610,24 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
         expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1))
         self.assertEqual(outputs.logits.shape, expected_shape_logits)
         expected_slice_logits = torch.tensor(
-            [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]]
+            [
+                [-19.1211, -0.0881, -11.0188],
+                [-17.3641, -1.8045, -14.0229],
+                [-20.0415, -0.5833, -11.1005],
+            ]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=2e-4, atol=2e-4)

         expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
         expected_slice_boxes = torch.tensor(
-            [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]]
+            [
+                [0.4433, 0.5302, 0.8852],
+                [0.5494, 0.2517, 0.0529],
+                [0.4998, 0.5360, 0.9955],
+            ]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=2e-4, atol=2e-4)

         # verify postprocessing
         results = image_processor.post_process_object_detection(

@@ -623,12 +635,12 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
         )[0]
         expected_scores = torch.tensor([0.9982, 0.9960, 0.9955, 0.9988, 0.9987]).to(torch_device)
         expected_labels = [75, 75, 63, 17, 17]
-        expected_slice_boxes = torch.tensor([40.1633, 70.8115, 175.5471, 117.9841]).to(torch_device)
+        expected_slice_boxes = torch.tensor([40.1615, 70.8090, 175.5476, 117.9810]).to(torch_device)

         self.assertEqual(len(results["scores"]), 5)
-        torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(results["scores"], expected_scores, rtol=2e-4, atol=2e-4)
         self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes)
+        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes, rtol=2e-4, atol=2e-4)

     def test_inference_panoptic_segmentation_head(self):
         model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device)

@@ -646,23 +658,27 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
         expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1))
         self.assertEqual(outputs.logits.shape, expected_shape_logits)
         expected_slice_logits = torch.tensor(
-            [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]]
+            [
+                [-18.1523, -1.7592, -13.5019],
+                [-16.8866, -1.4139, -14.1025],
+                [-17.5735, -2.5090, -11.8666],
+            ]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=2e-4, atol=2e-4)

         expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
         self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
         expected_slice_boxes = torch.tensor(
-            [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]]
+            [[0.5344, 0.1790, 0.9284], [0.4421, 0.0571, 0.0875], [0.6632, 0.6886, 0.1015]]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=2e-4, atol=2e-4)

         expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267))
         self.assertEqual(outputs.pred_masks.shape, expected_shape_masks)
         expected_slice_masks = torch.tensor(
-            [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]]
+            [[-7.8408, -11.0104, -12.1279], [-12.0299, -16.6498, -17.9806], [-14.8995, -19.9940, -20.5646]]
         ).to(torch_device)
-        torch.testing.assert_close(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, rtol=1e-3, atol=1e-3)
+        torch.testing.assert_close(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, rtol=2e-3, atol=2e-3)

         # verify postprocessing
         results = image_processor.post_process_panoptic_segmentation(

@@ -674,7 +690,7 @@ class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase):
             torch_device
         )
         expected_number_of_segments = 5
-        expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994097}
+        expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.9941}

         number_of_unique_segments = len(torch.unique(results["segmentation"]))
         self.assertTrue(

@@ -716,6 +732,10 @@ class DetrModelIntegrationTests(unittest.TestCase):
         expected_shape = torch.Size((1, 100, 256))
         assert outputs.last_hidden_state.shape == expected_shape
         expected_slice = torch.tensor(
-            [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]]
+            [
+                [0.0622, -0.5142, -0.4034],
+                [-0.7628, -0.4935, -1.7153],
+                [-0.4751, -0.6386, -0.7818],
+            ]
         ).to(torch_device)
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@@ -310,12 +310,13 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)

     def test_feed_forward_chunking(self):
         (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+        # original_config.norm_type = "time_group_norm"
         for model_class in self.all_model_classes:
             torch.manual_seed(0)
             config = copy.deepcopy(original_config)
             config.chunk_length_s = None
             config.overlap = None
-            config.sampling_rate = 10
+            config.sampling_rate = 20

             model = model_class(config)
             model.to(torch_device)

@@ -326,9 +327,9 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
             hidden_states_no_chunk = model(**inputs)[1]

             torch.manual_seed(0)
-            config.chunk_length_s = 1
+            config.chunk_length_s = 2
             config.overlap = 0
-            config.sampling_rate = 10
+            config.sampling_rate = 20

             model = model_class(config)
             model.to(torch_device)
@@ -84,10 +84,11 @@ class EomtImageProcessingTester:
             "num_labels": self.num_labels,
         }

-    def prepare_fake_eomt_outputs(self, batch_size):
+    def prepare_fake_eomt_outputs(self, batch_size, patch_offsets=None):
         return EomtForUniversalSegmentationOutput(
             masks_queries_logits=torch.randn((batch_size, self.num_queries, self.height, self.width)),
             class_queries_logits=torch.randn((batch_size, self.num_queries, self.num_classes + 1)),
+            patch_offsets=patch_offsets,
         )

     def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):

@@ -263,13 +264,13 @@ class EomtImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
         image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

         inputs = processor(images=image, do_split_image=True, return_tensors="pt")
-        patch_offsets = inputs.pop("patch_offsets")
+        patch_offsets = inputs["patch_offsets"]

-        original_sizes = [image.size[::-1]]
+        target_sizes = [image.size[::-1]]

         # For semantic segmentation, the BS of output is 2 coz, two patches are created for the image.
-        outputs = self.image_processor_tester.prepare_fake_eomt_outputs(inputs["pixel_values"].shape[0])
-        segmentation = processor.post_process_semantic_segmentation(outputs, patch_offsets, original_sizes)
+        outputs = self.image_processor_tester.prepare_fake_eomt_outputs(inputs["pixel_values"].shape[0], patch_offsets)
+        segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes)

         self.assertEqual(segmentation[0].shape, (image.height, image.width))
@@ -17,12 +17,13 @@ import unittest

 import requests

-from transformers import AutoImageProcessor, EomtConfig, EomtForUniversalSegmentation
+from transformers import AutoImageProcessor, EomtConfig, EomtForUniversalSegmentation, pipeline
 from transformers.testing_utils import require_torch, require_torch_accelerator, require_torch_fp16, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin


 if is_torch_available():

@@ -100,8 +101,9 @@ class EomtForUniversalSegmentationTester:


 @require_torch
-class EomtForUniversalSegmentationTest(ModelTesterMixin, unittest.TestCase):
+class EomtForUniversalSegmentationTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (EomtForUniversalSegmentation,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-segmentation": EomtForUniversalSegmentation} if is_torch_available() else {}
     is_encoder_decoder = False
     test_pruning = False
     test_head_masking = False

@@ -340,7 +342,6 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
         image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

         inputs = processor(images=image, return_tensors="pt").to(model.device)
-        patch_offsets = inputs.pop("patch_offsets", None)

         with torch.inference_mode():
             outputs = model(**inputs)

@@ -348,11 +349,9 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
         self.assertTrue(outputs.class_queries_logits.shape == (2, 100, 151))
         self.assertTrue(outputs.masks_queries_logits.shape == (2, 100, 128, 128))

-        preds = processor.post_process_semantic_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])], patch_offsets=patch_offsets
-        )
+        preds = processor.post_process_semantic_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]

-        self.assertTrue(preds.shape[1:] == (image.size[1], image.size[0]))
+        self.assertTrue(preds.shape == (image.size[1], image.size[0]))

         # fmt: off
         EXPECTED_SLICE = torch.tensor([

@@ -369,7 +368,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
         ], device=model.device)
         # fmt: on

-        output_slice = preds[0, :10, :10]
+        output_slice = preds[:10, :10]
         torch.testing.assert_close(output_slice, EXPECTED_SLICE, rtol=1e-2, atol=1e-2)

     @slow

@@ -387,9 +386,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
         self.assertTrue(outputs.class_queries_logits.shape == (1, 200, 134))
         self.assertTrue(outputs.masks_queries_logits.shape == (1, 200, 160, 160))

-        preds = processor.post_process_panoptic_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])]
-        )[0]
+        preds = processor.post_process_panoptic_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]
         segmentation, segments_info = preds["segmentation"], preds["segments_info"]

         # fmt: off

@@ -438,9 +435,7 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
         self.assertTrue(outputs.class_queries_logits.shape == (1, 200, 81))
         self.assertTrue(outputs.masks_queries_logits.shape == (1, 200, 160, 160))

-        preds = processor.post_process_instance_segmentation(
-            outputs, original_image_sizes=[(image.size[1], image.size[0])]
-        )[0]
+        preds = processor.post_process_instance_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])[0]
         segmentation, segments_info = preds["segmentation"], preds["segments_info"]

         # fmt: off

@@ -473,3 +468,15 @@ class EomtForUniversalSegmentationIntegrationTest(unittest.TestCase):
             self.assertEqual(actual["id"], expected["id"])
             self.assertEqual(actual["label_id"], expected["label_id"])
             self.assertAlmostEqual(actual["score"], expected["score"], delta=1e-3)
+
+    @slow
+    def test_segmentation_pipeline(self):
+        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        pipe = pipeline(model=self.model_id, subtask="panoptic", device=torch_device)
+        output = pipe(image)
+
+        EXPECTED_OUTPUT_LABELS = ["cat", "cat", "couch", "remote", "remote"]
+
+        output_labels = [segment["label"] for segment in output]
+        self.assertEqual(output_labels, EXPECTED_OUTPUT_LABELS)
@@ -33,7 +33,7 @@ from transformers.testing_utils import (

 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin


@@ -359,9 +359,11 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest

     def test_initialization(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        config.rescale_prenorm_residual = True
+
+        configs_no_init = _config_zero_init(config)
         for model_class in self.all_model_classes:
-            model = model_class(config=config)
+            model = model_class(config=configs_no_init)
             for name, param in model.named_parameters():
                 if "dt_proj.bias" in name:
                     dt = torch.exp(

@@ -380,6 +382,19 @@ class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
                     if param.requires_grad:
                         # check if it's a ones like
                         torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
+                else:
+                    if param.requires_grad:
+                        if (
+                            "mixer.conv1d.weight" in name
+                            or "mixer.dt_proj.weight" in name
+                            or "mixer.out_proj.weight" in name
+                        ):
+                            continue
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )

     @slow
     # Ignore copy
@@ -69,16 +69,15 @@ class Glm4vVisionText2TextModelTester:
         is_training=True,
         text_config={
             "vocab_size": 99,
-            "hidden_size": 32,
-            "intermediate_size": 37,
-            "num_hidden_layers": 4,
-            "num_attention_heads": 4,
-            "num_key_value_heads": 2,
+            "hidden_size": 16,
+            "intermediate_size": 22,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 2,
+            "num_key_value_heads": 1,
             "output_channels": 64,
             "hidden_act": "silu",
             "max_position_embeddings": 512,
             "rope_scaling": {"type": "default", "mrope_section": [2, 1, 1]},
-            "max_window_layers": 3,
             "rope_theta": 10000,
             "tie_word_embeddings": True,
             "bos_token_id": 0,

@@ -87,11 +86,10 @@ class Glm4vVisionText2TextModelTester:
         },
         vision_config={
             "depth": 2,
-            "embed_dim": 32,
             "hidden_act": "silu",
-            "hidden_size": 32,
-            "mlp_ratio": 4,
-            "num_heads": 4,
+            "hidden_size": 48,
+            "out_hidden_size": 16,
+            "intermediate_size": 22,
             "patch_size": 14,
             "spatial_merge_size": 1,
             "temporal_patch_size": 2,

@@ -239,10 +237,6 @@ class Glm4vModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase)
     def test_multi_gpu_data_parallel_forward(self):
         pass

-    @unittest.skip(reason="We cannot configure to output a smaller model.")
-    def test_model_is_small(self):
-        pass
-
     @unittest.skip("Error with compilation")
     def test_generate_from_inputs_embeds_with_static_cache(self):
         pass
@@ -586,6 +586,8 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
                     or "value_proj" in name
                     or "output_proj" in name
                     or "reference_points" in name
+                    or "vision_proj" in name
+                    or "text_proj" in name
                 ):
                     continue
                 self.assertIn(
@@ -25,6 +25,7 @@ from transformers.testing_utils import (
     require_read_token,
     require_torch,
     require_torch_accelerator,
+    run_test_using_subprocess,
     slow,
     torch_device,
 )

@@ -96,36 +97,28 @@ class LlamaModelTest(CausalLMModelTest, unittest.TestCase):


 @require_torch_accelerator
+@require_read_token
 class LlamaIntegrationTest(unittest.TestCase):
+    def setup(self):
+        cleanup(torch_device, gc_collect=True)
+
     def tearDown(self):
         # TODO (joao): automatic compilation, i.e. compilation when `cache_implementation="static"` is used, leaves
         # some memory allocated in the cache, which means some object is not being released properly. This causes some
         # unoptimal memory usage, e.g. after certain tests a 7B model in FP16 no longer fits in a 24GB GPU.
         # Investigate the root cause.
-        cleanup(torch_device, gc_collect=False)
+        cleanup(torch_device, gc_collect=True)

     @slow
-    @require_read_token
     def test_llama_3_1_hard(self):
         """
         An integration test for llama 3.1. It tests against a long output to ensure the subtle numerical differences
         from llama 3.1.'s RoPE can be detected
         """
-        # diff on `EXPECTED_TEXT`:
-        # 2024-08-26: updating from torch 2.3.1 to 2.4.0 slightly changes the results.
-        expected_base_text = (
-            "Tell me about the french revolution. The french revolution was a period of radical political and social "
-            "upheaval in France that lasted from 1789 until 1799. It was a time of great change and upheaval, marked "
-            "by the overthrow of the monarchy, the rise of the middle class, and the eventual establishment of the "
-            "First French Republic.\nThe revolution began in 1789 with the Estates-General, a representative "
-            "assembly that had not met since 1614. The Third Estate, which represented the common people, "
-            "demanded greater representation and eventually broke away to form the National Assembly. This marked "
-            "the beginning of the end of the absolute monarchy and the rise of the middle class.\n"
-        )
         expected_texts = Expectations(
             {
-                ("rocm", (9, 5)): expected_base_text.replace("political and social", "social and political"),
-                ("cuda", None): expected_base_text,
+                ("rocm", (9, 5)): 'Tell me about the french revolution. The french revolution was a period of radical social and political upheaval in France that lasted from 1789 until 1799. It was a time of great change and upheaval, marked by the overthrow of the monarchy, the rise of the middle class, and the eventual establishment of the First French Republic.\nThe revolution began in 1789 with the Estates-General, a representative assembly that had not met since 1614. The Third Estate, which represented the common people, demanded greater representation and eventually broke away to form the National Assembly. This marked the beginning of the end of the absolute monarchy and the rise of the middle class.\n',
+                ("cuda", None): 'Tell me about the french revolution. The french revolution was a period of radical political and social upheaval in France that lasted from 1789 until 1799. It was a time of great change and upheaval, marked by the overthrow of the monarchy, the rise of the middle class, and the eventual establishment of the First French Republic.\nThe revolution began in 1789 with the Estates-General, a representative assembly that had not met since 1614. The Third Estate, which represented the common people, demanded greater representation and eventually broke away to form the National Assembly. The National Assembly adopted the Declaration of the Rights of Man and of the Citizen, which enshr',
             }
         ) # fmt: skip
         EXPECTED_TEXT = expected_texts.get_expectation()

@@ -142,7 +135,6 @@ class LlamaIntegrationTest(unittest.TestCase):
         self.assertEqual(generated_text, EXPECTED_TEXT)

     @slow
-    @require_read_token
     def test_model_7b_logits_bf16(self):
         input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]


@@ -191,7 +183,6 @@ class LlamaIntegrationTest(unittest.TestCase):
         )

     @slow
-    @require_read_token
     def test_model_7b_logits(self):
         input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]


@@ -240,6 +231,9 @@ class LlamaIntegrationTest(unittest.TestCase):
             )
         )

+    # TODO: check why we have the following strange situation.
+    # without running in subprocess, this test causes subsequent tests failing with `RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0!`
+    @run_test_using_subprocess
     @slow
     def test_model_7b_dola_generation(self):
         # ground truth text generated with dola_layers="low", repetition_penalty=1.2

@@ -265,7 +259,6 @@ class LlamaIntegrationTest(unittest.TestCase):

     @slow
     @require_torch_accelerator
-    @require_read_token
     def test_compile_static_cache(self):
         # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2
         # work as intended. See https://github.com/pytorch/pytorch/issues/121943

@@ -306,7 +299,6 @@ class LlamaIntegrationTest(unittest.TestCase):
         self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text)

     @slow
-    @require_read_token
     def test_export_static_cache(self):
         if version.parse(torch.__version__) < version.parse("2.4.0"):
             self.skipTest(reason="This test requires torch >= 2.4 to run.")

@@ -407,6 +407,8 @@ class LlamaIntegrationTest(unittest.TestCase):
         self.tokenizer.add_eos_token = False
         self.rust_tokenizer.add_eos_token = False

+    # See internal discussion: https://huggingface.slack.com/archives/C01NE71C4F7/p1750680376085749?thread_ts=1750676268.233309&cid=C01NE71C4F7
+    @unittest.skip("failing, won't fix")
     @slow
     def test_conversion(self):
         # This is excruciatingly slow since it has to recreate the entire merge
@@ -24,7 +24,7 @@ from transformers.testing_utils import require_torch, require_torch_multi_gpu, s

 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin


@@ -326,9 +326,11 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi

     def test_initialization(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        config.rescale_prenorm_residual = True
+
+        configs_no_init = _config_zero_init(config)
         for model_class in self.all_model_classes:
-            model = model_class(config=config)
+            model = model_class(config=configs_no_init)
             for name, param in model.named_parameters():
                 if "dt_proj.bias" in name:
                     dt = torch.exp(

@@ -347,6 +349,19 @@ class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
                     if param.requires_grad:
                         # check if it's a ones like
                         torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
+                else:
+                    if param.requires_grad:
+                        if (
+                            "mixer.conv1d.weight" in name
+                            or "mixer.dt_proj.weight" in name
+                            or "mixer.out_proj.weight" in name
+                        ):
+                            continue
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )

     @slow
     def test_model_from_pretrained(self):
@ -13,6 +13,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import math
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import AutoTokenizer, Mamba2Config, is_torch_available
|
from transformers import AutoTokenizer, Mamba2Config, is_torch_available
|
||||||
@ -28,7 +29,7 @@ from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba
|
|||||||
|
|
||||||
from ...generation.test_utils import GenerationTesterMixin
|
from ...generation.test_utils import GenerationTesterMixin
|
||||||
from ...test_configuration_common import ConfigTester
|
from ...test_configuration_common import ConfigTester
|
||||||
from ...test_modeling_common import ModelTesterMixin, ids_tensor
|
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
|
||||||
from ...test_pipeline_mixin import PipelineTesterMixin
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
||||||
|
|
||||||
|
|
@ -276,14 +277,37 @@ class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix

     def test_initialization(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        config.rescale_prenorm_residual = True

+        configs_no_init = _config_zero_init(config)
         for model_class in self.all_model_classes:
-            model = model_class(config=config)
+            model = model_class(config=configs_no_init)
             for name, param in model.named_parameters():
-                if "D" in name:
+                if "dt_proj.bias" in name:
+                    dt = torch.exp(
+                        torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min))
+                        + math.log(config.time_step_min)
+                    ).clamp(min=config.time_step_floor)
+                    inv_dt = dt + torch.log(-torch.expm1(-dt))
+                    if param.requires_grad:
+                        self.assertTrue(param.data.max().item() <= inv_dt[1])
+                        self.assertTrue(param.data.min().item() >= inv_dt[0])
+                elif "A_log" in name:
+                    A = torch.arange(1, config.num_heads + 1)
+                    torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5)
+                elif "D" in name:
                     if param.requires_grad:
                         # check if it's a ones like
                         torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5)
+                else:
+                    if param.requires_grad:
+                        if "mixer.conv1d.weight" in name or "mixer.dt_bias" in name or "mixer.out_proj.weight" in name:
+                            continue
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )

     @unittest.skip(reason="Mamba 2 weights are not tied")
     def test_tied_weights_keys(self):
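A quick sanity check (not part of the diff) of the bounds used in the test above: inv_dt = dt + log(-expm1(-dt)) is the inverse of softplus, so the asserted interval corresponds to dt values between time_step_min and time_step_max after a softplus. A minimal sketch, with the two example bounds being placeholder values:

import torch
import torch.nn.functional as F

# placeholder dt bounds, standing in for (config.time_step_min, config.time_step_max)
dt = torch.tensor([0.001, 0.1])
# inverse softplus: softplus(inv_dt) recovers dt exactly
inv_dt = dt + torch.log(-torch.expm1(-dt))
print(torch.allclose(F.softplus(inv_dt), dt))  # True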
@ -629,6 +629,7 @@ class OmDetTurboModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
                     or "decoder.channel_projection_layers" in name
                     or "query_position_head" in name
                     or "decoder.encoder_vision_features" in name
+                    or "language_backbone.text_projection" in name
                 ):
                     continue
                 self.assertIn(
@ -536,23 +536,24 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):
                 ).content
             )
         )
-        self.image2 = Image.open(
-            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
-        )
-        self.image3 = Image.open(
-            BytesIO(
-                requests.get(
-                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
-                ).content
-            )
-        )
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
+                    },
+                    {"type": "text", "text": "Describe this video in detail"},
+                ],
+            },
+        ]

     def tearDown(self):
         cleanup(torch_device, gc_collect=True)

     @slow
-    # TODO (Orr?) this is a dummy test to check if the model generates things that make sense.
-    # Needs to be expanded to a tiny video
     def test_integration_test(self):
         model = SmolVLMForConditionalGeneration.from_pretrained(
             "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
@ -571,3 +572,26 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):

         expected_generated_text = "\n\n\n\nIn this image, we see a view of the Statue of Liberty and the"
         self.assertEqual(generated_texts[0], expected_generated_text)

+    @slow
+    def test_integration_test_video(self):
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+
+        # Create inputs
+        inputs = self.processor.apply_chat_template(
+            self.video_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=torch_device, dtype=torch.bfloat16)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=20)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        expected_generated_text = 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature'  # fmt: skip
+        self.assertEqual(generated_texts[0], expected_generated_text)
@ -153,10 +153,18 @@ class TimmWrapperModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
     def test_retain_grad_hidden_states_attentions(self):
         pass

+    @unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
+    def test_can_init_all_missing_weights(self):
+        pass
+
     @unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
     def test_initialization(self):
         pass

+    @unittest.skip(reason="TimmWrapper initialization is managed on the timm side")
+    def test_mismatched_shapes_have_properly_initialized_weights(self):
+        pass
+
     @unittest.skip(reason="Need to use a timm model and there is no tiny model available.")
     def test_model_is_small(self):
         pass
@ -27,6 +27,7 @@ from transformers import (
     AutoTokenizer,
     BitsAndBytesConfig,
     pipeline,
+    set_seed,
 )
 from transformers.models.opt.modeling_opt import OPTAttention
 from transformers.testing_utils import (
@ -111,6 +112,8 @@ class Base4bitTest(unittest.TestCase):
     EXPECTED_OUTPUTS.add("Hello my name is John Doe, I am a student at the University")
     EXPECTED_OUTPUTS.add("Hello my name is John and I am 25 years old.")
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a student at the University of")
+    # Expected values on Intel XPU and NV A100
+    EXPECTED_OUTPUTS.add("Hello my name is Alina. I have been working as a professional")
     MAX_NEW_TOKENS = 10

     def setUp(self):
@ -513,6 +516,8 @@ class Pipeline4BitTest(Base4bitTest):
             max_new_tokens=self.MAX_NEW_TOKENS,
         )

+        # Avoid sampling different outputs
+        set_seed(42)
         # Real second forward pass
         pipeline_output = self.pipe(self.input_text)
         self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
@ -27,6 +27,7 @@ from transformers import (
     AutoTokenizer,
     BitsAndBytesConfig,
     pipeline,
+    set_seed,
 )
 from transformers.models.opt.modeling_opt import OPTAttention
 from transformers.testing_utils import (
@ -113,6 +114,8 @@ class BaseMixedInt8Test(unittest.TestCase):
     MAX_NEW_TOKENS = 10
     # Expected values with offload
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer based in")
+    # Expected values on Intel XPU and NV A100
+    EXPECTED_OUTPUTS.add("Hello my name is Alina. I have been working as a professional")

     def setUp(self):
         # Models and tokenizer
@ -649,6 +652,8 @@ class MixedInt8TestPipeline(BaseMixedInt8Test):
             max_new_tokens=self.MAX_NEW_TOKENS,
         )

+        # Avoid sampling different outputs
+        set_seed(42)
         # Real second forward pass
         pipeline_output = self.pipe(self.input_text)
         self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
@ -855,7 +855,7 @@ class ModelTesterMixin:
         # For now, skip everything older than 2025 and "important models" (too much models to patch otherwise)
         # Use `supports_cache_class` as a proxy to judge "important" models in order to prioritize them
         # TODO: relax this as we patch more and more models
-        if addition_year < 2025 and not model_class._supports_cache_class:
+        if addition_year < 2024 and not model_class._supports_cache_class:
             self.skipTest(reason=f"{model_class} is not a priorited model for now.")

         # Monkey patch the method to add a seed (we do it on PreTrainedModel._initialize_weights, which wraps
@ -895,6 +895,11 @@ class ModelTesterMixin:
                 model_from_config.state_dict().items(), model_from_pretrained.state_dict().items()
             ):
                 self.assertEqual(k1, k2, "The keys from each model should be the same")

+                # In case using torch.nn.utils.parametrizations on a module, we should skip the resulting keys
+                if re.search(r"\.parametrizations\..*?\.original[01]", k1):
+                    continue
+
                 # Since we added the seed, they should be exactly the same (i.e. using allclose maybe be wrong due
                 # to very low std in init function)
                 if not (v1 == v2).all():
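A quick illustration (not part of the diff) of the state-dict keys that regex is meant to skip: wrapping a submodule's weight with a parametrization moves it into `...parametrizations.<name>.original0/1` entries. A minimal sketch, assuming a PyTorch version that ships torch.nn.utils.parametrizations.weight_norm:

import re

import torch.nn as nn
from torch.nn.utils import parametrizations

# reparametrize the inner Linear's weight, as a stand-in for any parametrized submodule
model = nn.Sequential(nn.Linear(4, 4))
parametrizations.weight_norm(model[0])

pattern = r"\.parametrizations\..*?\.original[01]"
for key in model.state_dict():
    print(key, bool(re.search(pattern, key)))
# "0.parametrizations.weight.original0" and "...original1" match; "0.bias" does not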
@ -351,6 +351,18 @@ class ProcessorTesterMixin:
             return_tensors="pt",
         )

+    def test_args_overlap_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_first = self.get_processor()
+        image_processor = processor_first.image_processor
+        image_processor.is_override = True
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            processor_first.save_pretrained(tmpdirname)
+            processor_second = self.processor_class.from_pretrained(tmpdirname, image_processor=image_processor)
+            self.assertTrue(processor_second.image_processor.is_override)
+
     def test_structured_kwargs_nested(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
133
utils/get_pr_run_slow_jobs.py
Normal file
@ -0,0 +1,133 @@
import argparse
import json
import re
import string


MAX_NUM_JOBS_TO_SUGGEST = 16


def get_jobs_to_run():
    # The file `pr_files.txt` contains the information about the files changed in a pull request, and it is prepared by
    # the caller (using GitHub api).
    # We can also use the following api to get the information if we don't have them before calling this script.
    # url = f"https://api.github.com/repos/huggingface/transformers/pulls/PULL_NUMBER/files?ref={pr_sha}"
    with open("pr_files.txt") as fp:
        pr_files = json.load(fp)
        pr_files = [{k: v for k, v in item.items() if k in ["filename", "status"]} for item in pr_files]
        pr_files = [item["filename"] for item in pr_files if item["status"] in ["added", "modified"]]

    # models or quantizers
    re_1 = re.compile(r"src/transformers/(models/.*)/modeling_.*\.py")
    re_2 = re.compile(r"src/transformers/(quantizers/quantizer_.*)\.py")

    # tests for models or quantizers
    re_3 = re.compile(r"tests/(models/.*)/test_.*\.py")
    re_4 = re.compile(r"tests/(quantization/.*)/test_.*\.py")

    # files in a model directory but not necessary a modeling file
    re_5 = re.compile(r"src/transformers/(models/.*)/.*\.py")

    regexes = [re_1, re_2, re_3, re_4, re_5]

    jobs_to_run = []
    for pr_file in pr_files:
        for regex in regexes:
            matched = regex.findall(pr_file)
            if len(matched) > 0:
                item = matched[0]
                item = item.replace("quantizers/quantizer_", "quantization/")
                # TODO: for files in `quantizers`, the processed item above may not exist. Try using a fuzzy matching
                if item in repo_content:
                    jobs_to_run.append(item)
                break
    jobs_to_run = sorted(set(jobs_to_run))

    return jobs_to_run


def parse_message(message: str) -> str:
    """
    Parses a GitHub pull request's comment to find the models specified in it to run slow CI.

    Args:
        message (`str`): The body of a GitHub pull request's comment.

    Returns:
        `str`: The substring in `message` after `run-slow`, run_slow` or run slow`. If no such prefix is found, the
        empty string is returned.
    """
    if message is None:
        return ""

    message = message.strip().lower()

    # run-slow: model_1, model_2, quantization_1, quantization_2
    if not message.startswith(("run-slow", "run_slow", "run slow")):
        return ""
    message = message[len("run slow") :]
    # remove leading `:`
    while message.strip().startswith(":"):
        message = message.strip()[1:]

    return message


def get_jobs(message: str):
    models = parse_message(message)
    return models.replace(",", " ").split()


def check_name(model_name: str):
    allowed = string.ascii_letters + string.digits + "_"
    return not (model_name.startswith("_") or model_name.endswith("_")) and all(c in allowed for c in model_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--message", type=str, default="", help="The content of a comment.")
    parser.add_argument("--quantization", action="store_true", help="If we collect quantization tests")
    args = parser.parse_args()

    # The files are prepared by the caller (using GitHub api).
    # We can also use the following api to get the information if we don't have them before calling this script.
    # url = f"https://api.github.com/repos/OWNER/REPO/contents/PATH?ref={pr_sha}"
    # (we avoid to checkout the repository using `actions/checkout` to reduce the run time, but mostly to avoid the potential security issue as much as possible)
    repo_content = []
    for filename in ["tests_dir.txt", "tests_models_dir.txt", "tests_quantization_dir.txt"]:
        with open(filename) as fp:
            data = json.load(fp)
            data = [item["path"][len("tests/") :] for item in data if item["type"] == "dir"]
            repo_content.extend(data)

    # These don't have the prefix `models/` or `quantization/`, so we need to add them.
    if args.message:
        specified_jobs = get_jobs(args.message)
        specified_jobs = [job for job in specified_jobs if check_name(job)]

        # Add prefix (`models/` or `quantization`)
        jobs_to_run = []
        for job in specified_jobs:
            if not args.quantization:
                if f"models/{job}" in repo_content:
                    jobs_to_run.append(f"models/{job}")
                elif job in repo_content and job != "quantization":
                    jobs_to_run.append(job)
            elif f"quantization/{job}" in repo_content:
                jobs_to_run.append(f"quantization/{job}")

        print(sorted(set(jobs_to_run)))

    else:
        # Compute (from the added/modified files) the directories under `tests/`, `tests/models/` and `tests/quantization`to run tests.
        # These are already with the prefix `models/` or `quantization/`, so we don't need to add them.
        jobs_to_run = get_jobs_to_run()
        jobs_to_run = [x.replace("models/", "").replace("quantization/", "") for x in jobs_to_run]
        jobs_to_run = [job for job in jobs_to_run if check_name(job)]

        if len(jobs_to_run) > MAX_NUM_JOBS_TO_SUGGEST:
            jobs_to_run = jobs_to_run[:MAX_NUM_JOBS_TO_SUGGEST]

        suggestion = f"{', '.join(jobs_to_run)}"

        print(suggestion)
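For context (not part of the diff), a minimal sketch of how the comment-parsing helpers in this script behave, assuming it is run from the repository root so that utils/ can be put on sys.path:

import sys

sys.path.append("utils")  # assumption: invoked from the repository root

from get_pr_run_slow_jobs import check_name, get_jobs

# A "run-slow" comment is split into candidate job names (commas act as separators)
print(get_jobs("run-slow: mamba2, smolvlm"))  # ['mamba2', 'smolvlm']

# Names with characters outside [A-Za-z0-9_], or leading/trailing underscores, are rejected
print(check_name("mamba2"), check_name("_private_"))  # True False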