Merge branch 'main' into add-aimv2-model

This commit is contained in:
Yaswanth Gali 2025-05-13 13:18:06 +05:30 committed by GitHub
commit 367597cc75
640 changed files with 43076 additions and 53299 deletions

View File

@ -7,6 +7,18 @@ parameters:
nightly:
type: boolean
default: false
GHA_Actor:
type: string
default: ""
GHA_Action:
type: string
default: ""
GHA_Event:
type: string
default: ""
GHA_Meta:
type: string
default: ""
jobs:
# Ensure running with CircleCI/huggingface
@ -31,8 +43,12 @@ jobs:
parallelism: 1
steps:
- checkout
- run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
- run: git branch
- run: git log -n 1
- run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
- run: echo $(cat pr_number.txt)
- run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
- run: cat github.txt
- run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
- run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi

View File

@ -110,6 +110,7 @@ class CircleCIJob:
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv venv && uv pip install ."]
self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
if self.pytest_options is None:
self.pytest_options = {}
if isinstance(self.tests_to_run, str):
@ -397,7 +398,12 @@ def create_circleci_config(folder=None):
"parameters": {
# Only used to accept the parameters from the trigger
"nightly": {"type": "boolean", "default": False},
"tests_to_run": {"type": "string", "default": ''},
# Only used to accept the parameters from GitHub Actions trigger
"GHA_Actor": {"type": "string", "default": ""},
"GHA_Action": {"type": "string", "default": ""},
"GHA_Event": {"type": "string", "default": ""},
"GHA_Meta": {"type": "string", "default": ""},
"tests_to_run": {"type": "string", "default": ""},
**{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
**{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
},

View File

@ -59,7 +59,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
"text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
}
}
]

View File

@ -145,7 +145,7 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
run: |
gh api \
--method POST \

.github/workflows/trigger_circleci.yml (new file, 20 lines)
View File

@ -0,0 +1,20 @@
name: Trigger CircleCI
on:
pull_request_target:
types: [ready_for_review]
jobs:
trigger-circleci:
runs-on: ubuntu-22.04
steps:
- name: trigger CircleCI pipeline via GitHub Actions
uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.2.0
with:
GHA_Meta: "Trigger via GitHub Actions"
target-slug: "github/huggingface/transformers"
target-branch: "pull/${{ github.event.number }}/head"
env:
CCI_TOKEN: ${{ secrets.CIRCLECI_PAT }}

View File

@ -78,7 +78,6 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
# venv
python -m venv .my-env
source .my-env/bin/activate
# uv
uv venv .my-env
source .my-env/bin/activate
@ -88,10 +87,10 @@ Install Transformers in your virtual environment.
```py
# pip
pip install transformers
pip install "transformers[torch]"
# uv
uv pip install transformers
uv pip install "transformers[torch]"
```
Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -99,7 +98,7 @@ Install Transformers from source if you want the latest changes in the library o
```shell
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .
pip install .[torch]
```
## Quickstart
@ -121,7 +120,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
> [!TIP]
> You can also chat with a model directly from the command line.
> ```shell
> transformers chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
> ```
```py

View File

@ -71,6 +71,9 @@ RUN python3 -m pip install --no-cache-dir g2p-en
# For Some bitsandbytes tests
RUN python3 -m pip install --no-cache-dir einops
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -45,6 +45,9 @@ RUN python3 -m pip uninstall -y deepspeed
# TODO: Find out why test fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -57,6 +57,9 @@ RUN python3 -m pip uninstall -y deepspeed
#RUN git clone https://github.com/pytorch/TensorRT.git
#RUN cd TensorRT/py && python3 setup.py install --fx-only
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -28,6 +28,9 @@ RUN python3 -m pip uninstall -y tensorflow flax
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -90,6 +90,9 @@ RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
# Add transformers in editable mode
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

View File

@ -21,6 +21,8 @@
title: Adding a new model to Transformers
- local: modular_transformers
title: Modular Transformers
- local: auto_docstring
title: Document your models
- local: task_summary
title: What 🤗 Transformers can do
- local: tasks_explained
@ -37,6 +39,8 @@
title: Tokenizers
- local: image_processors
title: Image processors
- local: video_processors
title: Video processors
- local: backbones
title: Backbones
- local: feature_extractors
@ -360,7 +364,9 @@
title: Feature Extractor
- local: main_classes/image_processor
title: Image Processor
title: Main classes
- local: main_classes/video_processor
title: Video Processor
title: Main Classes
- sections:
- sections:
- local: model_doc/albert
@ -495,6 +501,8 @@
title: Granite
- local: model_doc/granitemoe
title: GraniteMoe
- local: model_doc/granitemoehybrid
title: GraniteMoeHybrid
- local: model_doc/granitemoeshared
title: GraniteMoeShared
- local: model_doc/helium
@ -823,6 +831,8 @@
title: Bark
- local: model_doc/clap
title: CLAP
- local: model_doc/csm
title: CSM
- local: model_doc/dac
title: dac
- local: model_doc/encodec

View File

@ -108,7 +108,7 @@ If in doubt about what args/kwargs a given model sends to the attention function
## Accessing current available implementations
Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
would expect from a usual Python dictionary:
```python

View File

@ -0,0 +1,279 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Utilizing the @auto_docstring Decorator
The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
---
## 📜 How it Works
The `@auto_docstring` decorator constructs docstrings by:
1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
3. **Overriding or Adding Arguments Descriptions:**
* **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
* **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
4. **Adding Classes and Functions Introduction:**
* **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
* **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}` (see the short sketch after this list).
6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
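As a hedged illustration of the templating mentioned in item 5, a predefined argument description can embed placeholders that are resolved per model when the docstring is generated. The entry below is purely illustrative and does not reproduce the actual contents of `utils/args_doc.py`:

```python
# Illustrative only: a templated argument description of the kind auto_docstring resolves.
# `{{processor_class}}` and `{{config_class}}` are replaced with the concrete class names
# for the model being documented.
EXAMPLE_INPUT_IDS_DOC = r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using
    [`{{processor_class}}`]. See [`{{config_class}}`] for the model configuration.
"""
```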
---
## 🚀 How to Use @auto_docstring
### 1. Importing the Decorator
Import the decorator into your modeling file:
```python
from ...utils import auto_docstring
```
### 2. Applying to Classes
Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
```python
from transformers.modeling_utils import PreTrainedModel
from ...utils import auto_docstring
@auto_docstring
class MyAwesomeModel(PreTrainedModel):
def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
r"""
custom_parameter (`int`, *optional*, defaults to 10):
Description of the custom_parameter for MyAwesomeModel.
another_custom_arg (`str`, *optional*, defaults to "default"):
Documentation for another unique argument.
"""
super().__init__(config)
self.custom_parameter = custom_parameter
self.another_custom_arg = another_custom_arg
# ... rest of your init
# ... other methods
```
#### Advanced Class Decoration:
Arguments can be passed directly to `@auto_docstring` for more control:
```python
@auto_docstring(
custom_intro="""This model performs specific synergistic operations.
It builds upon the standard Transformer architecture with unique modifications.""",
custom_args="""
custom_parameter (`type`, *optional*, defaults to `default_value`):
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
"""
)
class MySpecialModel(PreTrainedModel):
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
# ...
```
Or:
```python
@auto_docstring(
custom_intro="""This model performs specific synergistic operations.
It builds upon the standard Transformer architecture with unique modifications.""",
)
class MySpecialModel(PreTrainedModel):
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
r"""
custom_parameter (`type`, *optional*, defaults to `default_value`):
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
"""
# ...
```
### 3. Applying to Functions (e.g., `forward` method)
Apply the decorator above method definitions, such as the `forward` method.
```python
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
new_custom_argument: Optional[torch.Tensor] = None,
arg_documented_in_args_doc: Optional[torch.Tensor] = None,
# ... other arguments
) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
r"""
new_custom_argument (`torch.Tensor`, *optional*):
Description of this new custom argument and its expected shape or type.
"""
# ...
```
#### Advanced Function Decoration:
Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
```python
MODEL_COMMON_CUSTOM_ARGS = r"""
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
Description of common_arg_1
common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
Description of common_arg_2
...
"""
class MyModel(PreTrainedModel):
# ...
@auto_docstring(
custom_intro="""
This is a custom introduction for the function.
"""
custom_args=MODEL_COMMON_CUSTOM_ARGS
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
common_arg_1: Optional[torch.Tensor] = None,
common_arg_2: Optional[torch.Tensor] = None,
#...
function_specific_argument: Optional[torch.Tensor] = None,
# ... other arguments
) -> torch.Tensor:
r"""
function_specific_argument (`torch.Tensor`, *optional*):
Description of an argument specific to this function
Returns:
`torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
Example:
(To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
```python
...
```
"""
# ...
```
---
### ✍️ Documenting Arguments: Approach & Priority
1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
* `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
2. **New or Custom Arguments:**
* **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
* **Format:**
```
argument_name (`type`, *optional*, defaults to `X`):
Description of the argument.
Explain its purpose, expected shape/type if complex, and default behavior.
This can span multiple lines.
```
* Include `type` in backticks.
* Add "*optional*" if the argument is not required (has a default value).
* Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
3. **Overriding Standard Arguments:**
* If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
* The `labels` argument is often customized per model and typically requires a specific docstring.
4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
* New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
---
### Usage with [modular files](./modular_transformers)
When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
- **For standalone models in modular files:**
Apply the `@auto_docstring` decorator just as you would in regular modeling files.
- **For models inheriting from other library models:**
- When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
- If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
> **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
---
## ✅ Checking Your Docstrings with `check_auto_docstrings`
The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
#### What it Checks:
* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
* **Argument Completeness & Consistency:**
* Flags arguments in the signature that are not known standard arguments and lack a local description.
* Ensures documented arguments exist in the signature. (TODO)
* Verifies that types and default values in the docstring match the signature. (TODO)
* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
* **Formatting:** Adherence to the expected docstring style.
#### Running the Check Locally:
Run this check locally before committing. The common command is:
```bash
make fix-copies
```
Alternatively, to only perform docstrings and auto-docstring checks, you can use:
```bash
python utils/check_docstrings.py # to only check files included in the diff without fixing them
# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
```
#### Workflow with the Checker:
1. Add `@auto_docstring(...)` to the class or method.
2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
3. Run `make fix-copies` (or the `check_docstrings.py` utility).
* For unrecognized arguments lacking documentation, the utility will create placeholder entries.
4. Manually edit these placeholders with accurate types and descriptions.
5. Re-run the check to ensure all issues are resolved.
---
## 🔑 Key Takeaways & Best Practices
* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features`, etc.).
* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
* Document new or custom arguments clearly.
* Run `check_docstrings` locally and iteratively.
By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.

View File

@ -27,7 +27,7 @@ This guide shows you how to quickly start chatting with Transformers from the co
## transformers CLI
Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -37,6 +37,12 @@ transformers chat Qwen/Qwen2.5-0.5B-Instruct
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
</div>
You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`
```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
```
For a full list of options, run the command below.
```bash

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# Image processors
Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
Image processors convert images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
- [`~BaseImageProcessor.center_crop`] to resize an image
- [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
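As a minimal sketch of these operations in practice (the checkpoint below is only an example; any vision model with a preprocessing config works the same way):

```python
# Load a pretrained image processor and turn an image into model-ready pixel values.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
image = Image.open("cat.png")  # example input image

# Resizing/cropping, rescaling, and normalization follow the model's preprocessing config.
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 224, 224])
```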

View File

@ -84,6 +84,19 @@ class Trainer:
Backends that can be added here are all the backends that are available in the `import_utils.py` module.
Additionally, specific versions can be specified in each backend. For example, this is how you would specify
a requirement on torch>=2.6 on the `Trainer` class:
```python
from .utils.import_utils import requires
@requires(backends=("torch>=2.6", "accelerate"))
class Trainer:
...
```
You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
## Methods
[[autodoc]] utils.import_utils.define_import_structure

View File

@ -20,9 +20,13 @@ rendered properly in your Markdown viewer.
Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token.
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
> [!TIP]
> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
> ```shell
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
> ```
## Default generate
@ -134,6 +138,20 @@ outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```
## Common Options
[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, ...
| Option name | Type | Simplified description |
|---|---|---|
| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
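As a minimal sketch of how these options are passed to [`~GenerationMixin.generate`] (the checkpoint is only an example; any generative model works the same way):

```python
# Pass the common options above directly to `generate`.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Write a short poem about the sea.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=64,       # cap the generation length
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.7,         # lower values -> more predictable text
    repetition_penalty=1.1,  # discourage repeated phrases
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```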
## Pitfalls
The section below covers some common issues you may encounter during text generation and how to solve them.
@ -286,4 +304,4 @@ Take a look below for some more specific and specialized text generation librari
- [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
- [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
- [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.

View File

@ -0,0 +1,55 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Video Processor
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video-related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM; the processor will try to load the video-related configuration from a file named `preprocessing_config.json`.
### Usage Example
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
```python
from transformers import AutoVideoProcessor
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
```
Currently, if using a base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For even more speed improvement, we can compile the processor when using 'cuda' as device.
```python
import torch
from transformers.video_utils import load_video
from transformers import AutoVideoProcessor
video = load_video("video.mp4")
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
processor = torch.compile(processor)
processed_video = processor(video, return_tensors="pt")
```
## BaseVideoProcessor
[[autodoc]] video_processing_utils.BaseVideoProcessor

View File

@ -102,6 +102,10 @@ response = processor.decode(output_ids, skip_special_tokens=True)
[[autodoc]] AriaTextModel
## AriaModel
[[autodoc]] AriaModel
## AriaTextForCausalLM
[[autodoc]] AriaTextForCausalLM

View File

@ -74,6 +74,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] AutoImageProcessor
## AutoVideoProcessor
[[autodoc]] AutoVideoProcessor
## AutoProcessor
[[autodoc]] AutoProcessor

View File

@ -237,6 +237,10 @@ for i, output in enumerate(batch_outputs):
[[autodoc]] AyaVisionConfig
## AyaVisionModel
[[autodoc]] AyaVisionModel
## AyaVisionForConditionalGeneration
[[autodoc]] AyaVisionForConditionalGeneration

View File

@ -151,6 +151,12 @@ If you're interested in submitting a resource to be included here, please feel f
- preprocess
- post_process_semantic_segmentation
## BeitImageProcessorFast
[[autodoc]] BeitImageProcessorFast
- preprocess
- post_process_semantic_segmentation
<frameworkcontent>
<pt>

View File

@ -0,0 +1,377 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Csm
## Overview
The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.
**Model Architecture:**
CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
</div>
## Usage Tips
### Without Conversational Context
CSM can be used to simply generate speech from a text prompt:
```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
# prepare the inputs
text = "[0]The past is just a story we tell ourselves." # `[0]` for speaker id 0
inputs = processor(text, add_special_tokens=True).to(device)
# another equivalent way to prepare the inputs
conversation = [
{"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
# infer the model
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_without_context.wav")
```
### With Conversational Context
CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:
```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio
model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
conversation = []
# 1. context
for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
conversation.append(
{
"role": f"{speaker_id}",
"content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
}
)
# 2. text prompt
conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
# infer the model
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_with_context.wav")
```
### Batched Inference
CSM supports batched inference!
```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio
model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
# here a batch with two prompts
conversation = [
[
{
"role": f"{ds[0]['speaker_id']}",
"content": [
{"type": "text", "text": ds[0]["text"]},
{"type": "audio", "path": ds[0]["audio"]["array"]},
],
},
{
"role": f"{ds[1]['speaker_id']}",
"content": [
{"type": "text", "text": ds[1]["text"]},
],
},
],
[
{
"role": f"{ds[0]['speaker_id']}",
"content": [
{"type": "text", "text": ds[0]["text"]},
],
}
],
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
```
### Making The Model Go Brrr
CSM supports full-graph compilation with CUDA graphs!
```python
import torch
import copy
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset
model_id = "eustlb/csm-1b"
device = "cuda"
# set logs to ensure no recompilation and graph breaks
torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)
# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
# use static cache, automatically enabling torch compile with fullgraph and reduce-overhead
model.generation_config.max_length = 250 # big enough to avoid recompilation
model.generation_config.max_new_tokens = None # would take precedence over max_length
model.generation_config.cache_implementation = "static"
model.depth_decoder.generation_config.cache_implementation = "static"
# generation kwargs
gen_kwargs = {
"do_sample": False,
"depth_decoder_do_sample": False,
"temperature": 1.0,
"depth_decoder_temperature": 1.0,
}
# Define a timing context manager
class TimerContext:
def __init__(self, name="Execution"):
self.name = name
self.start_event = None
self.end_event = None
def __enter__(self):
# Use CUDA events for more accurate GPU timing
self.start_event = torch.cuda.Event(enable_timing=True)
self.end_event = torch.cuda.Event(enable_timing=True)
self.start_event.record()
return self
def __exit__(self, *args):
self.end_event.record()
torch.cuda.synchronize()
elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
print(f"{self.name} time: {elapsed_time:.4f} seconds")
# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
conversation = [
{
"role": f"{ds[0]['speaker_id']}",
"content": [
{"type": "text", "text": ds[0]["text"]},
{"type": "audio", "path": ds[0]["audio"]["array"]},
],
},
{
"role": f"{ds[1]['speaker_id']}",
"content": [
{"type": "text", "text": ds[1]["text"]},
{"type": "audio", "path": ds[1]["audio"]["array"]},
],
},
{
"role": f"{ds[2]['speaker_id']}",
"content": [
{"type": "text", "text": ds[2]["text"]},
],
},
]
padded_inputs_1 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
print("\n" + "="*50)
print("First generation - compiling and recording CUDA graphs...")
with TimerContext("First generation"):
_ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)
print("\n" + "="*50)
print("Second generation - fast !!!")
with TimerContext("Second generation"):
_ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)
# now with different inputs
conversation = [
{
"role": f"{ds[0]['speaker_id']}",
"content": [
{"type": "text", "text": ds[2]["text"]},
{"type": "audio", "path": ds[2]["audio"]["array"]},
],
},
{
"role": f"{ds[1]['speaker_id']}",
"content": [
{"type": "text", "text": ds[3]["text"]},
{"type": "audio", "path": ds[3]["audio"]["array"]},
],
},
{
"role": f"{ds[2]['speaker_id']}",
"content": [
{"type": "text", "text": ds[4]["text"]},
],
},
]
padded_inputs_2 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
print("\n" + "="*50)
print("Generation with other inputs!")
with TimerContext("Generation with different inputs"):
_ = model.generate(**padded_inputs_2, **gen_kwargs)
print("="*50)
```
### Training
CSM Transformers integration supports training!
```python
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio
model_id = "eustlb/csm-1b"
device = "cuda"
# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model.train()
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
conversation = []
# context
for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
conversation.append(
{
"role": f"{speaker_id}",
"content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
}
)
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
output_labels=True,
).to(device)
out = model(**inputs)
out.loss.backward()
```
This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
The original code can be found [here](https://github.com/SesameAILabs/csm).
## CsmConfig
[[autodoc]] CsmConfig
## CsmDepthDecoderConfig
[[autodoc]] CsmDepthDecoderConfig
## CsmProcessor
[[autodoc]] CsmProcessor
- __call__
## CsmForConditionalGeneration
[[autodoc]] CsmForConditionalGeneration
- forward
- generate
## CsmDepthDecoderForCausalLM
[[autodoc]] CsmDepthDecoderForCausalLM
## CsmDepthDecoderModel
[[autodoc]] CsmDepthDecoderModel
## CsmBackboneModel
[[autodoc]] CsmBackboneModel

View File

@ -174,6 +174,10 @@ for i, image in enumerate(images['pixel_values']):
[[autodoc]] Emu3TextModel
- forward
## Emu3Model
[[autodoc]] Emu3Model
## Emu3ForCausalLM
[[autodoc]] Emu3ForCausalLM

View File

@ -103,6 +103,10 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
[[autodoc]] FuyuConfig
## FuyuModel
[[autodoc]] FuyuModel
## FuyuForCausalLM
[[autodoc]] FuyuForCausalLM

View File

@ -254,6 +254,10 @@ visualizer("<img>What is shown in this image?")
[[autodoc]] Gemma3TextModel
- forward
## Gemma3Model
[[autodoc]] Gemma3Model
## Gemma3ForCausalLM
[[autodoc]] Gemma3ForCausalLM

View File

@ -277,6 +277,10 @@ alt="drawing" width="600"/>
[[autodoc]] GotOcr2Processor
## GotOcr2Model
[[autodoc]] GotOcr2Model
## GotOcr2ForConditionalGeneration
[[autodoc]] GotOcr2ForConditionalGeneration

View File

@ -0,0 +1,64 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# GraniteMoeHybrid
## Overview
The `GraniteMoeHybrid` model builds on top of `GraniteMoeSharedModel` and `Bamba`. Its decoding layers consist of state space layers or MoE attention layers with shared experts. By default, the attention layers do not use positional encoding.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_path = "ibm-granite/granite-4.0-tiny-preview"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# drop device_map if running on CPU
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()
# change input text as desired
prompt = "Write a code to find the maximum value in a list of numbers."
# tokenize the text
input_tokens = tokenizer(prompt, return_tensors="pt")
# generate output tokens
output = model.generate(**input_tokens, max_new_tokens=100)
# decode output tokens into text
output = tokenizer.batch_decode(output)
# loop over the batch to print, in this example the batch size is 1
for i in output:
print(i)
```
This HF implementation was contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).
## GraniteMoeHybridConfig
[[autodoc]] GraniteMoeHybridConfig
## GraniteMoeHybridModel
[[autodoc]] GraniteMoeHybridModel
- forward
## GraniteMoeHybridForCausalLM
[[autodoc]] GraniteMoeHybridForCausalLM
- forward

View File

@ -69,6 +69,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipQFormerModel
- forward
## InstructBlipModel
[[autodoc]] InstructBlipModel
## InstructBlipForConditionalGeneration
[[autodoc]] InstructBlipForConditionalGeneration

View File

@ -58,6 +58,12 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipVideoProcessor
## InstructBlipVideoVideoProcessor
[[autodoc]] InstructBlipVideoVideoProcessor
- preprocess
## InstructBlipVideoImageProcessor
[[autodoc]] InstructBlipVideoImageProcessor
@ -73,6 +79,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipVideoQFormerModel
- forward
## InstructBlipVideoModel
[[autodoc]] InstructBlipVideoModel
- forward
## InstructBlipVideoForConditionalGeneration
[[autodoc]] InstructBlipVideoForConditionalGeneration

View File

@ -340,6 +340,11 @@ This example showcases how to handle a batch of chat conversations with interlea
[[autodoc]] InternVLVisionModel
- forward
## InternVLModel
[[autodoc]] InternVLModel
- forward
## InternVLForConditionalGeneration
[[autodoc]] InternVLForConditionalGeneration
@ -348,3 +353,7 @@ This example showcases how to handle a batch of chat conversations with interlea
## InternVLProcessor
[[autodoc]] InternVLProcessor
## InternVLVideoProcessor
[[autodoc]] InternVLVideoProcessor

View File

@ -256,6 +256,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] LlavaProcessor
## LlavaModel
[[autodoc]] LlavaModel
## LlavaForConditionalGeneration
[[autodoc]] LlavaForConditionalGeneration

View File

@ -315,6 +315,10 @@ model = AutoModelForImageTextToText.from_pretrained(
[[autodoc]] LlavaNextProcessor
## LlavaNextModel
[[autodoc]] LlavaNextModel
## LlavaNextForConditionalGeneration
[[autodoc]] LlavaNextForConditionalGeneration

View File

@ -262,6 +262,14 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
[[autodoc]] LlavaNextVideoImageProcessor
## LlavaNextVideoVideoProcessor
[[autodoc]] LlavaNextVideoVideoProcessor
## LlavaNextVideoModel
[[autodoc]] LlavaNextVideoModel
## LlavaNextVideoForConditionalGeneration
[[autodoc]] LlavaNextVideoForConditionalGeneration

View File

@ -303,6 +303,7 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
## LlavaOnevisionImageProcessor
[[autodoc]] LlavaOnevisionImageProcessor
- preprocess
## LlavaOnevisionImageProcessorFast
@ -313,6 +314,14 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
[[autodoc]] LlavaOnevisionVideoProcessor
## LlavaOnevisionVideoProcessor
[[autodoc]] LlavaOnevisionVideoProcessor
## LlavaOnevisionModel
[[autodoc]] LlavaOnevisionModel
## LlavaOnevisionForConditionalGeneration
[[autodoc]] LlavaOnevisionForConditionalGeneration

View File

@ -227,6 +227,9 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati
[[autodoc]] Mistral3Config
## Mistral3Model
[[autodoc]] Mistral3Model
## Mistral3ForConditionalGeneration

View File

@ -130,6 +130,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
[[autodoc]] MllamaTextModel
- forward
## MllamaModel
[[autodoc]] MllamaModel
## MllamaForCausalLM
[[autodoc]] MllamaForCausalLM

View File

@ -174,6 +174,10 @@ visualizer("<img> What is in this image?")
[[autodoc]] PaliGemmaProcessor
## PaliGemmaModel
[[autodoc]] PaliGemmaModel
## PaliGemmaForConditionalGeneration
[[autodoc]] PaliGemmaForConditionalGeneration

View File

@ -240,6 +240,10 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
[[autodoc]] Qwen2_5_VLProcessor
## Qwen2_5_VLTextModel
[[autodoc]] Qwen2_5_VLTextModel
- forward
## Qwen2_5_VLModel

View File

@ -287,6 +287,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
[[autodoc]] Qwen2VLImageProcessor
- preprocess
## Qwen2VLVideoProcessor
[[autodoc]] Qwen2VLVideoProcessor
- preprocess
## Qwen2VLImageProcessorFast
[[autodoc]] Qwen2VLImageProcessorFast
@ -296,6 +301,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
[[autodoc]] Qwen2VLProcessor
## Qwen2VLTextModel
[[autodoc]] Qwen2VLTextModel
- forward
## Qwen2VLModel
[[autodoc]] Qwen2VLModel

View File

@ -23,6 +23,7 @@ rendered properly in your Markdown viewer.
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## Overview
The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer

View File

@ -197,6 +197,9 @@ print(generated_texts[0])
[[autodoc]] SmolVLMImageProcessor
- preprocess
## SmolVLMVideoProcessor
[[autodoc]] SmolVLMVideoProcessor
- preprocess
## SmolVLMProcessor
[[autodoc]] SmolVLMProcessor

View File

@ -50,6 +50,11 @@ A demo Space for image super-resolution with SwinSR can be found [here](https://
[[autodoc]] Swin2SRImageProcessor
- preprocess
## Swin2SRImageProcessorFast
[[autodoc]] Swin2SRImageProcessorFast
- preprocess
## Swin2SRConfig
[[autodoc]] Swin2SRConfig

View File

@ -211,10 +211,19 @@ model = VideoLlavaForConditionalGeneration.from_pretrained(
[[autodoc]] VideoLlavaImageProcessor
## VideoLlavaVideoProcessor
[[autodoc]] VideoLlavaVideoProcessor
## VideoLlavaProcessor
[[autodoc]] VideoLlavaProcessor
## VideoLlavaModel
[[autodoc]] VideoLlavaModel
## VideoLlavaForConditionalGeneration
[[autodoc]] VideoLlavaForConditionalGeneration

View File

@ -101,6 +101,10 @@ A chat between a curious human and an artificial intelligence assistant. The ass
[[autodoc]] VipLlavaConfig
## VipLlavaModel
[[autodoc]] VipLlavaModel
## VipLlavaForConditionalGeneration
[[autodoc]] VipLlavaForConditionalGeneration

View File

@ -44,7 +44,7 @@ Place all inputs on the same device as the model.
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
tokenizer = AutoTokenizer("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", quantization_config=quantization_config)
prompt = "Hello, my llama is cute"
@ -196,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
input_text = "Hello, my llama is cute"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
with sdpa_kernel(SDPBackend.FLASH_ATTENTION)::
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

View File

@ -51,7 +51,7 @@ By default, vLLM serves the native implementation and if it doesn't exist, it fa
```shell
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
--task generate \
--model-impl transformers \
--model-impl transformers
```
Add the `trust-remote-code` parameter to enable loading a remote code model.
@ -60,5 +60,5 @@ Add the `trust-remote-code` parameter to enable loading a remote code model.
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
--task generate \
--model-impl transformers \
--trust-remote-code \
--trust-remote-code
```

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# TorchScript
[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may the most performant choice.
[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.
Transformers can export a model to TorchScript by:

View File

@ -0,0 +1,49 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Video Processor
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
The video processor extends the functionality of image processors by allowing the models to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
Use [`~BaseVideoProcessor.from_pretrained`] to load a video processor's configuration (image size, whether to normalize and rescale, etc.) from a video model on the Hugging Face [Hub](https://hf.co) or from a local directory. The configuration for each pretrained model should be saved in a `video_preprocessor_config.json` file, but older models might have the config saved in a [preprocessor_config.json](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf/blob/main/preprocessor_config.json) file. Note that the latter is less preferred and will be removed in the future.
### Usage Example
Here's an example of how to load a video processor with the [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
```python
from transformers import AutoVideoProcessor
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
```
Currently, if using a base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame by frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For an even greater speedup, the processor can be compiled when using `"cuda"` as the device.
```python
import torch
from transformers.video_utils import load_video
from transformers import AutoVideoProcessor
video = load_video("video.mp4")
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
processor = torch.compile(processor)
processed_video = processor(video, return_tensors="pt")
```

View File

@ -19,7 +19,7 @@ Questa guida si concentra su come addestrare in maniera efficiente grandi modell
## Mixed precision con IPEX
IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.
IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.
Il tipo di dati a bassa precisione BFloat16 è supportato in modo nativo sui 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) con AVX512 e sarà supportato dalla prossima generazione di Intel® Xeon® Scalable Processors con l'instruction set Intel® Advanced Matrix Extensions (Intel® AMX), con prestazioni ulteriormente migliorate. L'Auto Mixed Precision per il backend della CPU è stata abilitata a partire da PyTorch 1.10. Allo stesso tempo, il supporto dell'Auto Mixed Precision con BFloat16 per CPU e l'ottimizzazione degli operatori BFloat16 sono stati abilitati in modo esteso in Intel® Extension per PyTorch, e parzialmente integrati nel branch master di PyTorch. Con l'Auto Mixed Precision di IPEX gli utenti possono ottenere prestazioni e un'esperienza d'uso migliori.
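A minimal sketch of the BFloat16 mixed-precision recipe described above, assuming `intel_extension_for_pytorch` is installed; the tiny `torch.nn.Linear` model and the random input are placeholders, not part of this diff:

```python
import torch
import intel_extension_for_pytorch as ipex  # assumed to be installed

# Placeholder model and optimizer; any nn.Module would do.
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# IPEX applies its BFloat16 weight/operator optimizations to model and optimizer.
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

x = torch.randn(8, 16)
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):  # Auto Mixed Precision on CPU
    loss = model(x).sum()
loss.backward()
optimizer.step()
```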

View File

@ -107,6 +107,12 @@ BEiT の使用を開始するのに役立つ公式 Hugging Face およびコミ
- preprocess
- post_process_semantic_segmentation
## BeitImageProcessorFast
[[autodoc]] BeitImageProcessorFast
- preprocess
- post_process_semantic_segmentation
## BeitModel
[[autodoc]] BeitModel

View File

@ -98,7 +98,7 @@
- local: generation_strategies
title: 텍스트 생성 전략 사용자 정의
- local: serving
title: 모델 서빙하기
title: 모델 서빙하기
title: 생성
- isExpanded: false
sections:

View File

@ -20,7 +20,9 @@ line-length = 119
[tool.ruff.lint]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823" ]
select = ["C", "E", "F", "I", "W"]
# RUF013: Checks for the use of implicit Optional
# in type annotations when the default parameter value is None.
select = ["C", "E", "F", "I", "W", "RUF013"]
# Ignore import violations in all `__init__.py` files.
[tool.ruff.lint.per-file-ignores]
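For reference, a minimal Python illustration (not part of this diff) of the implicit-`Optional` pattern that RUF013 flags:

```python
from typing import Optional


# RUF013 flags the implicit Optional below (annotated `int`, default `None`)...
def crop(size: int = None):
    return size


# ...and expects the annotation to be spelled out explicitly instead.
def crop_explicit(size: Optional[int] = None):
    return size
```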

View File

@ -466,7 +466,12 @@ setup(
package_data={"": ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx", "py.typed"]},
zip_safe=False,
extras_require=extras,
entry_points={"console_scripts": ["transformers=transformers.commands.transformers_cli:main", "transformers-cli=transformers.commands.transformers_cli:main_cli"]},
entry_points={
"console_scripts": [
"transformers=transformers.commands.transformers_cli:main",
"transformers-cli=transformers.commands.transformers_cli:main_cli",
]
},
python_requires=">=3.9.0",
install_requires=list(install_requires),
classifiers=[
@ -479,6 +484,9 @@ setup(
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
cmdclass={"deps_table_update": DepsTableUpdateCommand},

View File

@ -276,6 +276,7 @@ _import_structure = {
"TorchAoConfig",
"VptqConfig",
],
"video_utils": [],
}
# tokenizers-backed objects
@ -334,6 +335,7 @@ except OptionalDependencyNotAvailable:
]
else:
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
_import_structure["video_processing_utils"] = ["BaseVideoProcessor"]
# PyTorch-backed objects
try:
@ -809,6 +811,7 @@ if TYPE_CHECKING:
from .utils.dummy_torchvision_objects import *
else:
from .image_processing_utils_fast import BaseImageProcessorFast
from .video_processing_utils import BaseVideoProcessor
try:
if not (is_torchvision_available() and is_timm_available()):

View File

@ -24,7 +24,12 @@ from typing import List, Optional, Tuple, Union
import numpy as np
import requests
from .utils import is_librosa_available, requires_backends
from .utils import (
is_librosa_available,
is_numpy_array,
is_torch_tensor,
requires_backends,
)
if is_librosa_available():
@ -69,6 +74,36 @@ AudioInput = Union[
]
def is_valid_audio(audio):
return is_numpy_array(audio) or is_torch_tensor(audio)
def is_valid_list_of_audio(audio):
return audio and all(is_valid_audio(audio_i) for audio_i in audio)
def make_list_of_audio(
audio: Union[list[AudioInput], AudioInput],
) -> AudioInput:
"""
Ensure that the output is a list of audio.
Args:
audio (`Union[list[AudioInput], AudioInput]`):
The input audio.
Returns:
list: A list of audio.
"""
# If it's a list of audios, it's already in the right format
if isinstance(audio, (list, tuple)) and is_valid_list_of_audio(audio):
return audio
# If it's a single audio, convert it to a list of
if is_valid_audio(audio):
return [audio]
raise ValueError("Invalid input type. Must be a single audio or a list of audio")
def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
"""
Convert frequency from hertz to mels.

View File

@ -1402,7 +1402,7 @@ class SlidingWindowCache(StaticCache):
value_states = value_states.to(v_out.dtype)
# assume this only happens in prefill phase when prompt length > sliding_window_size (= max_cache_len)
if cache_position.shape[0] > self.max_cache_len:
if cache_position.shape[0] >= self.max_cache_len:
k_out = key_states[:, :, -self.max_cache_len :, :]
v_out = value_states[:, :, -self.max_cache_len :, :]
# Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly
@ -1413,8 +1413,8 @@ class SlidingWindowCache(StaticCache):
return key_states, value_states
slicing = torch.ones(self.max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0)
cache_position = cache_position.clamp(0, self.max_cache_len - 1)
to_shift = cache_position > self.max_cache_len - 1
cache_position = cache_position.clamp(0, self.max_cache_len - 1)
indices = (slicing + to_shift[-1].int() - 1) % self.max_cache_len
k_out = k_out[:, :, indices]
@ -1725,7 +1725,7 @@ class HybridCache(Cache):
self.value_cache.append(new_layer_value_cache)
def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len):
if cache_position.shape[0] > max_cache_len:
if cache_position.shape[0] >= max_cache_len:
k_out = key_states[:, :, -max_cache_len:, :]
v_out = value_states[:, :, -max_cache_len:, :]
# Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly
@ -1736,8 +1736,8 @@ class HybridCache(Cache):
return key_states, value_states
slicing = torch.ones(max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0)
cache_position = cache_position.clamp(0, max_cache_len - 1)
to_shift = cache_position > max_cache_len - 1
cache_position = cache_position.clamp(0, max_cache_len - 1)
indices = (slicing + to_shift[-1].int() - 1) % max_cache_len
k_out = k_out[:, :, indices]
v_out = v_out[:, :, indices]
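A toy reproduction (values made up) of the index arithmetic above, showing how the cache slots are rolled left by one once the sliding window is full:

```python
import torch

max_cache_len = 4
cache_position = torch.tensor([5])  # decoding one token past the window
slicing = torch.ones(max_cache_len, dtype=torch.long).cumsum(0)  # tensor([1, 2, 3, 4])
to_shift = cache_position > max_cache_len - 1                    # tensor([True])
indices = (slicing + to_shift[-1].int() - 1) % max_cache_len
print(indices)  # tensor([1, 2, 3, 0]) -> roll the cached states left by one slot
```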
@ -2028,6 +2028,13 @@ class OffloadedHybridCache(HybridChunkedCache):
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
):
super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map)
# TODO (joao): to enable this cache on multiple devices, use the pattern from `OffloadedCache`, which keeps
# track of the original device of each layer
unique_devices = set(layer_device_map.values())
if len(unique_devices) > 1:
raise ValueError(f"OffloadedHybridCache does not support multiple devices. Got devices: {unique_devices}")
self.offload_device = torch.device(offload_device)
# Create new CUDA stream for parallel prefetching.
self._prefetch_stream = torch.cuda.Stream() if torch._C._get_accelerator().type == "cuda" else None
@ -2280,6 +2287,13 @@ class OffloadedStaticCache(StaticCache):
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super(Cache, self).__init__()
# TODO (joao): to enable this cache on multiple devices, use the pattern from `OffloadedCache`, which keeps
# track of the original device of each layer
unique_devices = set(layer_device_map.values())
if len(unique_devices) > 1:
raise ValueError(f"OffloadedStaticCache does not support multiple devices. Got devices: {unique_devices}")
self.max_batch_size = max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
self.device = torch.device(device) if layer_device_map is None else torch.device(layer_device_map[0])

View File

@ -396,9 +396,7 @@ def add_fast_image_processor_file(
content_header = get_fast_image_processing_content_header(content_base_file)
content_base_file = (
f"@add_start_docstrings(\n"
f' "Constructs a fast {fast_image_processor_name.replace("ImageProcessorFast", "")} image processor.",\n'
f" BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,\n)\n"
f"@auto_docstring(\n"
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
" # This generated class can be used as a starting point for the fast image processor.\n"
" # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n"
@ -422,9 +420,7 @@ def add_fast_image_processor_file(
f'__all__ = ["{fast_image_processor_name}"]\n'
)
imports = (
"\n\nfrom ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast\n"
)
imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n"
image_utils_imports = []
if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"):
image_utils_imports.append("PILImageResampling")
@ -442,7 +438,7 @@ def add_fast_image_processor_file(
image_utils_imports.sort()
imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n"
imports += "from ...utils import add_start_docstrings\n"
imports += "from ...utils import auto_docstring\n"
content = content_header + imports + "\n\n" + content_base_file

View File

@ -13,12 +13,12 @@
# limitations under the License.
import copy
import json
import os
import platform
import string
import time
import warnings
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass, field
from threading import Thread
@ -42,7 +42,13 @@ if is_rich_available():
if is_torch_available():
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
GenerationConfig,
TextIteratorStreamer,
)
ALLOWED_KEY_CHARS = set(string.ascii_letters + string.whitespace)
@ -64,25 +70,16 @@ DEFAULT_EXAMPLES = {
"socks": {"text": "Why is it important to eat socks after meditating?"},
}
SUPPORTED_GENERATION_KWARGS = [
"max_new_tokens",
"do_sample",
"num_beams",
"temperature",
"top_p",
"top_k",
"repetition_penalty",
]
# Printed at the start of a chat session
HELP_STRING_MINIMAL = """
**TRANSFORMERS CHAT INTERFACE**
Chat interface to try out a model. Besides chatting with the model, here are some basic commands:
- **help**: shows all available commands
- **clear**: clears the current conversation and starts a new one
- **exit**: closes the interface
- **!help**: shows all available commands
- **!status**: shows the current status of the model and generation settings
- **!clear**: clears the current conversation and starts a new one
- **!exit**: closes the interface
"""
@ -92,18 +89,32 @@ HELP_STRING = f"""
**TRANSFORMERS CHAT INTERFACE HELP**
Full command list:
- **help**: shows this help message
- **clear**: clears the current conversation and starts a new one
- **example {{NAME}}**: loads example named `{{NAME}}` from the config and uses it as the user input. Available example
names: `{"`, `".join(DEFAULT_EXAMPLES.keys())}`
- **set {{SETTING_NAME}}={{SETTING_VALUE}};**: changes the system prompt or generation settings (multiple settings are
separated by a ';'). Available settings: `{"`, `".join(SUPPORTED_GENERATION_KWARGS)}`
- **reset**: same as clear but also resets the generation configs to defaults if they have been changed by **set**
- **save {{SAVE_NAME}} (optional)**: saves the current chat and settings to file by default to
- **!help**: shows this help message
- **!clear**: clears the current conversation and starts a new one
- **!status**: shows the current status of the model and generation settings
- **!example {{NAME}}**: loads example named `{{NAME}}` from the config and uses it as the user input.
Available example names: `{"`, `".join(DEFAULT_EXAMPLES.keys())}`
- **!set {{ARG_1}}={{VALUE_1}} {{ARG_2}}={{VALUE_2}}** ...: changes the system prompt or generation settings (multiple
settings are separated by a space). Accepts the same flags and format as the `generate_flags` CLI argument.
If you're a new user, check this basic flag guide: https://huggingface.co/docs/transformers/llm_tutorial#common-options
- **!save {{SAVE_NAME}} (optional)**: saves the current chat and settings to file by default to
`./chat_history/{{MODEL_NAME}}/chat_{{DATETIME}}.yaml` or `{{SAVE_NAME}}` if provided
- **exit**: closes the interface
- **!exit**: closes the interface
"""
# format: (optional CLI arg being deprecated, its current default, corresponding `generate` flag)
_DEPRECATION_MAP = [
("max_new_tokens", 256, "max_new_tokens"),
("do_sample", True, "do_sample"),
("num_beams", 1, "num_beams"),
("temperature", 1.0, "temperature"),
("top_k", 50, "top_k"),
("top_p", 1.0, "top_p"),
("repetition_penalty", 1.0, "repetition_penalty"),
("eos_tokens", None, "eos_token_id"),
("eos_token_ids", None, "eos_token_id"),
]
class RichInterface:
def __init__(self, model_name: Optional[str] = None, user_name: Optional[str] = None):
@ -181,6 +192,14 @@ class RichInterface:
self._console.print(Markdown(HELP_STRING_MINIMAL if minimal else HELP_STRING))
self._console.print()
def print_status(self, model_name: str, generation_config: GenerationConfig, model_kwargs: dict):
"""Prints the status of the model and generation settings to the console."""
self._console.print(f"[bold blue]Model: {model_name}\n")
if model_kwargs:
self._console.print(f"[bold blue]Model kwargs: {model_kwargs}")
self._console.print(f"[bold blue]{generation_config}")
self._console.print()
@dataclass
class ChatArguments:
@ -207,6 +226,17 @@ class ChatArguments:
examples_path: Optional[str] = field(default=None, metadata={"help": "Path to a yaml file with examples."})
# Generation settings
generation_config: Optional[str] = field(
default=None,
metadata={
"help": (
"Path to a local generation config file or to a HuggingFace repo containing a "
"`generation_config.json` file. Other generation settings passed as CLI arguments will be applied on "
"top of this generation config."
),
},
)
# Deprecated CLI args start here
max_new_tokens: int = field(default=256, metadata={"help": "Maximum number of tokens to generate."})
do_sample: bool = field(default=True, metadata={"help": "Whether to sample outputs during generation."})
num_beams: int = field(default=1, metadata={"help": "Number of beams for beam search."})
@ -222,6 +252,7 @@ class ChatArguments:
default=None,
metadata={"help": "EOS token IDs to stop the generation. If multiple they should be comma separated."},
)
# Deprecated CLI args end here
# Model loading
model_revision: str = field(
@ -280,23 +311,66 @@ class ChatCommand(BaseTransformersCLICommand):
group = chat_parser.add_argument_group("Positional arguments")
group.add_argument(
"model_name_or_path_positional", type=str, nargs="?", default=None, help="Name of the pre-trained model."
"model_name_or_path_positional", type=str, default=None, help="Name of the pre-trained model."
)
group.add_argument(
"generate_flags",
type=str,
default=None,
help=(
"Flags to pass to `generate`, using a space as a separator between flags. Accepts booleans, numbers, "
"and lists of integers, more advanced parameterization should be set through --generation-config. "
"Example: `transformers chat <model_repo> max_new_tokens=100 do_sample=False eos_token_id=[1,2]`. "
"If you're a new user, check this basic flag guide: https://huggingface.co/docs/transformers/llm_tutorial#common-options"
),
nargs="*",
)
chat_parser.set_defaults(func=chat_command_factory)
def __init__(self, args):
args.model_name_or_path = args.model_name_or_path_positional or args.model_name_or_path
args = self._handle_deprecated_args(args)
self.args = args
if args.model_name_or_path is None:
def _handle_deprecated_args(self, args: ChatArguments) -> ChatArguments:
"""
Handles deprecated arguments and their deprecation cycle. To be removed after we fully migrated to the new
args.
"""
has_warnings = False
# 1. Model as a positional argument
args.model_name_or_path_positional = args.model_name_or_path_positional or args.model_name_or_path
if args.model_name_or_path_positional is None:
raise ValueError(
"One of the following must be provided:"
"\n- The positional argument containing the model repo;"
"\n- the optional --model_name_or_path argument, containing the model repo"
"\ne.g. transformers chat <model_repo> or transformers chat --model_name_or_path <model_repo>"
"\n- The positional argument containing the model repo, e.g. `transformers chat <model_repo>`"
"\n- the optional --model_name_or_path argument, containing the model repo (deprecated)"
)
elif args.model_name_or_path is not None:
has_warnings = True
warnings.warn(
"The --model_name_or_path argument is deprecated will be removed in v4.54.0. Use the positional "
"argument instead, e.g. `transformers chat <model_repo>`.",
FutureWarning,
)
# 2. Named generate option args
for deprecated_arg, default_value, new_arg in _DEPRECATION_MAP:
value = getattr(args, deprecated_arg)
if value != default_value:
has_warnings = True
warnings.warn(
f"The --{deprecated_arg} argument is deprecated will be removed in v4.54.0. There are two "
"alternative solutions to specify this generation option: \n"
"1. Pass `--generation-config <path_to_file/Hub repo>` to specify a generation config.\n"
"2. Pass `generate` flags through positional arguments, e.g. `transformers chat <model_repo> "
f"{new_arg}={value}`",
FutureWarning,
)
self.args = args
if has_warnings:
print("\n(Press enter to continue)")
input()
return args
# -----------------------------------------------------------------------------------------------------------------
# Chat session methods
@ -319,7 +393,7 @@ class ChatCommand(BaseTransformersCLICommand):
if filename is None:
time_str = time.strftime("%Y-%m-%d_%H-%M-%S")
filename = f"{args.model_name_or_path}/chat_{time_str}.json"
filename = f"{args.model_name_or_path_positional}/chat_{time_str}.json"
filename = os.path.join(folder, filename)
os.makedirs(os.path.dirname(filename), exist_ok=True)
@ -338,50 +412,95 @@ class ChatCommand(BaseTransformersCLICommand):
# -----------------------------------------------------------------------------------------------------------------
# Input parsing methods
@staticmethod
def parse_settings(
user_input: str, current_args: ChatArguments, interface: RichInterface
) -> tuple[ChatArguments, bool]:
"""Parses the settings from the user input into the CLI arguments."""
settings = user_input[4:].strip().split(";")
settings = [(setting.split("=")[0], setting[len(setting.split("=")[0]) + 1 :]) for setting in settings]
settings = dict(settings)
error = False
def parse_generate_flags(self, generate_flags: list[str]) -> dict:
"""Parses the generate flags from the user input into a dictionary of `generate` kwargs."""
if len(generate_flags) == 0:
return {}
for name in settings:
if hasattr(current_args, name):
try:
if isinstance(getattr(current_args, name), bool):
if settings[name] == "True":
settings[name] = True
elif settings[name] == "False":
settings[name] = False
else:
raise ValueError
else:
settings[name] = type(getattr(current_args, name))(settings[name])
except ValueError:
error = True
interface.print_color(
text=f"Cannot cast setting {name} (={settings[name]}) to {type(getattr(current_args, name))}.",
color="red",
)
else:
interface.print_color(text=f"There is no '{name}' setting.", color="red")
# Assumption: `generate_flags` is a list of strings, each string being a `flag=value` pair, that can be parsed
# into a json string if we:
# 1. Add quotes around each flag name
generate_flags_as_dict = {'"' + flag.split("=")[0] + '"': flag.split("=")[1] for flag in generate_flags}
if error:
interface.print_color(
text="There was an issue parsing the settings. No settings have been changed.",
color="red",
# 2. Handle types:
# 2. a. booleans should be lowercase, None should be null
generate_flags_as_dict = {
k: v.lower() if v.lower() in ["true", "false"] else v for k, v in generate_flags_as_dict.items()
}
generate_flags_as_dict = {k: "null" if v == "None" else v for k, v in generate_flags_as_dict.items()}
# 2. b. strings should be quoted
def is_number(s: str) -> bool:
return s.replace(".", "", 1).isdigit()
generate_flags_as_dict = {k: f'"{v}"' if not is_number(v) else v for k, v in generate_flags_as_dict.items()}
# 2. c. [no processing needed] lists are lists of ints because `generate` doesn't take lists of strings :)
# We also mention in the help message that we only accept lists of ints for now.
# 3. Join the result into a comma-separated string
generate_flags_string = ", ".join([f"{k}: {v}" for k, v in generate_flags_as_dict.items()])
# 4. Add the opening/closing brackets
generate_flags_string = "{" + generate_flags_string + "}"
# 5. Remove quotes around boolean/null and around lists
generate_flags_string = generate_flags_string.replace('"null"', "null")
generate_flags_string = generate_flags_string.replace('"true"', "true")
generate_flags_string = generate_flags_string.replace('"false"', "false")
generate_flags_string = generate_flags_string.replace('"[', "[")
generate_flags_string = generate_flags_string.replace(']"', "]")
# 6. Replace the `=` with `:`
generate_flags_string = generate_flags_string.replace("=", ":")
try:
processed_generate_flags = json.loads(generate_flags_string)
except json.JSONDecodeError:
raise ValueError(
"Failed to convert `generate_flags` into a valid JSON object."
"\n`generate_flags` = {generate_flags}"
"\nConverted JSON string = {generate_flags_string}"
)
return processed_generate_flags
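To make the string-to-JSON conversion above concrete, a short walk-through with made-up flag values (only `json.loads` is actually called here):

```python
import json

# Made-up input in the `flag=value` format accepted by the CLI.
generate_flags = ["max_new_tokens=100", "do_sample=False", "eos_token_id=[1,2]"]
# Steps 1-6 above turn it into this JSON string:
json_string = '{"max_new_tokens": 100, "do_sample": false, "eos_token_id": [1,2]}'
print(json.loads(json_string))
# {'max_new_tokens': 100, 'do_sample': False, 'eos_token_id': [1, 2]}
```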
def get_generation_parameterization(
self, args: ChatArguments, tokenizer: AutoTokenizer
) -> tuple[GenerationConfig, dict]:
"""
Returns a GenerationConfig object holding the generation parameters for the CLI command.
"""
# No generation config arg provided -> use base generation config, apply CLI defaults
if args.generation_config is None:
generation_config = GenerationConfig()
# Apply deprecated CLI args on top of the default generation config
pad_token_id, eos_token_ids = self.parse_eos_tokens(tokenizer, args.eos_tokens, args.eos_token_ids)
deprecated_kwargs = {
"max_new_tokens": args.max_new_tokens,
"do_sample": args.do_sample,
"num_beams": args.num_beams,
"temperature": args.temperature,
"top_k": args.top_k,
"top_p": args.top_p,
"repetition_penalty": args.repetition_penalty,
"pad_token_id": pad_token_id,
"eos_token_id": eos_token_ids,
}
generation_config.update(**deprecated_kwargs)
# generation config arg provided -> use it as the base parameterization
else:
for name in settings:
setattr(current_args, name, settings[name])
interface.print_color(text=f"Set {name} to {settings[name]}.", color="green")
if ".json" in args.generation_config: # is a local file
dirname = os.path.dirname(args.generation_config)
filename = os.path.basename(args.generation_config)
generation_config = GenerationConfig.from_pretrained(dirname, filename)
else:
generation_config = GenerationConfig.from_pretrained(args.generation_config)
time.sleep(1.5) # so the user has time to read the changes
return current_args, not error
# Finally: parse and apply `generate_flags`
parsed_generate_flags = self.parse_generate_flags(args.generate_flags)
model_kwargs = generation_config.update(**parsed_generate_flags)
# `model_kwargs` contain non-generation flags in `parsed_generate_flags` that should be passed directly to
# `generate`
return generation_config, model_kwargs
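The split between `generation_config` and `model_kwargs` relies on `GenerationConfig.update` returning the kwargs it does not recognise; a minimal sketch, where `foo_bar` is a made-up flag:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig()
leftover = gen_config.update(max_new_tokens=64, foo_bar=True)  # `foo_bar` is made up
print(gen_config.max_new_tokens)  # 64 -> applied to the config
print(leftover)                   # {'foo_bar': True} -> forwarded to `generate`
```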
@staticmethod
def parse_eos_tokens(
@ -406,36 +525,6 @@ class ChatCommand(BaseTransformersCLICommand):
return pad_token_id, all_eos_token_ids
@staticmethod
def is_valid_setting_command(s: str) -> bool:
# First check the basic structure
if not s.startswith("set ") or "=" not in s:
return False
# Split into individual assignments
assignments = [a.strip() for a in s[4:].split(";") if a.strip()]
for assignment in assignments:
# Each assignment should have exactly one '='
if assignment.count("=") != 1:
return False
key, value = assignment.split("=", 1)
key = key.strip()
value = value.strip()
if not key or not value:
return False
# Keys can only have alphabetic characters, spaces and underscores
if not set(key).issubset(ALLOWED_KEY_CHARS):
return False
# Values can have just about anything that isn't a semicolon
if not set(value).issubset(ALLOWED_VALUE_CHARS):
return False
return True
# -----------------------------------------------------------------------------------------------------------------
# Model loading and performance automation methods
@staticmethod
@ -460,7 +549,7 @@ class ChatCommand(BaseTransformersCLICommand):
def load_model_and_tokenizer(self, args: ChatArguments) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
args.model_name_or_path_positional,
revision=args.model_revision,
trust_remote_code=args.trust_remote_code,
)
@ -475,7 +564,7 @@ class ChatCommand(BaseTransformersCLICommand):
"quantization_config": quantization_config,
}
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path, trust_remote_code=args.trust_remote_code, **model_kwargs
args.model_name_or_path_positional, trust_remote_code=args.trust_remote_code, **model_kwargs
)
if getattr(model, "hf_device_map", None) is None:
@ -483,6 +572,88 @@ class ChatCommand(BaseTransformersCLICommand):
return model, tokenizer
# -----------------------------------------------------------------------------------------------------------------
# User commands
def handle_non_exit_user_commands(
self,
user_input: str,
args: ChatArguments,
interface: RichInterface,
examples: dict[str, dict[str, str]],
generation_config: GenerationConfig,
model_kwargs: dict,
chat: list[dict],
) -> tuple[list[dict], GenerationConfig, dict]:
"""
Handles all user commands except for `!exit`. May update the chat history (e.g. reset it) or the
generation config (e.g. set a new flag).
"""
if user_input == "!clear":
chat = self.clear_chat_history(args.system_prompt)
interface.clear()
elif user_input == "!help":
interface.print_help()
elif user_input.startswith("!save") and len(user_input.split()) < 2:
split_input = user_input.split()
if len(split_input) == 2:
filename = split_input[1]
else:
filename = None
filename = self.save_chat(chat, args, filename)
interface.print_color(text=f"Chat saved in {filename}!", color="green")
elif user_input.startswith("!set"):
# splits the new args into a list of strings, each string being a `flag=value` pair (same format as
# `generate_flags`)
new_generate_flags = user_input[4:].strip()
new_generate_flags = new_generate_flags.split()
# sanity check: each member in the list must have an =
for flag in new_generate_flags:
if "=" not in flag:
interface.print_color(
text=(
f"Invalid flag format, missing `=` after `{flag}`. Please use the format "
"`arg_1=value_1 arg_2=value_2 ...`."
),
color="red",
)
break
else:
# parses the new args into a dictionary of `generate` kwargs, and updates the corresponding variables
parsed_new_generate_flags = self.parse_generate_flags(new_generate_flags)
new_model_kwargs = generation_config.update(**parsed_new_generate_flags)
model_kwargs.update(**new_model_kwargs)
elif user_input.startswith("!example") and len(user_input.split()) == 2:
example_name = user_input.split()[1]
if example_name in examples:
interface.clear()
chat = []
interface.print_user_message(examples[example_name]["text"])
chat.append({"role": "user", "content": examples[example_name]["text"]})
else:
example_error = (
f"Example {example_name} not found in list of available examples: {list(examples.keys())}."
)
interface.print_color(text=example_error, color="red")
elif user_input == "!status":
interface.print_status(
model_name=args.model_name_or_path_positional,
generation_config=generation_config,
model_kwargs=model_kwargs,
)
else:
interface.print_color(text=f"'{user_input}' is not a valid command. Showing help message.", color="red")
interface.print_help()
return chat, generation_config, model_kwargs
# -----------------------------------------------------------------------------------------------------------------
# Main logic
def run(self):
@ -498,8 +669,6 @@ class ChatCommand(BaseTransformersCLICommand):
with open(args.examples_path) as f:
examples = yaml.safe_load(f)
current_args = copy.deepcopy(args)
if args.user is None:
user = self.get_username()
else:
@ -507,12 +676,11 @@ class ChatCommand(BaseTransformersCLICommand):
model, tokenizer = self.load_model_and_tokenizer(args)
generation_streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
generation_config, model_kwargs = self.get_generation_parameterization(args, tokenizer)
pad_token_id, eos_token_ids = self.parse_eos_tokens(tokenizer, args.eos_tokens, args.eos_token_ids)
interface = RichInterface(model_name=args.model_name_or_path, user_name=user)
interface = RichInterface(model_name=args.model_name_or_path_positional, user_name=user)
interface.clear()
chat = self.clear_chat_history(current_args.system_prompt)
chat = self.clear_chat_history(args.system_prompt)
# Starts the session with a minimal help message at the top, so that a user doesn't get stuck
interface.print_help(minimal=True)
@ -520,57 +688,26 @@ class ChatCommand(BaseTransformersCLICommand):
try:
user_input = interface.input()
if user_input == "clear":
chat = self.clear_chat_history(current_args.system_prompt)
interface.clear()
continue
if user_input == "help":
interface.print_help()
continue
if user_input == "exit":
break
if user_input == "reset":
interface.clear()
current_args = copy.deepcopy(args)
chat = self.clear_chat_history(current_args.system_prompt)
continue
if user_input.startswith("save") and len(user_input.split()) < 2:
split_input = user_input.split()
if len(split_input) == 2:
filename = split_input[1]
# User commands
if user_input.startswith("!"):
# `!exit` is special, it breaks the loop
if user_input == "!exit":
break
else:
filename = None
filename = self.save_chat(chat, current_args, filename)
interface.print_color(text=f"Chat saved in {filename}!", color="green")
continue
if self.is_valid_setting_command(user_input):
current_args, success = self.parse_settings(user_input, current_args, interface)
if success:
chat = []
interface.clear()
continue
if user_input.startswith("example") and len(user_input.split()) == 2:
example_name = user_input.split()[1]
if example_name in examples:
interface.clear()
chat = []
interface.print_user_message(examples[example_name]["text"])
user_input = examples[example_name]["text"]
else:
example_error = (
f"Example {example_name} not found in list of available examples: {list(examples.keys())}."
chat, generation_config, model_kwargs = self.handle_non_exit_user_commands(
user_input=user_input,
args=args,
interface=interface,
examples=examples,
generation_config=generation_config,
model_kwargs=model_kwargs,
chat=chat,
)
interface.print_color(text=example_error, color="red")
# `!example` sends a user message to the model
if not user_input.startswith("!example"):
continue
chat.append({"role": "user", "content": user_input})
else:
chat.append({"role": "user", "content": user_input})
inputs = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(
model.device
@ -580,15 +717,8 @@ class ChatCommand(BaseTransformersCLICommand):
"inputs": inputs,
"attention_mask": attention_mask,
"streamer": generation_streamer,
"max_new_tokens": current_args.max_new_tokens,
"do_sample": current_args.do_sample,
"num_beams": current_args.num_beams,
"temperature": current_args.temperature,
"top_k": current_args.top_k,
"top_p": current_args.top_p,
"repetition_penalty": current_args.repetition_penalty,
"pad_token_id": pad_token_id,
"eos_token_id": eos_token_ids,
"generation_config": generation_config,
**model_kwargs,
}
thread = Thread(target=model.generate, kwargs=generation_kwargs)

View File

@ -833,6 +833,7 @@ class PretrainedConfig(PushToHubMixin):
if "model_type" in value:
# Needs to be set even if it's not in the diff
diff["model_type"] = value["model_type"]
serializable_config_dict[key] = diff
elif (
key not in default_config_dict
@ -1003,6 +1004,8 @@ class PretrainedConfig(PushToHubMixin):
del d["_commit_hash"]
if "_attn_implementation_internal" in d:
del d["_attn_implementation_internal"]
if "_attn_implementation_autoset" in d:
del d["_attn_implementation_autoset"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in d:
del d["base_model_tp_plan"]

View File

@ -73,7 +73,7 @@ class MaxLengthCriteria(StoppingCriteria):
@add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
cur_len = input_ids.shape[-1]
cur_len = input_ids.shape[1]
is_done = cur_len >= self.max_length
if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
logger.warning_once(

View File

@ -563,7 +563,7 @@ class GenerationMixin:
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
else:
batch_size, sequence_length = model_inputs[input_ids_key].shape
batch_size, sequence_length = model_inputs[input_ids_key].shape[:2]
# Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
# the 4D causal mask exists, it should be present in the base model (XXXModel class) or in its decoder.
@ -1708,9 +1708,11 @@ class GenerationMixin:
return generation_config, model_kwargs
def _get_initial_cache_position(self, input_ids, model_kwargs):
def _get_initial_cache_position(self, seq_length, device, model_kwargs):
"""Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
# `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
if "cache_position" in model_kwargs and model_kwargs["cache_position"]:
return model_kwargs
if "inputs_embeds" in model_kwargs and not self.config.is_encoder_decoder:
cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
elif "decoder_inputs_embeds" in model_kwargs and self.config.is_encoder_decoder:
@ -1718,7 +1720,7 @@ class GenerationMixin:
torch.ones_like(model_kwargs["decoder_inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
)
else:
cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1
cache_position = torch.ones(seq_length, dtype=torch.int64, device=device).cumsum(0) - 1
past_length = 0
if model_kwargs.get("past_key_values") is not None:
@ -2332,7 +2334,7 @@ class GenerationMixin:
streamer.put(input_ids.cpu())
# 6. Prepare `max_length` depending on other stopping criteria.
input_ids_length = input_ids.shape[-1]
input_ids_length = input_ids.shape[1]
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
generation_config = self._prepare_generated_length(
@ -2805,9 +2807,9 @@ class GenerationMixin:
decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
# keep track of which sequences are already finished
batch_size = input_ids.shape[0]
batch_size, cur_length = input_ids.shape[:2]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_length, input_ids.device, model_kwargs)
this_peer_finished = False
@ -3016,9 +3018,9 @@ class GenerationMixin:
)
# keep track of which sequences are already finished
batch_size = input_ids.shape[0]
batch_size, cur_len = input_ids.shape[:2]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
# Create cosine_matrix_mask based on the attention_mask
cosine_matrix_mask = torch.ones_like(input_ids, dtype=torch.long)
@ -3428,10 +3430,10 @@ class GenerationMixin:
)
# keep track of which sequences are already finished
batch_size, cur_len = input_ids.shape
batch_size, cur_len = input_ids.shape[:2]
this_peer_finished = False
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
model_forward = self.__call__
compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
@ -3834,7 +3836,7 @@ class GenerationMixin:
num_beams = generation_config.num_beams
num_return_sequences = generation_config.num_return_sequences
batch_size_unflattened, cur_len = input_ids.shape
batch_size_unflattened, cur_len = input_ids.shape[:2]
batch_size = batch_size_unflattened // num_beams
# TODO (joao): standardize special cases
if self.__class__.__name__ == "MoshiDepthDecoder":
@ -3857,7 +3859,7 @@ class GenerationMixin:
dim=0,
).to(input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
# (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there
# are newer low-memory alternatives like the offloaded cache)
@ -4156,7 +4158,7 @@ class GenerationMixin:
device = input_ids.device
batch_beam_size, cur_len = input_ids.shape
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
if return_dict_in_generate and output_scores:
beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)]
@ -4190,7 +4192,7 @@ class GenerationMixin:
this_peer_finished = False
decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
decoder_prompt_len = input_ids.shape[1] # record the prompt length of decoder
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# predicted tokens in cur_len step
current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)
@ -4444,8 +4446,8 @@ class GenerationMixin:
batch_size = len(constrained_beam_scorer._beam_hyps)
num_beams = constrained_beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
batch_beam_size, cur_len = input_ids.shape[:2]
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
if num_beams * batch_size != batch_beam_size:
raise ValueError(
@ -4477,7 +4479,7 @@ class GenerationMixin:
this_peer_finished = False
decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
decoder_prompt_len = input_ids.shape[1] # record the prompt length of decoder
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
@ -4698,14 +4700,14 @@ class GenerationMixin:
)
# keep track of which sequences are already finished
batch_size = input_ids.shape[0]
batch_size, cur_len = input_ids.shape[:2]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
this_peer_finished = False
is_first_iteration = True # to preserve the same API in the output as other generation methods
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
cur_len = input_ids.shape[-1]
cur_len = input_ids.shape[1]
# 1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device
candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
@ -4795,7 +4797,7 @@ class GenerationMixin:
input_ids = torch.cat((input_ids, valid_tokens), dim=-1)
if streamer is not None:
streamer.put(valid_tokens.cpu())
new_cur_len = input_ids.shape[-1]
new_cur_len = input_ids.shape[1]
# 4.2. Discard past key values relative to unused assistant tokens
new_cache_size = new_cur_len - 1

View File

@ -46,7 +46,7 @@ from .image_utils import (
from .processing_utils import Unpack
from .utils import (
TensorType,
add_start_docstrings,
auto_docstring,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
@ -190,107 +190,7 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False):
device: Optional["torch.device"]
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
Args:
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `self.size`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
Whether to default to a square image when resizing, if size is an int.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `resample` parameter in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`Dict[str, int]` *optional*, defaults to `self.crop_size`):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
overridden by the `rescale_factor` parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
device (`torch.device`, *optional*, defaults to `self.device`):
The device to process the images on. If unset, the device is inferred from the input images."""
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Describes the maximum input dimensions to the model.
resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the output image after applying `center_crop`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
device (`torch.device`, *optional*, defaults to `self.device`):
The device to process the images on. If unset, the device is inferred from the input images."""
@add_start_docstrings(
"Constructs a fast base image processor.",
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
)
@auto_docstring
class BaseImageProcessorFast(BaseImageProcessor):
resample = None
image_mean = None
@ -349,7 +249,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
Image to resize.
size (`SizeDict`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
Returns:
@ -666,7 +566,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
data_format=data_format,
)
@add_start_docstrings(BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS)
@auto_docstring
def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature:
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
# Set default kwargs from self. This ensures that if a kwarg is not provided

View File

@ -56,7 +56,9 @@ def to_channel_dimension_format(
input_channel_dim: Optional[Union[ChannelDimension, str]] = None,
) -> np.ndarray:
"""
Converts `image` to the channel dimension format specified by `channel_dim`.
Converts `image` to the channel dimension format specified by `channel_dim`. The input
can have an arbitrary number of leading dimensions. Only the last three dimensions will be permuted
to format the `image`.
Args:
image (`numpy.ndarray`):
@ -80,9 +82,11 @@ def to_channel_dimension_format(
return image
if target_channel_dim == ChannelDimension.FIRST:
image = image.transpose((2, 0, 1))
axes = list(range(image.ndim - 3)) + [image.ndim - 1, image.ndim - 3, image.ndim - 2]
image = image.transpose(axes)
elif target_channel_dim == ChannelDimension.LAST:
image = image.transpose((1, 2, 0))
axes = list(range(image.ndim - 3)) + [image.ndim - 2, image.ndim - 1, image.ndim - 3]
image = image.transpose(axes)
else:
raise ValueError(f"Unsupported channel dimension format: {channel_dim}")
@ -751,7 +755,7 @@ def pad(
values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0))
# Add additional padding if there's a batch dimension
values = (0, *values) if image.ndim == 4 else values
values = ((0, 0), *values) if image.ndim == 4 else values
return values
padding = _expand_for_data_format(padding)

View File

@ -15,11 +15,9 @@
import base64
import os
from collections.abc import Iterable
from contextlib import redirect_stdout
from dataclasses import dataclass
from io import BytesIO
from typing import Callable, Optional, Union
from urllib.parse import urlparse
from typing import Optional, Union
import numpy as np
import requests
@ -27,9 +25,6 @@ from packaging import version
from .utils import (
ExplicitEnum,
is_av_available,
is_cv2_available,
is_decord_available,
is_jax_tensor,
is_numpy_array,
is_tf_tensor,
@ -37,7 +32,6 @@ from .utils import (
is_torch_tensor,
is_torchvision_available,
is_vision_available,
is_yt_dlp_available,
logging,
requires_backends,
to_numpy,
@ -62,7 +56,6 @@ if is_vision_available():
PILImageResampling = PIL.Image
if is_torchvision_available():
from torchvision import io as torchvision_io
from torchvision.transforms import InterpolationMode
pil_torch_interpolation_mapping = {
@ -89,18 +82,6 @@ ImageInput = Union[
] # noqa
VideoInput = Union[
list["PIL.Image.Image"],
"np.ndarray",
"torch.Tensor",
list["np.ndarray"],
list["torch.Tensor"],
list[list["PIL.Image.Image"]],
list[list["np.ndarray"]],
list[list["torch.Tensor"]],
] # noqa
class ChannelDimension(ExplicitEnum):
FIRST = "channels_first"
LAST = "channels_last"
@ -116,14 +97,6 @@ class AnnotionFormat(ExplicitEnum):
COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
@dataclass
class VideoMetadata:
total_num_frames: int
fps: float
duration: float
video_backend: str
AnnotationType = dict[str, Union[int, str, list[dict]]]
@ -309,37 +282,6 @@ def make_nested_list_of_images(
raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
def make_batched_videos(videos) -> VideoInput:
"""
Ensure that the input is a list of videos.
Args:
videos (`VideoInput`):
Video or videos to turn into a list of videos.
Returns:
list: A list of videos.
"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
# case 1: nested batch of videos so we flatten it
if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
videos = [[video for batch_list in batched_videos for video in batch_list] for batched_videos in videos]
# case 2: list of videos represented as list of video frames
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
if is_pil_image(videos[0]) or videos[0].ndim == 3:
return [videos]
elif videos[0].ndim == 4:
return [list(video) for video in videos]
elif is_valid_image(videos):
if is_pil_image(videos) or videos.ndim == 3:
return [[videos]]
elif videos.ndim == 4:
return [list(videos)]
raise ValueError(f"Could not make batched video from {videos}")
def to_numpy_array(img) -> np.ndarray:
if not is_valid_image(img):
raise ValueError(f"Invalid image type: {type(img)}")
@ -371,6 +313,8 @@ def infer_channel_dimension_format(
first_dim, last_dim = 0, 2
elif image.ndim == 4:
first_dim, last_dim = 1, 3
elif image.ndim == 5:
first_dim, last_dim = 2, 4
else:
raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
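A short sketch of what the new `ndim == 5` branch enables: the same channel check is run on a batched video array, with the candidate dimensions shifted by two leading axes (shape chosen for illustration):
```python
import numpy as np

video = np.zeros((2, 8, 3, 224, 224))  # (batch, frames, channels, height, width)
first_dim, last_dim = 2, 4             # the values selected by the new 5D branch
# The caller then checks which candidate dimension holds 1 or 3 channels.
print("channels_first" if video.shape[first_dim] in (1, 3) else "channels_last")
```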
@ -548,348 +492,6 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] =
return image
def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None, **kwargs):
"""
A default sampling function that replicates the logic used in get_uniform_frame_indices,
while optionally handling `fps` if `num_frames` is not provided.
Args:
metadata (`VideoMetadata`):
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
num_frames (`int`, *optional*):
Number of frames to sample uniformly.
fps (`int`, *optional*):
Desired frames per second. Takes priority over num_frames if both are provided.
Returns:
`np.ndarray`: Array of frame indices to sample.
"""
total_num_frames = metadata.total_num_frames
video_fps = metadata.fps
# If num_frames is not given but fps is, calculate num_frames from fps
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we computed num_frames={num_frames} "
f"which exceeds total_num_frames={total_num_frames}. Check fps or video metadata."
)
if num_frames is not None:
indices = np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
else:
indices = np.arange(0, total_num_frames, dtype=int)
return indices
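Worked example of the sampling arithmetic above with made-up numbers (a 10-second clip at 30 fps, sampled at 2 fps):
```python
import numpy as np

total_num_frames, video_fps, fps = 300, 30.0, 2
num_frames = int(total_num_frames / video_fps * fps)   # 300 / 30 * 2 = 20
indices = np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
print(num_frames, indices[:5])                          # 20 [ 0 15 30 45 60]
```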
def read_video_opencv(
video_path: str,
sample_indices_fn: Callable,
**kwargs,
):
"""
Decode a video using the OpenCV backend.
Args:
video_path (`str`):
Path to the video file.
sample_indices_fn (`Callable`):
A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
If not provided, simple uniform sampling with fps is performed.
Example:
def sample_indices_fn(metadata, **kwargs):
return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
Returns:
Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import cv2
requires_backends(read_video_opencv, ["cv2"])
import cv2
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
video_fps = video.get(cv2.CAP_PROP_FPS)
duration = total_num_frames / video_fps if video_fps else 0
metadata = VideoMetadata(
total_num_frames=int(total_num_frames), fps=float(video_fps), duration=float(duration), video_backend="opencv"
)
indices = sample_indices_fn(metadata=metadata, **kwargs)
index = 0
frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
if index in indices:
height, width, channel = frame.shape
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frames.append(frame[0:height, 0:width, 0:channel])
if success:
index += 1
if index >= total_num_frames:
break
video.release()
metadata.frames_indices = indices
return np.stack(frames), metadata
def read_video_decord(
video_path: str,
sample_indices_fn: Optional[Callable] = None,
**kwargs,
):
"""
Decode a video using the Decord backend.
Args:
video_path (`str`):
Path to the video file.
sample_indices_fn (`Callable`, *optional*):
A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
If not provided, simple uniform sampling with fps is performed.
Example:
def sample_indices_fn(metadata, **kwargs):
return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
Returns:
Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import from decord
requires_backends(read_video_decord, ["decord"])
from decord import VideoReader, cpu
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
video_fps = vr.get_avg_fps()
total_num_frames = len(vr)
duration = total_num_frames / video_fps if video_fps else 0
metadata = VideoMetadata(
total_num_frames=int(total_num_frames), fps=float(video_fps), duration=float(duration), video_backend="decord"
)
indices = sample_indices_fn(metadata=metadata, **kwargs)
frames = vr.get_batch(indices).asnumpy()
metadata.frames_indices = indices
return frames, metadata
def read_video_pyav(
video_path: str,
sample_indices_fn: Callable,
**kwargs,
):
"""
Decode the video with PyAV decoder.
Args:
video_path (`str`):
Path to the video file.
sample_indices_fn (`Callable`, *optional*):
A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
If not provided, simple uniform sampling with fps is performed.
Example:
def sample_indices_fn(metadata, **kwargs):
return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
Returns:
Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import av
requires_backends(read_video_pyav, ["av"])
import av
container = av.open(video_path)
total_num_frames = container.streams.video[0].frames
video_fps = container.streams.video[0].average_rate # should we better use `av_guess_frame_rate`?
duration = total_num_frames / video_fps if video_fps else 0
metadata = VideoMetadata(
total_num_frames=int(total_num_frames), fps=float(video_fps), duration=float(duration), video_backend="pyav"
)
indices = sample_indices_fn(metadata=metadata, **kwargs)
frames = []
container.seek(0)
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= 0 and i in indices:
frames.append(frame)
video = np.stack([x.to_ndarray(format="rgb24") for x in frames])
metadata.frames_indices = indices
return video, metadata
def read_video_torchvision(
video_path: str,
sample_indices_fn: Callable,
**kwargs,
):
"""
Decode the video with torchvision decoder.
Args:
video_path (`str`):
Path to the video file.
sample_indices_fn (`Callable`, *optional*):
A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
If not provided, simple uniform sampling with fps is performed.
Example:
def sample_indices_fn(metadata, **kwargs):
return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
Returns:
Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
video, _, info = torchvision_io.read_video(
video_path,
start_pts=0.0,
end_pts=None,
pts_unit="sec",
output_format="THWC",
)
video_fps = info["video_fps"]
total_num_frames = video.size(0)
duration = total_num_frames / video_fps if video_fps else 0
metadata = VideoMetadata(
total_num_frames=int(total_num_frames),
fps=float(video_fps),
duration=float(duration),
video_backend="torchvision",
)
indices = sample_indices_fn(metadata=metadata, **kwargs)
video = video[indices].contiguous().numpy()
metadata.frames_indices = indices
return video, metadata
VIDEO_DECODERS = {
"decord": read_video_decord,
"opencv": read_video_opencv,
"pyav": read_video_pyav,
"torchvision": read_video_torchvision,
}
def load_video(
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
sample_indices_fn: Optional[Callable] = None,
**kwargs,
) -> np.array:
"""
Loads `video` to a numpy array.
Args:
video (`str` or `VideoInput`):
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"opencv"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
sample_indices_fn (`Callable`, *optional*):
A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
a different sampling technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
If not provided, simple uniform sampling with fps is performed; otherwise `sample_indices_fn` takes priority over the other args.
The function receives all args and kwargs passed to `load_video` and should output valid
indices at which the video should be sampled. For example:
Example:
def sample_indices_fn(metadata, **kwargs):
return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)
Returns:
Tuple[`np.array`, Dict]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- Metadata dictionary.
"""
# If `sample_indices_fn` is given, we can accept any args as those might be needed by custom `sample_indices_fn`
if fps is not None and num_frames is not None and sample_indices_fn is None:
raise ValueError(
"`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!"
)
# If user didn't pass a sampling function, create one on the fly with default logic
if sample_indices_fn is None:
def sample_indices_fn_func(metadata, **fn_kwargs):
return default_sample_indices_fn(metadata, num_frames=num_frames, fps=fps, **fn_kwargs)
sample_indices_fn = sample_indices_fn_func
if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
# Lazy import from yt_dlp
requires_backends(load_video, ["yt_dlp"])
from yt_dlp import YoutubeDL
buffer = BytesIO()
with redirect_stdout(buffer), YoutubeDL() as f:
f.download([video])
bytes_obj = buffer.getvalue()
file_obj = BytesIO(bytes_obj)
elif video.startswith("http://") or video.startswith("https://"):
file_obj = BytesIO(requests.get(video).content)
elif os.path.isfile(video):
file_obj = video
elif is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
file_obj = None
else:
raise TypeError("Incorrect format used for video. Should be a URL linking to a video or a local path.")
# can also load with decord, but not cv2/torchvision
# both will fail in case of url links
video_is_url = video.startswith("http://") or video.startswith("https://")
if video_is_url and backend in ["opencv", "torchvision"]:
raise ValueError(
"If you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backend"
)
if file_obj is None:
return video
if (
(not is_decord_available() and backend == "decord")
or (not is_av_available() and backend == "pyav")
or (not is_cv2_available() and backend == "opencv")
or (not is_torchvision_available() and backend == "torchvision")
):
raise ImportError(
f"You chose backend={backend} for loading the video but the required library is not found in your environment "
f"Make sure to install {backend} before loading the video."
)
video_decoder = VIDEO_DECODERS[backend]
video, metadata = video_decoder(file_obj, sample_indices_fn, **kwargs)
return video, metadata
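A hedged sketch of a custom `sample_indices_fn` that could be passed to `load_video`; the metadata object below is a stand-in with the same fields as `VideoMetadata`, and the frame counts are made up:
```python
import numpy as np

class FakeMetadata:  # stand-in for VideoMetadata, values are illustrative
    total_num_frames = 120
    fps = 30.0

def sample_indices_fn(metadata, **kwargs):
    # Take four evenly spaced frames from the second half of the clip.
    start = metadata.total_num_frames // 2
    return np.linspace(start, metadata.total_num_frames - 1, 4, dtype=int)

print(sample_indices_fn(FakeMetadata()))  # [ 60  79  99 119]
```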
def load_images(
images: Union[list, tuple, str, "PIL.Image.Image"], timeout: Optional[float] = None
) -> Union["PIL.Image.Image", list["PIL.Image.Image"], list[list["PIL.Image.Image"]]]:

View File

@ -122,7 +122,7 @@ def make_flex_block_causal_mask(
if attention_chunk_size is not None:
# we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
document_ids = (document_ids.fill_(1).cumsum(-1) - 1) // (attention_chunk_size)
chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // (attention_chunk_size)
# Instead of passing a tensor mask, flex attention requires a mask_mod function
# that determines which elements of QK^T should be included in the attention
@ -143,6 +143,16 @@ def make_flex_block_causal_mask(
final_mask = causal_mask & padding_mask & document_mask
return final_mask
def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
"""
Combines the chunk mask with the causal mask for chunked attention.
"""
chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
return chunk_mask & causal_doc_mask
mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod
if offsets is not None:
q_offset = offsets[0]
kv_offset = offsets[1]
@ -150,9 +160,9 @@ def make_flex_block_causal_mask(
def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
offset_q = q_idx + q_offset
offset_kv = kv_idx + kv_offset
return causal_mask_mod(batch_idx, head_idx, offset_q, offset_kv)
return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
else:
mask_mod = causal_mask_mod
mask_mod = mask_mod_maybe_combined
return create_block_causal_mask_flex(
mask_mod=mask_mod,
B=batch_size,
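Standalone worked example of the chunk bookkeeping above (simplified: the document and padding masks are folded into a plain causal check), using `attention_chunk_size=3` on a length-9 sequence:
```python
import torch

attention_chunk_size = 3
document_ids = torch.ones(1, 9, dtype=torch.long)
chunk_idxs = (document_ids.cumsum(-1) - 1) // attention_chunk_size
print(chunk_idxs)  # tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2]])

def chunk_causal(batch_idx, q_idx, kv_idx):
    same_chunk = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
    return bool(same_chunk) and kv_idx <= q_idx

print(chunk_causal(0, 4, 3))  # True: same chunk and causal
print(chunk_causal(0, 4, 2))  # False: kv token falls in the previous chunk
```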

View File

@ -158,4 +158,5 @@ LOSS_MAPPING = {
"RTDetrForObjectDetection": RTDetrForObjectDetectionLoss,
"RTDetrV2ForObjectDetection": RTDetrForObjectDetectionLoss,
"DFineForObjectDetection": DFineForObjectDetectionLoss,
"CsmForConditionalGeneration": ForCausalLMLoss,
}

View File

@ -585,8 +585,8 @@ def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_
loaded_pt_weights_data_ptr = {}
missing_keys_pt = []
for pt_weight_name, pt_weight in current_pt_params_dict.items():
# Handle PyTorch shared weight ()not duplicated in TF 2.0
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
# Handle PyTorch shared weight not duplicated in TF 2.0
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr and pt_weight.data_ptr() != 0:
new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
continue
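Minimal sketch (illustrative only) of the de-duplication logic above: shared parameters are detected by storage pointer, and a pointer of 0 is never treated as shared, so unrelated zero-pointer tensors do not alias each other:
```python
import torch

params = {"a": torch.ones(2), "b": torch.ones(3)}
params["c"] = params["a"]  # a genuinely shared weight: same storage as "a"

seen = {}
for name, p in params.items():
    ptr = p.data_ptr()
    if ptr in seen and ptr != 0:
        print(f"{name} shares storage with {seen[ptr]}")  # c shares storage with a
        continue
    seen[ptr] = name
```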

View File

@ -114,6 +114,7 @@ from .utils import (
is_torch_npu_available,
is_torch_sdpa_available,
is_torch_xla_available,
is_torch_xpu_available,
logging,
replace_return_docstrings,
strtobool,
@ -215,6 +216,25 @@ TORCH_INIT_FUNCTIONS = {
"kaiming_normal": nn.init.kaiming_normal,
}
# DO NOT MODIFY, KEPT FOR BC ONLY
VLMS = [
"aria",
"ayavision",
"emu3",
"fuyu",
"gotocr2",
"gemma3",
"internvl",
"llava", # all llava prefixed models fall under this check
"mistral3",
"mllama",
"paligemma",
"qwen2vl",
"qwen2_5_vl",
"videollava",
"vipllava",
]
@contextmanager
def no_init_weights():
@ -1274,7 +1294,7 @@ def _get_device_map(
)
if device_map != "sequential":
max_memory = get_balanced_memory(
inferred_max_memory = get_balanced_memory(
model,
dtype=target_dtype,
low_zero=(device_map == "balanced_low_0"),
@ -1282,17 +1302,23 @@ def _get_device_map(
**device_map_kwargs,
)
else:
max_memory = get_max_memory(max_memory)
inferred_max_memory = get_max_memory(max_memory)
if hf_quantizer is not None:
max_memory = hf_quantizer.adjust_max_memory(max_memory)
inferred_max_memory = hf_quantizer.adjust_max_memory(inferred_max_memory)
# CUDA: `max_memory` contains non-reserved memory. There may be *unused* reserved memory in the GPU, which we
# can use to allocate parameters.
for device_name in max_memory.keys():
# `inferred_max_memory` contains non-reserved memory. There may be *unused* reserved memory in the GPU,
# which we can use to allocate parameters.
for device_name in inferred_max_memory.keys():
if isinstance(device_name, int): # it's a GPU device
unused_memory = torch.cuda.memory_reserved(device_name) - torch.cuda.memory_allocated(device_name)
max_memory[device_name] += unused_memory
device_map_kwargs["max_memory"] = max_memory
if is_torch_xpu_available():
unused_memory = torch.xpu.memory_reserved(device_name) - torch.xpu.memory_allocated(device_name)
else:
unused_memory = torch.cuda.memory_reserved(device_name) - torch.cuda.memory_allocated(device_name)
inferred_max_memory[device_name] += unused_memory
# respect the `max_memory` passed by the user
if max_memory is not None and device_name in max_memory:
inferred_max_memory[device_name] = min(inferred_max_memory[device_name], max_memory[device_name])
device_map_kwargs["max_memory"] = inferred_max_memory
device_map = infer_auto_device_map(model, dtype=target_dtype, **device_map_kwargs)
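A hedged sketch of the memory bookkeeping above: unused reserved memory is added back per accelerator device, then any user-provided `max_memory` cap is respected. The helper name and byte values are made up for illustration, and XPU handling is reduced to a CUDA/CPU fallback:
```python
import torch

def adjust_inferred_max_memory(inferred_max_memory, user_max_memory=None):
    for device_name in inferred_max_memory:
        if isinstance(device_name, int):  # accelerator device index
            if torch.cuda.is_available():
                unused = torch.cuda.memory_reserved(device_name) - torch.cuda.memory_allocated(device_name)
            else:
                unused = 0  # CPU-only fallback for this sketch
            inferred_max_memory[device_name] += unused
        # respect the cap passed by the user, if any
        if user_max_memory and device_name in user_max_memory:
            inferred_max_memory[device_name] = min(
                inferred_max_memory[device_name], user_max_memory[device_name]
            )
    return inferred_max_memory

print(adjust_inferred_max_memory({0: 8_000_000_000, "cpu": 32_000_000_000}, {0: 6_000_000_000}))
```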
@ -1771,6 +1797,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
main_input_name = "input_ids"
model_tags = None
_checkpoint_conversion_mapping = {} # used for BC support in VLMs, not meant to be used by new models
_auto_class = None
_no_split_modules = None
_skip_keys_device_placement = None
@ -3399,9 +3427,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
# Attach architecture to the config
model_to_save.config.architectures = [model_to_save.__class__.__name__]
# Unset attn implementation so it can be set to another one when loading back
model_to_save.config._attn_implementation_autoset = False
# If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
# loaded from the Hub.
if self._auto_class is not None:
@ -3477,6 +3502,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
module_map[name + f".{key}"] = module
state_dict = model_to_save.state_dict()
if any(allowed_name in self.__class__.__name__.lower() for allowed_name in VLMS):
reverse_key_mapping = {v: k for k, v in self._checkpoint_conversion_mapping.items()}
original_state_dict = {}
for key, value in state_dict.items():
for pattern, replacement in reverse_key_mapping.items():
replacement = replacement.lstrip("^") # strip off un-needed chars and patterns
replacement = re.sub(r"\(.*?\)", "", replacement)
key, n_replace = re.subn(pattern, replacement, key)
# Early exit of the loop
if n_replace > 0:
break
original_state_dict[key] = value
state_dict = original_state_dict
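Hedged sketch of the reverse key mapping performed above when saving a VLM; the conversion mapping and parameter name below are illustrative, not any particular model's real values:
```python
import re

checkpoint_conversion_mapping = {r"^language_model.model": "model.language_model"}
reverse_key_mapping = {v: k for k, v in checkpoint_conversion_mapping.items()}

key = "model.language_model.layers.0.self_attn.q_proj.weight"
for pattern, replacement in reverse_key_mapping.items():
    replacement = replacement.lstrip("^")              # drop the anchor from the original pattern
    replacement = re.sub(r"\(.*?\)", "", replacement)  # drop capture groups, if any
    key, n_replace = re.subn(pattern, replacement, key)
    if n_replace > 0:
        break
print(key)  # language_model.model.layers.0.self_attn.q_proj.weight
```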
# Translate state_dict from smp to hf if saving with smp >= 1.10
if IS_SAGEMAKER_MP_POST_1_10:
for smp_to_hf, _ in smp.state.module_manager.translate_functions:
@ -4064,7 +4104,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
gguf_file = kwargs.pop("gguf_file", None)
tp_plan = kwargs.pop("tp_plan", None)
tp_size = kwargs.pop("tp_size", None)
key_mapping = kwargs.pop("key_mapping", None)
# Load models with hardcoded key mapping on class for VLMs only, to keep BC and standardize model
if any(allowed_name in cls.__name__.lower() for allowed_name in VLMS):
key_mapping = kwargs.pop("key_mapping", cls._checkpoint_conversion_mapping)
else:
key_mapping = kwargs.pop("key_mapping", None)
# Not used anymore -- remove them from the kwargs
_ = kwargs.pop("resume_download", None)
_ = kwargs.pop("trust_remote_code", None)

View File

@ -69,6 +69,7 @@ if TYPE_CHECKING:
from .convnextv2 import *
from .cpm import *
from .cpmant import *
from .csm import *
from .ctrl import *
from .cvt import *
from .d_fine import *
@ -130,6 +131,7 @@ if TYPE_CHECKING:
from .granite import *
from .granite_speech import *
from .granitemoe import *
from .granitemoehybrid import *
from .granitemoeshared import *
from .grounding_dino import *
from .groupvit import *

View File

@ -41,22 +41,12 @@ from ...pytorch_utils import (
is_torch_greater_or_equal_than_2_2,
prune_linear_layer,
)
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
_CONFIG_FOR_DOC = "AlbertConfig"
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
@ -553,12 +543,8 @@ class AlbertTransformer(nn.Module):
)
@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = AlbertConfig
load_tf_weights = load_tf_weights_in_albert
base_model_prefix = "albert"
@ -617,81 +603,16 @@ class AlbertForPreTrainingOutput(ModelOutput):
attentions: Optional[Tuple[torch.FloatTensor]] = None
ALBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Args:
config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
)
@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
config_class = AlbertConfig
base_model_prefix = "albert"
def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.config = config
@ -733,12 +654,7 @@ class AlbertModel(AlbertPreTrainedModel):
inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -826,12 +742,11 @@ class AlbertModel(AlbertPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
`sentence order prediction (classification)` head.
""",
ALBERT_START_DOCSTRING,
"""
)
class AlbertForPreTraining(AlbertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
@ -855,8 +770,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
def get_input_embeddings(self) -> nn.Embedding:
return self.albert.embeddings.word_embeddings
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -881,8 +795,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
(see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
sequence B), `1` indicates switched order (sequence B, then sequence A).
Returns:
Example:
```python
@ -981,10 +893,7 @@ class AlbertSOPHead(nn.Module):
return logits
@add_start_docstrings(
"Albert Model with a `language modeling` head on top.",
ALBERT_START_DOCSTRING,
)
@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
@ -1007,8 +916,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
def get_input_embeddings(self) -> nn.Embedding:
return self.albert.embeddings.word_embeddings
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1028,8 +936,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Returns:
Example:
```python
@ -1093,12 +999,11 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,
"""
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
@ -1113,14 +1018,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="textattack/albert-base-v2-imdb",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'LABEL_1'",
expected_loss=0.12,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1194,13 +1092,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
)
@add_start_docstrings(
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
ALBERT_START_DOCSTRING,
)
@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
@ -1218,12 +1110,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1277,13 +1164,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
)
@add_start_docstrings(
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ALBERT_START_DOCSTRING,
)
@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
@ -1295,16 +1176,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="twmkn9/albert-base-v2-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=12,
qa_target_end_index=13,
expected_output="'a nice puppet'",
expected_loss=7.36,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1319,16 +1191,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[AlbertForPreTrainingOutput, Tuple]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
@ -1380,13 +1242,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
)
@add_start_docstrings(
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,
)
@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
def __init__(self, config: AlbertConfig):
super().__init__(config)
@ -1398,12 +1254,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1418,6 +1269,30 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[AlbertForPreTrainingOutput, Tuple]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see

View File

@ -31,154 +31,12 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_align import AlignConfig, AlignTextConfig, AlignVisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "kakaobrain/align-base"
_CONFIG_FOR_DOC = "AlignConfig"
ALIGN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`AlignConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALIGN_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
ALIGN_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`EfficientNetImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
ALIGN_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`EfficientNetImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@dataclass
class AlignVisionModelOutput(ModelOutput):
@ -1165,12 +1023,8 @@ class AlignTextPooler(nn.Module):
return pooled_output
@auto_docstring
class AlignPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = AlignConfig
base_model_prefix = "align"
supports_gradient_checkpointing = True
@ -1194,15 +1048,20 @@ class AlignPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0)
@add_start_docstrings(
"""The text model from ALIGN without any head or projection on top.""",
ALIGN_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The text model from ALIGN without any head or projection on top.
"""
)
class AlignTextModel(AlignPreTrainedModel):
config_class = AlignTextConfig
_no_split_modules = ["AlignTextEmbeddings"]
def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.config = config
@ -1220,8 +1079,7 @@ class AlignTextModel(AlignPreTrainedModel):
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
@add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=AlignTextConfig)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1235,8 +1093,6 @@ class AlignTextModel(AlignPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
Returns:
Examples:
```python
@ -1321,9 +1177,10 @@ class AlignTextModel(AlignPreTrainedModel):
)
@add_start_docstrings(
"""The vision model from ALIGN without any head or projection on top.""",
ALIGN_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The vision model from ALIGN without any head or projection on top.
"""
)
class AlignVisionModel(AlignPreTrainedModel):
config_class = AlignVisionConfig
@ -1350,8 +1207,7 @@ class AlignVisionModel(AlignPreTrainedModel):
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.convolution
@add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndNoAttention, config_class=AlignVisionConfig)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1359,8 +1215,6 @@ class AlignVisionModel(AlignPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
r"""
Returns:
Examples:
```python
@ -1410,7 +1264,7 @@ class AlignVisionModel(AlignPreTrainedModel):
)
@add_start_docstrings(ALIGN_START_DOCSTRING)
@auto_docstring
class AlignModel(AlignPreTrainedModel):
config_class = AlignConfig
@ -1444,7 +1298,7 @@ class AlignModel(AlignPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING)
@auto_docstring
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1497,7 +1351,7 @@ class AlignModel(AlignPreTrainedModel):
return text_features
@add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING)
@auto_docstring
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1542,8 +1396,7 @@ class AlignModel(AlignPreTrainedModel):
return image_features
@add_start_docstrings_to_model_forward(ALIGN_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AlignOutput, config_class=AlignConfig)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1559,7 +1412,8 @@ class AlignModel(AlignPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, AlignOutput]:
r"""
Returns:
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
Examples:

View File

@ -32,125 +32,12 @@ from ...modeling_outputs import (
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "BAAI/AltCLIP"
_CONFIG_FOR_DOC = "AltCLIPConfig"
ALTCLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALTCLIP_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
ALTCLIP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
@ -1076,12 +963,8 @@ class AltCLIPVisionEmbeddings(nn.Module):
return embeddings
@auto_docstring
class AltCLIPPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = AltCLIPConfig
base_model_prefix = "altclip"
supports_gradient_checkpointing = True
@ -1144,8 +1027,7 @@ class AltCLIPVisionTransformer(nn.Module):
self.encoder = AltCLIPEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1154,10 +1036,6 @@ class AltCLIPVisionTransformer(nn.Module):
return_dict: Optional[bool] = None,
interpolate_pos_encoding: Optional[bool] = False,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -1205,8 +1083,7 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel):
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1216,8 +1093,6 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```python
@ -1248,9 +1123,8 @@ class AltCLIPVisionModel(AltCLIPPreTrainedModel):
)
class AltRobertaModel(AltCLIPPreTrainedModel):
"""
@auto_docstring(
custom_intro="""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
@ -1261,13 +1135,17 @@ class AltRobertaModel(AltCLIPPreTrainedModel):
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
)
class AltRobertaModel(AltCLIPPreTrainedModel):
config_class = AltCLIPTextConfig
# Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->AltRoberta
def __init__(self, config, add_pooling_layer=True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.config = config
@ -1293,6 +1171,7 @@ class AltRobertaModel(AltCLIPPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@auto_docstring
# Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward
def forward(
self,
@ -1310,26 +1189,6 @@ class AltRobertaModel(AltCLIPPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -1444,8 +1303,7 @@ class AltCLIPTextModel(AltCLIPPreTrainedModel):
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
return super().resize_token_embeddings(new_num_tokens)
@add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndProjection, config_class=AltCLIPTextConfig)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1461,8 +1319,6 @@ class AltCLIPTextModel(AltCLIPPreTrainedModel):
output_hidden_states: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]:
r"""
Returns:
Examples:
```python
@ -1551,7 +1407,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING)
@auto_docstring
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1598,7 +1454,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
return text_features
@add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
@auto_docstring
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1646,8 +1502,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
return image_features
@add_start_docstrings_to_model_forward(ALTCLIP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AltCLIPOutput, config_class=AltCLIPConfig)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1662,7 +1517,8 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, AltCLIPOutput]:
r"""
Returns:
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
Examples:
View File
@ -18,12 +18,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
@ -71,23 +70,6 @@ def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> Li
return patches
def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution
scale_w = target_width / original_width
scale_h = target_height / original_height
if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)
return new_height, new_width
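The local `_get_patch_output_size` helper removed above is replaced at its call sites by the shared `get_patch_output_size` imported from `image_processing_utils`. For illustration only, a minimal standalone sketch of the aspect-ratio-preserving computation the removed helper performed, using a hypothetical `patch_output_size` function and made-up sizes rather than the library API:

```python
import math


def patch_output_size(original_size, target_resolution):
    """Fit (height, width) inside target_resolution while preserving the aspect ratio."""
    original_height, original_width = original_size
    target_height, target_width = target_resolution
    scale_w = target_width / original_width
    scale_h = target_height / original_height
    if scale_w < scale_h:
        # Width is the limiting dimension: take the full target width, scale the height.
        return min(math.ceil(original_height * scale_w), target_height), target_width
    # Height is the limiting dimension: take the full target height, scale the width.
    return target_height, min(math.ceil(original_width * scale_h), target_width)


# A 480x640 image fitted into a 336x336 patch keeps its 3:4 aspect ratio:
print(patch_output_size((480, 640), (336, 336)))  # (252, 336)
```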
class AriaImageProcessor(BaseImageProcessor):
"""
A vision processor for the Aria model that handles image preprocessing.
@ -375,7 +357,7 @@ class AriaImageProcessor(BaseImageProcessor):
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
@ -389,12 +371,12 @@ class AriaImageProcessor(BaseImageProcessor):
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)
padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))
return padded_image
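A small aside on the padding change above (a standalone sketch with made-up sizes, not the processor code): when the size difference is odd, halving with `// 2` on both sides leaves the result one pixel short of the target, while `divmod` pushes the remainder onto the second side so the padded size matches the target resolution exactly.

```python
# Made-up sizes: pad a 251-pixel-wide resize up to a 336-pixel-wide target.
target_width, new_width = 336, 251

half, remainder = divmod(target_width - new_width, 2)  # (42, 1)
left, right = half, half + remainder                   # 42 and 43
assert new_width + left + right == target_width        # exactly 336

# The previous behaviour padded 42 on each side and came up one pixel short:
assert new_width + 2 * ((target_width - new_width) // 2) == 335
```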
View File
@ -34,15 +34,13 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
LossKwargs,
add_start_docstrings,
add_start_docstrings_to_model_forward,
auto_docstring,
can_return_tuple,
is_torch_flex_attn_available,
logging,
replace_return_docstrings,
)
from ...utils.import_utils import is_torch_available
from ..auto import AutoModel, AutoModelForCausalLM
from ..auto import AutoModel
from .configuration_aria import AriaConfig, AriaTextConfig
@ -58,7 +56,6 @@ if is_torch_flex_attn_available():
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "AriaTextConfig"
@use_kernel_forward_from_hub("RMSNorm")
@ -654,12 +651,9 @@ class AriaTextDecoderLayer(GradientCheckpointingLayer):
return outputs
@auto_docstring
class AriaTextPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = AriaConfig
config_class = AriaTextConfig
base_model_prefix = "model"
_no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"]
supports_gradient_checkpointing = True
@ -667,6 +661,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
_supports_flash_attn_2 = False
_supports_sdpa = True
_supports_cache_class = True
_supports_attention_backend = True
def _init_weights(self, module):
std = self.config.initializer_range
@ -684,30 +679,10 @@ class AriaTextPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=std)
ARIA_TEXT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`AriaTextConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Aria Model outputting raw hidden-states without any specific head on top.",
ARIA_TEXT_START_DOCSTRING,
)
@auto_docstring
class AriaPreTrainedModel(PreTrainedModel):
config_class = AriaTextConfig
base_model_prefix = "model"
config_class = AriaConfig
base_model_prefix = ""
supports_gradient_checkpointing = True
_no_split_modules = ["AriaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
@ -770,88 +745,8 @@ class AriaTextRotaryEmbedding(nn.Module):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
ARIA_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
but you can also pass a `BlockMask` object directly here.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"The bare AriaText Model outputting raw hidden-states without any specific head on top.",
ARIA_TEXT_START_DOCSTRING,
)
@auto_docstring
class AriaTextModel(AriaTextPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AriaTextDecoderLayer`]
Args:
config: AriaTextConfig
"""
def __init__(self, config: AriaTextConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
@ -875,7 +770,7 @@ class AriaTextModel(AriaTextPreTrainedModel):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1097,22 +992,11 @@ class AriaTextModel(AriaTextPreTrainedModel):
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
@auto_docstring
class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
"""
Aria model for causal language modeling tasks.
This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach,
allowing for more efficient and scalable language modeling.
Args:
config (`AriaTextConfig`):
Configuration object for the model.
"""
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
config_class = AriaTextConfig
def __init__(self, config: AriaTextConfig):
super().__init__(config)
@ -1141,9 +1025,7 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
def get_decoder(self):
return self.model
@can_return_tuple
@add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1160,19 +1042,10 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
**kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
@ -1255,7 +1128,7 @@ class AriaCausalLMOutputWithPast(ModelOutput):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""
@ -1267,83 +1140,168 @@ class AriaCausalLMOutputWithPast(ModelOutput):
image_hidden_states: Optional[torch.FloatTensor] = None
ARIA_INPUTS_DOCSTRING = r"""
@dataclass
class AriaModelOutputWithPast(BaseModelOutputWithPast):
"""
Base class for Aria outputs, with hidden states and attentions.
Args:
input_ids (`torch.LongTensor`, *optional*):
Input token IDs.
pixel_values (`torch.FloatTensor`, *optional*):
Pixel values of the images.
pixel_mask (`torch.LongTensor`, *optional*):
Mask for the pixel values.
attention_mask (`torch.Tensor`, *optional*):
Attention mask.
position_ids (`torch.LongTensor`, *optional*):
Position IDs.
past_key_values (`List[torch.FloatTensor]`, *optional*):
Past key values for efficient processing.
inputs_embeds (`torch.FloatTensor`, *optional*):
Input embeddings.
labels (`torch.LongTensor`, *optional*):
Labels for computing the language modeling loss.
use_cache (`bool`, *optional*):
Whether to use the model's cache mechanism.
output_attentions (`bool`, *optional*):
Whether to output attention weights.
output_hidden_states (`bool`, *optional*):
Whether to output hidden states.
return_dict (`bool`, *optional*):
Whether to return a `ModelOutput` object.
logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
If an `int`, calculate logits for the last `logits_to_keep` tokens, or all `input_ids` if `0`.
Otherwise, slice according to the 1D tensor in the sequence length dimension
cache_position (`torch.LongTensor`, *optional*):
Cache positions.
**loss_kwargs:
Additional keyword arguments for loss calculation.
"""
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
ARIA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Parameters:
config (`AriaConfig`):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""
image_hidden_states: Optional[torch.FloatTensor] = None
@add_start_docstrings(
"""Aria model for conditional generation tasks.
This model combines a vision tower, a multi-modal projector, and a language model
to perform tasks that involve both image and text inputs.""",
ARIA_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The Aria model which consists of a vision backbone and a language model, without a language modeling head.
"""
)
class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
config_class = AriaConfig
_supports_flash_attn_2 = False
_supports_flex_attn = False
_supports_sdpa = False
_tied_weights_keys = ["language_model.lm_head.weight"]
class AriaModel(AriaPreTrainedModel):
_checkpoint_conversion_mapping = {"language_model.model": "language_model"}
def __init__(self, config: AriaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
self.multi_modal_projector = AriaProjector(config)
self.vocab_size = config.text_config.vocab_size
self.language_model = AutoModelForCausalLM.from_config(config.text_config)
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
self.language_model = AutoModel.from_config(config.text_config)
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def get_image_features(
self,
pixel_values: torch.FloatTensor,
pixel_mask: torch.FloatTensor = None,
vision_feature_layer: int = -1,
):
"""
Obtains image last hidden states from the vision tower and apply multimodal projection.
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
The tensors corresponding to the input images.
pixel_mask (`torch.FloatTensor`, *optional*):
The tensors corresponding to the input image mask.
vision_feature_layer (`Union[int, List[int]]`):
The index of the layer to select the vision feature. If multiple indices are provided,
the vision feature of the corresponding indices will be concatenated to form the
vision features.
Returns:
image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
"""
patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
image_outputs = self.vision_tower(
pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True
)
image_attn_mask = None
if patch_attention_mask is not None:
flattened_mask = patch_attention_mask.flatten(1)
image_attn_mask = torch.logical_not(flattened_mask)
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask)
return image_features
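As a side note on the mask handling above, a tiny sketch (made-up 2x2 patch grid, not model code) of how the boolean patch mask is flattened per image and inverted before being handed to the projector, which expects `True` where attention should be suppressed:

```python
import torch

# One image with a 2x2 patch grid; True marks real (non-padded) patches.
patch_attention_mask = torch.tensor([[[True, True],
                                      [True, False]]])

flattened_mask = patch_attention_mask.flatten(1)     # shape (num_images, num_patches)
image_attn_mask = torch.logical_not(flattened_mask)  # True where a patch should be ignored

print(image_attn_mask)  # tensor([[False, False, False,  True]])
```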
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, AriaModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(
pixel_values=pixel_values,
pixel_mask=pixel_mask,
vision_feature_layer=self.config.vision_feature_layer,
)
n_images, n_features_per_image = image_features.shape[0], image_features.shape[1]
n_image_features = n_images * n_features_per_image
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
cache_position=cache_position,
**kwargs,
)
return AriaModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values if use_cache else None,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
)
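A toy illustration of the merge step above (tiny made-up tensors and a hypothetical `image_token_id`, not the model's real vocabulary): placeholder positions are located in `input_ids`, the mask is broadcast over the hidden dimension, and `masked_scatter` writes the projected image features into exactly those slots.

```python
import torch

image_token_id = 9  # hypothetical placeholder id
hidden_size = 4

input_ids = torch.tensor([[1, 9, 9, 2]])       # two image slots in a 4-token sequence
inputs_embeds = torch.zeros(1, 4, hidden_size)
image_features = torch.ones(2, hidden_size)    # one projected feature row per image token

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

print(inputs_embeds[0, :, 0])  # tensor([0., 1., 1., 0.]) -> only placeholder positions were overwritten
```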
def _create_patch_attention_mask(self, pixel_mask):
if pixel_mask is None:
return None
@ -1360,51 +1318,62 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
)
return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
@auto_docstring(
custom_intro="""
Aria model for conditional generation tasks.
This model combines a vision tower, a multi-modal projector, and a language model
to perform tasks that involve both image and text inputs.
"""
)
class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
_checkpoint_conversion_mapping = {
"^language_model.model": "model.language_model",
"^vision_tower": "model.vision_tower",
"^multi_modal_projector": "model.multi_modal_projector",
"^language_model.lm_head": "lm_head",
}
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: AriaConfig):
super().__init__(config)
self.model = AriaModel(config)
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
return self.model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
self.model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.language_model.get_output_embeddings()
def get_output_embeddings(self) -> nn.Module:
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.language_model.set_decoder(decoder)
# Make modules available through the conditional class for BC
@property
def language_model(self):
return self.model.language_model
def get_decoder(self):
return self.language_model.get_decoder()
@property
def vision_tower(self):
return self.model.vision_tower
def get_image_features(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
vision_feature_layer: int = -1,
):
patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
image_outputs = self.vision_tower(
pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True
)
image_attn_mask = None
if patch_attention_mask is not None:
flattened_mask = patch_attention_mask.flatten(1)
image_attn_mask = torch.logical_not(flattened_mask)
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask)
return image_features
@property
def multi_modal_projector(self):
return self.model.multi_modal_projector
@can_return_tuple
@add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_mask: Optional[torch.LongTensor] = None,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
@ -1413,17 +1382,17 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
cache_position: Optional[torch.LongTensor] = None,
**loss_kwargs,
) -> AriaCausalLMOutputWithPast:
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, AriaCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
@ -1482,37 +1451,12 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(
pixel_values=pixel_values,
pixel_mask=pixel_mask,
vision_feature_layer=self.config.vision_feature_layer,
)
n_images, n_features_per_image = image_features.shape[0], image_features.shape[1]
n_image_features = n_images * n_features_per_image
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs: CausalLMOutputWithPast = self.language_model(
outputs = self.model(
input_ids=input_ids,
pixel_values=pixel_values,
pixel_mask=pixel_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
@ -1520,16 +1464,20 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
logits_to_keep=logits_to_keep,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
logits = outputs.logits
hidden_states = outputs[0]
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = self.lm_head(hidden_states[:, slice_indices, :])
loss = None
if labels is not None:
loss = self.loss_function(
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
)
return AriaCausalLMOutputWithPast(
@ -1552,7 +1500,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
logits_to_keep=None,
**kwargs,
):
model_inputs = self.language_model.prepare_inputs_for_generation(
model_inputs = super().prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
@ -1570,11 +1518,67 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
return model_inputs
@staticmethod
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
target_length: int,
dtype: torch.dtype,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
Args:
attention_mask (`torch.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
`(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache,
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`int`):
Batch size.
"""
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
causal_mask.device
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
return causal_mask
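For intuition, a standalone sketch of the core mask construction above with made-up sizes (three new tokens appended after two cached ones, no 2D padding mask); positions at or before each query's cache position stay 0, later positions get the dtype minimum:

```python
import torch

dtype = torch.float32
min_dtype = torch.finfo(dtype).min
sequence_length, target_length = 3, 5
cache_position = torch.tensor([2, 3, 4])  # absolute positions of the three new tokens

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)

print((causal_mask == 0).int())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])  -> each token sees the cache and itself, never the future
```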
__all__ = [
"AriaForConditionalGeneration",
"AriaPreTrainedModel",
"AriaTextPreTrainedModel",
"AriaTextModel",
"AriaModel",
"AriaTextForCausalLM",
]
View File
@ -12,15 +12,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...activations import ACT2FN
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
@ -34,23 +32,13 @@ from ...image_utils import (
valid_images,
validate_preprocess_arguments,
)
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import PreTrainedModel
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import (
PreTokenizedInput,
TextInput,
)
from ...utils import (
TensorType,
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
logging,
replace_return_docstrings,
)
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import LossKwargs, TensorType, auto_docstring, can_return_tuple, logging
from ...utils.import_utils import is_torch_available
from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
from ..llama.configuration_llama import LlamaConfig
from ..llama.modeling_llama import (
LlamaDecoderLayer,
@ -60,7 +48,12 @@ from ..llama.modeling_llama import (
LlamaPreTrainedModel,
LlamaRMSNorm,
)
from ..llava.modeling_llava import LlavaCausalLMOutputWithPast
from ..llava.modeling_llava import (
LlavaCausalLMOutputWithPast,
LlavaForConditionalGeneration,
LlavaModel,
LlavaModelOutputWithPast,
)
from ..llava_next.image_processing_llava_next import divide_to_patches
@ -461,23 +454,6 @@ class AriaProjector(nn.Module):
return out
def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution
scale_w = target_width / original_width
scale_h = target_height / original_height
if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)
return new_height, new_width
class AriaImageProcessor(BaseImageProcessor):
"""
A vision processor for the Aria model that handles image preprocessing.
@ -765,7 +741,7 @@ class AriaImageProcessor(BaseImageProcessor):
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
@ -779,12 +755,12 @@ class AriaImageProcessor(BaseImageProcessor):
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)
padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))
return padded_image
@ -1236,12 +1212,9 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
self.mlp = AriaTextMoELayer(config)
@auto_docstring
class AriaTextPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = AriaConfig
config_class = AriaTextConfig
base_model_prefix = "model"
_no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"]
supports_gradient_checkpointing = True
@ -1249,6 +1222,7 @@ class AriaTextPreTrainedModel(PreTrainedModel):
_supports_flash_attn_2 = False
_supports_sdpa = True
_supports_cache_class = True
_supports_attention_backend = True
def _init_weights(self, module):
std = self.config.initializer_range
@ -1267,6 +1241,8 @@ class AriaTextPreTrainedModel(PreTrainedModel):
class AriaPreTrainedModel(LlamaPreTrainedModel):
config_class = AriaConfig
base_model_prefix = ""
_supports_static_cache = False # MoE models don't work with torch.compile (dynamic slicing)
_supports_attention_backend = False
@ -1297,20 +1273,11 @@ class AriaTextModel(LlamaModel):
self.post_init()
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
"""
Aria model for causal language modeling tasks.
This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach,
allowing for more efficient and scalable language modeling.
Args:
config (`AriaTextConfig`):
Configuration object for the model.
"""
_tied_weights_keys = ["lm_head.weight"]
config_class = AriaTextConfig
def __init__(self, config: AriaTextConfig):
super().__init__(config)
@ -1321,87 +1288,23 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
# Initialize weights and apply final processing
self.post_init()
@auto_docstring
def forward(self, **super_kwargs):
super().forward(self, **super_kwargs)
class AriaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
pass
ARIA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor`, *optional*):
Input token IDs.
pixel_values (`torch.FloatTensor`, *optional*):
Pixel values of the images.
pixel_mask (`torch.LongTensor`, *optional*):
Mask for the pixel values.
attention_mask (`torch.Tensor`, *optional*):
Attention mask.
position_ids (`torch.LongTensor`, *optional*):
Position IDs.
past_key_values (`List[torch.FloatTensor]`, *optional*):
Past key values for efficient processing.
inputs_embeds (`torch.FloatTensor`, *optional*):
Input embeddings.
labels (`torch.LongTensor`, *optional*):
Labels for computing the language modeling loss.
use_cache (`bool`, *optional*):
Whether to use the model's cache mechanism.
output_attentions (`bool`, *optional*):
Whether to output attention weights.
output_hidden_states (`bool`, *optional*):
Whether to output hidden states.
return_dict (`bool`, *optional*):
Whether to return a `ModelOutput` object.
logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
If an `int`, calculate logits for the last `logits_to_keep` tokens, or all `input_ids` if `0`.
Otherwise, slice according to the 1D tensor in the sequence length dimension
cache_position (`torch.LongTensor`, *optional*):
Cache positions.
**loss_kwargs:
Additional keyword arguments for loss calculation.
"""
ARIA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config (`AriaConfig`):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class AriaModelOutputWithPast(LlavaModelOutputWithPast):
pass
@add_start_docstrings(
"""Aria model for conditional generation tasks.
This model combines a vision tower, a multi-modal projector, and a language model
to perform tasks that involve both image and text inputs.""",
ARIA_START_DOCSTRING,
)
class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
config_class = AriaConfig
_supports_flash_attn_2 = False
_supports_flex_attn = False
_supports_sdpa = False
_tied_weights_keys = ["language_model.lm_head.weight"]
class AriaModel(LlavaModel):
def __init__(self, config: AriaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
self.multi_modal_projector = AriaProjector(config)
self.vocab_size = config.text_config.vocab_size
self.language_model = AutoModelForCausalLM.from_config(config.text_config)
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
self.post_init()
def _create_patch_attention_mask(self, pixel_mask):
if pixel_mask is None:
@ -1419,30 +1322,27 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
)
return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.language_model.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def set_decoder(self, decoder):
self.language_model.set_decoder(decoder)
def get_decoder(self):
return self.language_model.get_decoder()
def get_image_features(
self,
pixel_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
pixel_mask: torch.FloatTensor = None,
vision_feature_layer: int = -1,
):
"""
Obtains image last hidden states from the vision tower and apply multimodal projection.
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
The tensors corresponding to the input images.
pixel_mask (`torch.FloatTensor`, *optional*):
The tensors corresponding to the input image mask.
vision_feature_layer (`Union[int, List[int]]`):
The index of the layer to select the vision feature. If multiple indices are provided,
the vision feature of the corresponding indices will be concatenated to form the
vision features.
Returns:
image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
"""
patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
image_outputs = self.vision_tower(
pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True
@ -1456,14 +1356,95 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask)
return image_features
@can_return_tuple
@add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
pixel_mask: Optional[torch.LongTensor] = None,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, AriaModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(
pixel_values=pixel_values,
pixel_mask=pixel_mask,
vision_feature_layer=self.config.vision_feature_layer,
)
n_images, n_features_per_image = image_features.shape[0], image_features.shape[1]
n_image_features = n_images * n_features_per_image
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
cache_position=cache_position,
**kwargs,
)
return AriaModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values if use_cache else None,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
)
@auto_docstring(
custom_intro="""
Aria model for conditional generation tasks.
This model combines a vision tower, a multi-modal projector, and a language model
to perform tasks that involve both image and text inputs.
"""
)
class AriaForConditionalGeneration(LlavaForConditionalGeneration):
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
pixel_mask: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
@ -1472,17 +1453,17 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
cache_position: Optional[torch.LongTensor] = None,
**loss_kwargs,
) -> AriaCausalLMOutputWithPast:
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, AriaCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
@ -1541,37 +1522,12 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and inputs_embeds.shape[1] != 1:
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
image_embeds = input_ids == self.config.image_token_id
special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0)
image_features = self.get_image_features(
pixel_values=pixel_values,
pixel_mask=pixel_mask,
vision_feature_layer=self.config.vision_feature_layer,
)
n_images, n_features_per_image = image_features.shape[0], image_features.shape[1]
n_image_features = n_images * n_features_per_image
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs: CausalLMOutputWithPast = self.language_model(
outputs = self.model(
input_ids=input_ids,
pixel_values=pixel_values,
pixel_mask=pixel_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
@ -1579,16 +1535,20 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
logits_to_keep=logits_to_keep,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
logits = outputs.logits
hidden_states = outputs[0]
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = self.lm_head(hidden_states[:, slice_indices, :])
loss = None
if labels is not None:
loss = self.loss_function(
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
)
return AriaCausalLMOutputWithPast(
@ -1611,7 +1571,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
logits_to_keep=None,
**kwargs,
):
model_inputs = self.language_model.prepare_inputs_for_generation(
model_inputs = super().prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
@ -1639,5 +1599,6 @@ __all__ = [
"AriaPreTrainedModel",
"AriaTextPreTrainedModel",
"AriaTextModel",
"AriaModel",
"AriaTextForCausalLM",
]
View File
@ -25,24 +25,12 @@ from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from ...utils import auto_docstring, logging
from .configuration_audio_spectrogram_transformer import ASTConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "ASTConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "MIT/ast-finetuned-audioset-10-10-0.4593"
_EXPECTED_OUTPUT_SHAPE = [1, 1214, 768]
# Audio classification docstring
_SEQ_CLASS_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
_SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'"
_SEQ_CLASS_EXPECTED_LOSS = 0.17
class ASTEmbeddings(nn.Module):
"""
@ -388,12 +376,8 @@ class ASTEncoder(nn.Module):
)
@auto_docstring
class ASTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ASTConfig
base_model_prefix = "audio_spectrogram_transformer"
main_input_name = "input_values"
@ -420,48 +404,7 @@ class ASTPreTrainedModel(PreTrainedModel):
module.distillation_token.data.zero_()
AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`ASTConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare AST Model transformer outputting raw hidden-states without any specific head on top.",
AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING,
)
@auto_docstring
class ASTModel(ASTPreTrainedModel):
def __init__(self, config: ASTConfig) -> None:
super().__init__(config)
@ -486,14 +429,7 @@ class ASTModel(ASTPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
@auto_docstring
def forward(
self,
input_values: Optional[torch.Tensor] = None,
@ -502,6 +438,14 @@ class ASTModel(ASTPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -555,12 +499,11 @@ class ASTMLPHead(nn.Module):
return hidden_state
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled
output) e.g. for datasets like AudioSet, Speech Commands v2.
""",
AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING,
"""
)
class ASTForAudioClassification(ASTPreTrainedModel):
def __init__(self, config: ASTConfig) -> None:
@ -575,15 +518,7 @@ class ASTForAudioClassification(ASTPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
@auto_docstring
def forward(
self,
input_values: Optional[torch.Tensor] = None,
@ -594,6 +529,12 @@ class ASTForAudioClassification(ASTPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[tuple, SequenceClassifierOutput]:
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the audio classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

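The `input_values` docstrings above describe the mel-feature pipeline only in prose. As a quick orientation, here is a minimal sketch of how those pieces typically fit together, using the `MIT/ast-finetuned-audioset-10-10-0.4593` checkpoint already named in the docstring constants; the zero waveform below is a stand-in for audio loaded with soundfile.

```python
import numpy as np
import torch
from transformers import ASTForAudioClassification, AutoFeatureExtractor

checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = ASTForAudioClassification.from_pretrained(checkpoint)

# One second of silence at 16 kHz stands in for a waveform loaded via soundfile.
waveform = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (batch_size, config.num_labels)
predicted_class = model.config.id2label[logits.argmax(-1).item()]
```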

@ -27,6 +27,7 @@ if TYPE_CHECKING:
from .modeling_tf_auto import *
from .processing_auto import *
from .tokenization_auto import *
from .video_processing_auto import *
else:
import sys

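The new wildcard import above wires `video_processing_auto` into the lazy `auto` module, so the class introduced later in this diff becomes importable through the auto subpackage (and, assuming the top-level `__init__` re-exports it as this PR implies, from the package root as well). A small sketch:

```python
from transformers.models.auto import AutoVideoProcessor

video_processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
```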

@ -82,6 +82,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("convnext", "ConvNextConfig"),
("convnextv2", "ConvNextV2Config"),
("cpmant", "CpmAntConfig"),
("csm", "CsmConfig"),
("ctrl", "CTRLConfig"),
("cvt", "CvtConfig"),
("d_fine", "DFineConfig"),
@ -148,6 +149,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("granite", "GraniteConfig"),
("granite_speech", "GraniteSpeechConfig"),
("granitemoe", "GraniteMoeConfig"),
("granitemoehybrid", "GraniteMoeHybridConfig"),
("granitemoeshared", "GraniteMoeSharedConfig"),
("granitevision", "LlavaNextConfig"),
("graphormer", "GraphormerConfig"),
@ -440,6 +442,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("convnextv2", "ConvNeXTV2"),
("cpm", "CPM"),
("cpmant", "CPM-Ant"),
("csm", "CSM"),
("ctrl", "CTRL"),
("cvt", "CvT"),
("d_fine", "D-FINE"),
@ -513,6 +516,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("granite", "Granite"),
("granite_speech", "GraniteSpeech"),
("granitemoe", "GraniteMoeMoe"),
("granitemoehybrid", "GraniteMoeHybrid"),
("granitemoeshared", "GraniteMoeSharedMoe"),
("granitevision", "LLaVA-NeXT"),
("graphormer", "Graphormer"),

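The `csm` and `granitemoehybrid` entries above only register names; resolution still goes through `AutoConfig`'s `model_type` lookup. A minimal sketch of the round trip, assuming `CsmConfig` is exported at the top level and instantiates with its defaults, and using a hypothetical local directory:

```python
from transformers import AutoConfig, CsmConfig

config = CsmConfig()                    # model_type == "csm"
config.save_pretrained("./csm-config")  # hypothetical local path; writes config.json
reloaded = AutoConfig.from_pretrained("./csm-config")
assert isinstance(reloaded, CsmConfig)
```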

@ -59,8 +59,8 @@ else:
("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
("aria", ("AriaImageProcessor",)),
("beit", ("BeitImageProcessor",)),
("aria", ("AriaImageProcessor")),
("beit", ("BeitImageProcessor", "BeitImageProcessorFast")),
("bit", ("BitImageProcessor", "BitImageProcessorFast")),
("blip", ("BlipImageProcessor", "BlipImageProcessorFast")),
("blip-2", ("BlipImageProcessor", "BlipImageProcessorFast")),
@ -73,7 +73,7 @@ else:
("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("data2vec-vision", ("BeitImageProcessor",)),
("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
("depth_anything", ("DPTImageProcessor",)),
@ -130,7 +130,7 @@ else:
("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")),
("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")),
("phi4_multimodal", "Phi4MultimodalImageProcessorFast"),
("phi4_multimodal", ("Phi4MultimodalImageProcessorFast",)),
("pix2struct", ("Pix2StructImageProcessor",)),
("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")),
@ -152,7 +152,7 @@ else:
("superglue", ("SuperGlueImageProcessor",)),
("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
("swin2sr", ("Swin2SRImageProcessor",)),
("swin2sr", ("Swin2SRImageProcessor", "Swin2SRImageProcessorFast")),
("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
("table-transformer", ("DetrImageProcessor",)),
("timesformer", ("VideoMAEImageProcessor",)),

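With `BeitImageProcessorFast` and `Swin2SRImageProcessorFast` registered above, the torchvision-backed variants become reachable through the auto class via `use_fast=True`. A sketch assuming a standard BEiT checkpoint:

```python
from transformers import AutoImageProcessor

# use_fast=True picks the *ImageProcessorFast entry from the mapping above.
image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224", use_fast=True)
```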

@ -37,10 +37,11 @@ MODEL_MAPPING_NAMES = OrderedDict(
("albert", "AlbertModel"),
("align", "AlignModel"),
("altclip", "AltCLIPModel"),
("aria", "AriaForConditionalGeneration"),
("aria", "AriaModel"),
("aria_text", "AriaTextModel"),
("audio-spectrogram-transformer", "ASTModel"),
("autoformer", "AutoformerModel"),
("aya_vision", "AyaVisionModel"),
("bamba", "BambaModel"),
("bark", "BarkModel"),
("bart", "BartModel"),
@ -80,6 +81,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("convnext", "ConvNextModel"),
("convnextv2", "ConvNextV2Model"),
("cpmant", "CpmAntModel"),
("csm", "CsmForConditionalGeneration"),
("ctrl", "CTRLModel"),
("cvt", "CvtModel"),
("d_fine", "DFineModel"),
@ -109,6 +111,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("efficientformer", "EfficientFormerModel"),
("efficientnet", "EfficientNetModel"),
("electra", "ElectraModel"),
("emu3", "Emu3Model"),
("encodec", "EncodecModel"),
("ernie", "ErnieModel"),
("ernie_m", "ErnieMModel"),
@ -122,14 +125,16 @@ MODEL_MAPPING_NAMES = OrderedDict(
("focalnet", "FocalNetModel"),
("fsmt", "FSMTModel"),
("funnel", ("FunnelModel", "FunnelBaseModel")),
("fuyu", "FuyuModel"),
("gemma", "GemmaModel"),
("gemma2", "Gemma2Model"),
("gemma3", "Gemma3Model"),
("gemma3_text", "Gemma3TextModel"),
("git", "GitModel"),
("glm", "GlmModel"),
("glm4", "Glm4Model"),
("glpn", "GLPNModel"),
("got_ocr2", "GotOcr2ForConditionalGeneration"),
("got_ocr2", "GotOcr2Model"),
("gpt-sw3", "GPT2Model"),
("gpt2", "GPT2Model"),
("gpt_bigcode", "GPTBigCodeModel"),
@ -140,6 +145,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("granite", "GraniteModel"),
("granitemoe", "GraniteMoeModel"),
("granitemoehybrid", "GraniteMoeHybridModel"),
("granitemoeshared", "GraniteMoeSharedModel"),
("graphormer", "GraphormerModel"),
("grounding-dino", "GroundingDinoModel"),
@ -156,6 +162,9 @@ MODEL_MAPPING_NAMES = OrderedDict(
("ijepa", "IJepaModel"),
("imagegpt", "ImageGPTModel"),
("informer", "InformerModel"),
("instructblip", "InstructBlipModel"),
("instructblipvideo", "InstructBlipVideoModel"),
("internvl", "InternVLModel"),
("internvl_vision", "InternVLVisionModel"),
("jamba", "JambaModel"),
("janus", "JanusModel"),
@ -170,6 +179,10 @@ MODEL_MAPPING_NAMES = OrderedDict(
("lilt", "LiltModel"),
("llama", "LlamaModel"),
("llama4", "Llama4ForConditionalGeneration"),
("llava", "LlavaModel"),
("llava_next", "LlavaNextModel"),
("llava_next_video", "LlavaNextVideoModel"),
("llava_onevision", "LlavaOnevisionModel"),
("longformer", "LongformerModel"),
("longt5", "LongT5Model"),
("luke", "LukeModel"),
@ -189,8 +202,10 @@ MODEL_MAPPING_NAMES = OrderedDict(
("mgp-str", "MgpstrForSceneTextRecognition"),
("mimi", "MimiModel"),
("mistral", "MistralModel"),
("mistral3", "Mistral3Model"),
("mixtral", "MixtralModel"),
("mlcd", "MLCDVisionModel"),
("mllama", "MllamaModel"),
("mobilebert", "MobileBertModel"),
("mobilenet_v1", "MobileNetV1Model"),
("mobilenet_v2", "MobileNetV2Model"),
@ -221,6 +236,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("opt", "OPTModel"),
("owlv2", "Owlv2Model"),
("owlvit", "OwlViTModel"),
("paligemma", "PaliGemmaModel"),
("patchtsmixer", "PatchTSMixerModel"),
("patchtst", "PatchTSTModel"),
("pegasus", "PegasusModel"),
@ -240,11 +256,11 @@ MODEL_MAPPING_NAMES = OrderedDict(
("qdqbert", "QDQBertModel"),
("qwen2", "Qwen2Model"),
("qwen2_5_vl", "Qwen2_5_VLModel"),
("qwen2_5_vl_text", "Qwen2_5_VLModel"),
("qwen2_5_vl_text", "Qwen2_5_VLTextModel"),
("qwen2_audio_encoder", "Qwen2AudioEncoder"),
("qwen2_moe", "Qwen2MoeModel"),
("qwen2_vl", "Qwen2VLModel"),
("qwen2_vl_text", "Qwen2VLModel"),
("qwen2_vl_text", "Qwen2VLTextModel"),
("qwen3", "Qwen3Model"),
("qwen3_moe", "Qwen3MoeModel"),
("recurrent_gemma", "RecurrentGemmaModel"),
@ -306,8 +322,10 @@ MODEL_MAPPING_NAMES = OrderedDict(
("unispeech-sat", "UniSpeechSatModel"),
("univnet", "UnivNetModel"),
("van", "VanModel"),
("video_llava", "VideoLlavaModel"),
("videomae", "VideoMAEModel"),
("vilt", "ViltModel"),
("vipllava", "VipLlavaModel"),
("vision-text-dual-encoder", "VisionTextDualEncoderModel"),
("visual_bert", "VisualBertModel"),
("vit", "ViTModel"),
@ -560,6 +578,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("gptj", "GPTJForCausalLM"),
("granite", "GraniteForCausalLM"),
("granitemoe", "GraniteMoeForCausalLM"),
("granitemoehybrid", "GraniteMoeHybridForCausalLM"),
("granitemoeshared", "GraniteMoeSharedForCausalLM"),
("helium", "HeliumForCausalLM"),
("jamba", "JambaForCausalLM"),
@ -879,6 +898,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("llama4", "Llama4ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mistral3", "Mistral3ForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
@ -1447,6 +1467,7 @@ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = OrderedDict(
[
# Model for Text-To-Waveform mapping
("bark", "BarkModel"),
("csm", "CsmForConditionalGeneration"),
("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"),
("musicgen", "MusicgenForConditionalGeneration"),
("musicgen_melody", "MusicgenMelodyForConditionalGeneration"),

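Several of the additions above (`LlavaModel`, `PaliGemmaModel`, `Emu3Model`, ...) register bare backbone classes so that `AutoModel` no longer falls through to the conditional-generation heads. A hedged sketch with a LLaVA checkpoint:

```python
from transformers import AutoModel, AutoModelForImageTextToText

# With ("llava", "LlavaModel") in MODEL_MAPPING_NAMES, AutoModel returns the bare model ...
backbone = AutoModel.from_pretrained("llava-hf/llava-1.5-7b-hf")
# ... while the generation head is still resolved through the image-text-to-text mapping.
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-1.5-7b-hf")
```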

@ -28,7 +28,14 @@ from ...feature_extraction_utils import FeatureExtractionMixin
from ...image_processing_utils import ImageProcessingMixin
from ...processing_utils import ProcessorMixin
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, cached_file, logging
from ...utils import (
FEATURE_EXTRACTOR_NAME,
PROCESSOR_NAME,
VIDEO_PROCESSOR_NAME,
cached_file,
logging,
)
from ...video_processing_utils import BaseVideoProcessor
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
CONFIG_MAPPING_NAMES,
@ -296,14 +303,31 @@ class AutoProcessor:
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
# If not found, let's check whether the processor class is saved in a feature extractor config
if preprocessor_config_file is not None and processor_class is None:
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
pretrained_model_name_or_path, **kwargs
# Saved as video processor
if preprocessor_config_file is None:
preprocessor_config_file = cached_file(
pretrained_model_name_or_path, VIDEO_PROCESSOR_NAME, **cached_file_kwargs
)
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
if preprocessor_config_file is not None:
config_dict, _ = BaseVideoProcessor.get_video_processor_dict(
pretrained_model_name_or_path, **kwargs
)
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
# Saved as feature extractor
if preprocessor_config_file is None:
preprocessor_config_file = cached_file(
pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **cached_file_kwargs
)
if preprocessor_config_file is not None and processor_class is None:
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
pretrained_model_name_or_path, **kwargs
)
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]
if processor_class is None:
# Next, let's check whether the processor class is saved in a tokenizer

View File

@ -0,0 +1,384 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AutoVideoProcessor class."""
import importlib
import json
import os
import warnings
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
# Build the list of all video processors
from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...utils import (
CONFIG_NAME,
VIDEO_PROCESSOR_NAME,
cached_file,
is_torchvision_available,
logging,
)
from ...utils.import_utils import requires
from ...video_processing_utils import BaseVideoProcessor
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
CONFIG_MAPPING_NAMES,
AutoConfig,
model_type_to_module_name,
replace_list_option_in_docstrings,
)
logger = logging.get_logger(__name__)
if TYPE_CHECKING:
# This significantly improves completion suggestion performance when
# the transformers package is used with Microsoft's Pylance language server.
VIDEO_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
[
("instructblipvideo", "InstructBlipVideoVideoProcessor"),
("llava_next_video", "LlavaNextVideoVideoProcessor"),
("llava_onevision", "LlavaOnevisionVideoProcessor"),
("qwen2_5_vl", "Qwen2_5_VLVideoProcessor"),
("qwen2_vl", "Qwen2VLVideoProcessor"),
("video_llava", "VideoLlavaVideoProcessor"),
]
)
for model_type, video_processors in VIDEO_PROCESSOR_MAPPING_NAMES.items():
fast_video_processor_class = video_processors
# If torchvision is not available, set the video processor class to None
if not is_torchvision_available():
fast_video_processor_class = None
VIDEO_PROCESSOR_MAPPING_NAMES[model_type] = fast_video_processor_class
VIDEO_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, VIDEO_PROCESSOR_MAPPING_NAMES)
def video_processor_class_from_name(class_name: str):
for module_name, extractors in VIDEO_PROCESSOR_MAPPING_NAMES.items():
if class_name in extractors:
module_name = model_type_to_module_name(module_name)
module = importlib.import_module(f".{module_name}", "transformers.models")
try:
return getattr(module, class_name)
except AttributeError:
continue
for _, extractor in VIDEO_PROCESSOR_MAPPING._extra_content.items():
if getattr(extractor, "__name__", None) == class_name:
return extractor
# We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
# init and we return the proper dummy to get an appropriate error message.
main_module = importlib.import_module("transformers")
if hasattr(main_module, class_name):
return getattr(main_module, class_name)
return None
def get_video_processor_config(
pretrained_model_name_or_path: Union[str, os.PathLike],
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
local_files_only: bool = False,
**kwargs,
):
"""
Loads the video processor configuration from a pretrained model video processor configuration.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained model configuration hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a configuration file saved using the
[`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if they
exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
local_files_only (`bool`, *optional*, defaults to `False`):
If `True`, will only try to load the video processor configuration from local files.
<Tip>
Passing `token=True` is required when you want to use a private model.
</Tip>
Returns:
`Dict`: The configuration of the video processor.
Examples:
```python
# Download configuration from huggingface.co and cache.
video_processor_config = get_video_processor_config("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
# This model does not have a video processor config so the result will be an empty dict.
video_processor_config = get_video_processor_config("FacebookAI/xlm-roberta-base")
# Save a pretrained video processor locally and you can reload its config
from transformers import AutoVideoProcessor
video_processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
video_processor.save_pretrained("video-processor-test")
video_processor_config = get_video_processor_config("video-processor-test")
```"""
use_auth_token = kwargs.pop("use_auth_token", None)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if token is not None:
raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
token = use_auth_token
resolved_config_file = cached_file(
pretrained_model_name_or_path,
VIDEO_PROCESSOR_NAME,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
token=token,
revision=revision,
local_files_only=local_files_only,
)
if resolved_config_file is None:
logger.info(
"Could not locate the video processor configuration file, will try to use the model config instead."
)
return {}
with open(resolved_config_file, encoding="utf-8") as reader:
return json.load(reader)
@requires(backends=("vision", "torchvision"))
class AutoVideoProcessor:
r"""
This is a generic video processor class that will be instantiated as one of the video processor classes of the
library when created with the [`AutoVideoProcessor.from_pretrained`] class method.
This class cannot be instantiated directly using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"AutoVideoProcessor is designed to be instantiated "
"using the `AutoVideoProcessor.from_pretrained(pretrained_model_name_or_path)` method."
)
@classmethod
@replace_list_option_in_docstrings(VIDEO_PROCESSOR_MAPPING_NAMES)
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
r"""
Instantiate one of the video processor classes of the library from a pretrained model video processor configuration.
The video processor class to instantiate is selected based on the `model_type` property of the config object
(either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
Params:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained video_processor hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a video processor file saved using the
[`~video_processing_utils.BaseVideoProcessor.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved video processor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model video processor should be cached if the
standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the video processor files and override the cached versions if
they exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
If `False`, then this function returns just the final video processor object. If `True`, then this
function returns a `Tuple(video_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not video processor attributes: i.e., the part of
`kwargs` which has not been used to update `video_processor` and is otherwise ignored.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are video processor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* video processor attributes is
controlled by the `return_unused_kwargs` keyword parameter.
<Tip>
Passing `token=True` is required when you want to use a private model.
</Tip>
Examples:
```python
>>> from transformers import AutoVideoProcessor
>>> # Download video processor from huggingface.co and cache.
>>> video_processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
>>> # If video processor files are in a directory (e.g. video processor was saved using *save_pretrained('./test/saved_model/')*)
>>> # video_processor = AutoVideoProcessor.from_pretrained("./test/saved_model/")
```"""
use_auth_token = kwargs.pop("use_auth_token", None)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if kwargs.get("token", None) is not None:
raise ValueError(
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
)
kwargs["token"] = use_auth_token
config = kwargs.pop("config", None)
trust_remote_code = kwargs.pop("trust_remote_code", None)
kwargs["_from_auto"] = True
config_dict, _ = BaseVideoProcessor.get_video_processor_dict(pretrained_model_name_or_path, **kwargs)
video_processor_class = config_dict.get("video_processor_type", None)
video_processor_auto_map = None
if "AutoVideoProcessor" in config_dict.get("auto_map", {}):
video_processor_auto_map = config_dict["auto_map"]["AutoVideoProcessor"]
# If we still don't have the video processor class, check if we're loading from a previous feature extractor config
# and if so, infer the video processor class from there.
if video_processor_class is None and video_processor_auto_map is None:
feature_extractor_class = config_dict.pop("feature_extractor_type", None)
if feature_extractor_class is not None:
video_processor_class = feature_extractor_class.replace("FeatureExtractor", "VideoProcessor")
if "AutoFeatureExtractor" in config_dict.get("auto_map", {}):
feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"]
video_processor_auto_map = feature_extractor_auto_map.replace("FeatureExtractor", "VideoProcessor")
# If we don't find the video processor class in the video processor config, let's try the model config.
if video_processor_class is None and video_processor_auto_map is None:
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
# It could be in `config.video_processor_type`
video_processor_class = getattr(config, "video_processor_type", None)
if hasattr(config, "auto_map") and "AutoVideoProcessor" in config.auto_map:
video_processor_auto_map = config.auto_map["AutoVideoProcessor"]
if video_processor_class is not None:
video_processor_class = video_processor_class_from_name(video_processor_class)
has_remote_code = video_processor_auto_map is not None
has_local_code = video_processor_class is not None or type(config) in VIDEO_PROCESSOR_MAPPING
trust_remote_code = resolve_trust_remote_code(
trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
)
if has_remote_code and trust_remote_code:
class_ref = video_processor_auto_map
video_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
_ = kwargs.pop("code_revision", None)
if os.path.isdir(pretrained_model_name_or_path):
video_processor_class.register_for_auto_class()
return video_processor_class.from_dict(config_dict, **kwargs)
elif video_processor_class is not None:
return video_processor_class.from_dict(config_dict, **kwargs)
# Last try: we use the VIDEO_PROCESSOR_MAPPING.
elif type(config) in VIDEO_PROCESSOR_MAPPING:
video_processor_class = VIDEO_PROCESSOR_MAPPING[type(config)]
if video_processor_class is not None:
return video_processor_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
else:
raise ValueError(
"This video processor cannot be instantiated. Please make sure you have `torchvision` installed."
)
raise ValueError(
f"Unrecognized video processor in {pretrained_model_name_or_path}. Should have a "
f"`video_processor_type` key in its {VIDEO_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following "
f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in VIDEO_PROCESSOR_MAPPING_NAMES.keys())}"
)
@staticmethod
def register(
config_class,
video_processor_class,
exist_ok=False,
):
"""
Register a new video processor for this class.
Args:
config_class ([`PretrainedConfig`]):
The configuration corresponding to the model to register.
video_processor_class ([`BaseVideoProcessor`]):
The video processor to register.
"""
VIDEO_PROCESSOR_MAPPING.register(config_class, video_processor_class, exist_ok=exist_ok)
__all__ = ["VIDEO_PROCESSOR_MAPPING", "AutoVideoProcessor"]

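Registering a third-party video processor follows the same pattern as the other auto classes. A minimal sketch with hypothetical classes (all names below are invented for illustration; the processor body is elided):

```python
from transformers import AutoConfig, PretrainedConfig
from transformers.models.auto.video_processing_auto import AutoVideoProcessor
from transformers.video_processing_utils import BaseVideoProcessor

class MyVideoConfig(PretrainedConfig):       # hypothetical config class
    model_type = "my-video-model"

class MyVideoProcessor(BaseVideoProcessor):  # hypothetical processor; implementation elided
    pass

AutoConfig.register("my-video-model", MyVideoConfig)
AutoVideoProcessor.register(MyVideoConfig, MyVideoProcessor)
```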

@ -27,22 +27,15 @@ from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
ModelOutput,
SampleTSPredictionOutput,
Seq2SeqTSPredictionOutput,
)
from ...modeling_outputs import BaseModelOutput, ModelOutput, SampleTSPredictionOutput, Seq2SeqTSPredictionOutput
from ...modeling_utils import PreTrainedModel
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from ...utils import auto_docstring, logging
from .configuration_autoformer import AutoformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "AutoformerConfig"
@dataclass
class AutoFormerDecoderOutput(ModelOutput):
@ -888,6 +881,7 @@ class AutoformerDecoderLayer(nn.Module):
return outputs
@auto_docstring
class AutoformerPreTrainedModel(PreTrainedModel):
config_class = AutoformerConfig
base_model_prefix = "model"
@ -908,154 +902,6 @@ class AutoformerPreTrainedModel(PreTrainedModel):
module.weight.data[module.padding_idx].zero_()
AUTOFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`AutoformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
AUTOFORMER_INPUTS_DOCSTRING = r"""
Args:
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Past values of the time series, that serve as context in order to predict the future. These values may
contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
`static_categorical_features`, `static_real_features`, `past_time_features`).
The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
Missing values need to be replaced with zeros.
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `past_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional time features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
`[0, 1]`:
- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
Optional static categorical features for which the model will learn an embedding, which it will add to the
values of the time series.
Static categorical features are features which have the same value for all time steps (static over time).
A typical example of a static categorical feature is a time series ID.
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
Optional static real features which the model will add to the values of the time series.
Static real features are features which have the same value for all time steps (static over time).
A typical example of a static real feature is promotion information.
future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
Future values of the time series, that serve as labels for the model. The `future_values` is what the
Transformer needs to learn to output, given the `past_values`.
See the demo notebook and code snippets for details.
Missing values need to be replaced with zeros.
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `future_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to
make sure the model can only look at previous inputs in order to predict the future.
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
class AutoformerEncoder(AutoformerPreTrainedModel):
"""
@ -1417,10 +1263,7 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
)
@add_start_docstrings(
"The bare Autoformer Model outputting raw hidden-states without any specific head on top.",
AUTOFORMER_START_DOCSTRING,
)
@auto_docstring
class AutoformerModel(AutoformerPreTrainedModel):
def __init__(self, config: AutoformerConfig):
super().__init__(config)
@ -1595,8 +1438,7 @@ class AutoformerModel(AutoformerPreTrainedModel):
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AutoformerModelOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
past_values: torch.Tensor,
@ -1618,7 +1460,74 @@ class AutoformerModel(AutoformerPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[AutoformerModelOutput, Tuple]:
r"""
Returns:
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Past values of the time series, that serve as context in order to predict the future. These values may
contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
`static_categorical_features`, `static_real_features`, `past_time_features`).
The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
Missing values need to be replaced with zeros.
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `past_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional time features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
`[0, 1]`:
- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
Optional static categorical features for which the model will learn an embedding, which it will add to the
values of the time series.
Static categorical features are features which have the same value for all time steps (static over time).
A typical example of a static categorical feature is a time series ID.
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
Optional static real features which the model will add to the values of the time series.
Static real features are features which have the same value for all time steps (static over time).
A typical example of a static real feature is promotion information.
future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
Future values of the time series, that serve as labels for the model. The `future_values` is what the
Transformer needs to learn to output, given the `past_values`.
See the demo notebook and code snippets for details.
Missing values need to be replaced with zeros.
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `future_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
Examples:
@ -1753,10 +1662,7 @@ class AutoformerModel(AutoformerPreTrainedModel):
)
@add_start_docstrings(
"The Autoformer Model with a distribution head on top for time-series forecasting.",
AUTOFORMER_START_DOCSTRING,
)
@auto_docstring
class AutoformerForPrediction(AutoformerPreTrainedModel):
def __init__(self, config: AutoformerConfig):
super().__init__(config)
@ -1797,8 +1703,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
sliced_params = [p[:, -trailing_n:] for p in params]
return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
@add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqTSPredictionOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
past_values: torch.Tensor,
@ -1821,7 +1726,82 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Seq2SeqTSPredictionOutput, Tuple]:
r"""
Returns:
past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Past values of the time series, that serve as context in order to predict the future. These values may
contain lags, i.e. additional values from the past which are added in order to serve as "extra context".
The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as
`static_categorical_features`, `static_real_features`, `past_time_features`).
The sequence length here is equal to `context_length` + `max(config.lags_sequence)`.
Missing values need to be replaced with zeros.
past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `past_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional time features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in
`[0, 1]`:
- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*):
Optional static categorical features for which the model will learn an embedding, which it will add to the
values of the time series.
Static categorical features are features which have the same value for all time steps (static over time).
A typical example of a static categorical feature is a time series ID.
static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*):
Optional static real features which the model will add to the values of the time series.
Static real features are features which have the same value for all time steps (static over time).
A typical example of a static real feature is promotion information.
future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`):
Future values of the time series, that serve as labels for the model. The `future_values` is what the
Transformer needs to learn to output, given the `past_values`.
See the demo notebook and code snippets for details.
Missing values need to be replaced with zeros.
future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*):
Optional time features, which the model internally will add to `future_values`. These could be things like
"month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These
could also be so-called "age" features, which basically help the model know "at which point in life" a
time-series is. Age features have small values for distant past time steps and increase monotonically the
more we approach the current time step.
These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where
the position encodings are learned from scratch internally as parameters of the model, the Time Series
Transformer requires to provide additional features.
The Autoformer only learns additional embeddings for `static_categorical_features`.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
in `[0, 1]`:
- 1 for values that are **observed**,
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
This mask is used to filter out missing values for the final loss calculation.
Examples:

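The `Examples:` sections above are cut off by the hunk boundaries. For orientation, here is a sketch of a training-style forward pass in the spirit of the library's Autoformer documentation, assuming the `huggingface/autoformer-tourism-monthly` checkpoint and the `hf-internal-testing/tourism-monthly-batch` test batch used there:

```python
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoformerForPrediction

file = hf_hub_download(
    repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
)
batch = torch.load(file)

model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly")

# During training both past and future values are provided, as the docstring above describes.
outputs = model(
    past_values=batch["past_values"],
    past_time_features=batch["past_time_features"],
    past_observed_mask=batch["past_observed_mask"],
    static_categorical_features=batch["static_categorical_features"],
    future_values=batch["future_values"],
    future_time_features=batch["future_time_features"],
)
loss = outputs.loss
```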

@ -27,21 +27,15 @@ from torch import nn
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import ModelOutput
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torchdynamo_compiling,
replace_return_docstrings,
)
from ..auto import AutoModel, AutoModelForCausalLM
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
from ..auto import AutoModel
from .configuration_aya_vision import AyaVisionConfig
_CONFIG_FOR_DOC = "AyaVisionConfig"
class AyaVisionMultiModalProjector(nn.Module):
def __init__(self, config: AyaVisionConfig):
super().__init__()
@ -92,38 +86,18 @@ class AyaVisionMultiModalProjector(nn.Module):
return image_features
AYA_VISION_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`AyaVisionConfig`] or [`AyaVisionVisionConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Aya Vision Model outputting raw hidden-states without any specific head on top.",
AYA_VISION_START_DOCSTRING,
)
@auto_docstring
class AyaVisionPreTrainedModel(PreTrainedModel):
config_class = AyaVisionConfig
base_model_prefix = "model"
base_model_prefix = ""
supports_gradient_checkpointing = True
_no_split_modules = ["AyaVisionVisionAttention"]
_skip_keys_device_placement = "past_key_values"
_supports_cache_class = True
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_quantized_cache = False
_supports_static_cache = False
_supports_attention_backend = True
def _init_weights(self, module):
std = (
@ -169,7 +143,7 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""
@ -181,101 +155,53 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
image_hidden_states: Optional[torch.FloatTensor] = None
AYA_VISION_INPUTS_DOCSTRING = """
@dataclass
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
"""
Base class for AyaVision outputs, with hidden states and attentions.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
The tensors corresponding to the input images. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`AyaVisionProcessor`] uses
[`GotOcr2ImageProcessor`] for processing images.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
The index of the layer to select the vision feature. If multiple indices are provided,
the vision feature of the corresponding indices will be concatenated to form the
vision features.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""
image_hidden_states: Optional[torch.FloatTensor] = None
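# The `past_key_values` shapes described above, shown as a toy example (sizes are made up): each
# decoder layer caches one key and one value tensor, and once the cache exists only the newest
# token has to be embedded and passed to the model.
import torch

batch_size, num_heads, seq_len, head_dim, n_layers = 2, 4, 7, 16, 3
past_key_values = [
    (
        torch.randn(batch_size, num_heads, seq_len, head_dim),  # cached keys
        torch.randn(batch_size, num_heads, seq_len, head_dim),  # cached values
    )
    for _ in range(n_layers)
]
next_input_ids = torch.randint(0, 1000, (batch_size, 1))  # only the last token is passed
print(len(past_key_values), past_key_values[0][0].shape, next_input_ids.shape)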
@add_start_docstrings(
"""The AyaVision model which consists of a vision backbone and a language model.""",
AYA_VISION_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The AyaVision model which consists of a vision backbone and a language model, without a language modeling head.
"""
)
class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin):
class AyaVisionModel(AyaVisionPreTrainedModel):
_checkpoint_conversion_mapping = {"language_model.model": "language_model"}
def __init__(self, config: AyaVisionConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
self.multi_modal_projector = AyaVisionMultiModalProjector(config)
self.vocab_size = config.text_config.vocab_size
self.language_model = AutoModelForCausalLM.from_config(config.text_config)
if self.language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
self.language_model = AutoModel.from_config(config.text_config)
self.post_init()
def get_input_embeddings(self):
@ -284,18 +210,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.language_model.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def set_decoder(self, decoder):
self.language_model.set_decoder(decoder)
def get_decoder(self):
return self.language_model.get_decoder()
def get_image_features(
self,
pixel_values: torch.FloatTensor,
@ -307,7 +221,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
Obtains image last hidden states from the vision tower and applies multimodal projection.
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
The tensors corresponding to the input images.
vision_feature_layer (`Union[int, List[int]]`):
The index of the layer to select the vision feature. If multiple indices are provided,
@ -342,8 +256,145 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
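# A minimal sketch of the `vision_feature_layer` handling referenced above: a single index
# selects one hidden state from the vision tower, while a list of indices concatenates several
# of them along the hidden dimension before the multi-modal projector is applied. The tensors
# below are random stand-ins for `vision_tower(..., output_hidden_states=True).hidden_states`.
import torch

toy_hidden_states = [torch.randn(1, 10, 8) for _ in range(5)]  # 5 layers, hidden_size=8

def select_vision_features(hidden_states, vision_feature_layer):
    if isinstance(vision_feature_layer, int):
        return hidden_states[vision_feature_layer]
    return torch.cat([hidden_states[idx] for idx in vision_feature_layer], dim=-1)

print(select_vision_features(toy_hidden_states, -2).shape)        # torch.Size([1, 10, 8])
print(select_vision_features(toy_hidden_states, [-2, -1]).shape)  # torch.Size([1, 10, 16])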
@add_start_docstrings_to_model_forward(AYA_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AyaVisionCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
vision_feature_layer: Optional[Union[int, List[int]]] = None,
vision_feature_select_strategy: Optional[str] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
image_sizes: torch.Tensor = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, AyaVisionModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_feature_layer = (
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
)
vision_feature_select_strategy = (
vision_feature_select_strategy
if vision_feature_select_strategy is not None
else self.config.vision_feature_select_strategy
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
if pixel_values is not None:
image_features = self.get_image_features(
pixel_values=pixel_values,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
image_sizes=image_sizes,
)
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0]
else:
special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
n_image_tokens = (input_ids == self.config.image_token_id).sum()
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
n_image_tokens = (input_ids == self.config.image_token_id).sum()
n_image_features = image_features.shape[0] * image_features.shape[1]
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
cache_position=cache_position,
**kwargs,
)
return AyaVisionModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
)
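# A self-contained sketch (tiny made-up shapes and a made-up image_token_id) of the placeholder
# replacement performed in the forward pass above: positions holding the image token are turned
# into a boolean mask over the embedding tensor and filled with the projected image features.
import torch

image_token_id = 9
input_ids = torch.tensor([[1, 9, 9, 2]])                               # two image placeholders
inputs_embeds = torch.zeros(1, 4, 3)                                   # (batch, seq_len, hidden)
image_features = torch.arange(6, dtype=torch.float).reshape(1, 2, 3)   # (1, num_image_tokens, hidden)

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
print(inputs_embeds)  # rows 1 and 2 now carry the image features, rows 0 and 3 stay text embeddings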
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
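# For readers unfamiliar with the `**kwargs: Unpack[KwargsForCausalLM]` annotation used in the
# forward signatures here: combining two TypedDicts and unpacking them in a signature gives type
# checkers the full set of optional keyword arguments while the runtime still sees plain **kwargs.
# The keys below are toy examples, not the real transformers kwargs, and `typing_extensions` is
# assumed to be available.
from typing_extensions import TypedDict, Unpack

class ToyFlashKwargs(TypedDict, total=False):
    sliding_window: int

class ToyLossKwargs(TypedDict, total=False):
    num_items_in_batch: int

class ToyCausalLMKwargs(ToyFlashKwargs, ToyLossKwargs): ...

def forward_stub(**kwargs: Unpack[ToyCausalLMKwargs]) -> dict:
    return dict(kwargs)

print(forward_stub(sliding_window=128, num_items_in_batch=3))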
@auto_docstring(
custom_intro="""
The AyaVision model, which consists of a vision backbone and a language model.
"""
)
class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin):
_checkpoint_conversion_mapping = {
"^language_model.model": "model.language_model",
"^vision_tower": "model.vision_tower",
"^multi_modal_projector": "model.multi_modal_projector",
"^language_model.lm_head": "lm_head",
}
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: AyaVisionConfig):
super().__init__(config)
self.model = AyaVisionModel(config)
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.get_input_embeddings()
def set_input_embeddings(self, value):
self.model.set_input_embeddings(value)
def get_output_embeddings(self) -> nn.Module:
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# Make modules available through conditional class for BC
@property
def language_model(self):
return self.model.language_model
@property
def vision_tower(self):
return self.model.vision_tower
@property
def multi_modal_projector(self):
return self.model.multi_modal_projector
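# A rough sketch of what a key mapping like `_checkpoint_conversion_mapping` is for: older
# checkpoints stored weights under e.g. "language_model.model.*", and the "^"-anchored patterns
# rewrite those keys into the new nested layout when loading. The loop below only illustrates the
# idea and is not the actual loading code.
import re

mapping = {
    "^language_model.model": "model.language_model",
    "^vision_tower": "model.vision_tower",
    "^multi_modal_projector": "model.multi_modal_projector",
    "^language_model.lm_head": "lm_head",
}

def convert_key(key: str) -> str:
    for pattern, replacement in mapping.items():
        new_key, n_subs = re.subn(pattern, replacement, key)
        if n_subs:
            return new_key
    return key

print(convert_key("language_model.model.embed_tokens.weight"))  # model.language_model.embed_tokens.weight
print(convert_key("language_model.lm_head.weight"))             # lm_head.weight
print(convert_key("vision_tower.encoder.layers.0.weight"))      # model.vision_tower.encoder.layers.0.weight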
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -362,23 +413,13 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
image_sizes: Optional[torch.Tensor] = None,
**lm_kwargs,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, AyaVisionCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
@ -410,7 +451,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
>>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
>>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -425,81 +465,42 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
else self.config.vision_feature_select_strategy
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
if pixel_values is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
if pixel_values is not None:
image_features = self.get_image_features(
pixel_values=pixel_values,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
image_sizes=image_sizes,
)
special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
n_image_tokens = (input_ids == self.config.image_token_id).sum()
n_image_features = image_features.shape[0] * image_features.shape[1]
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
outputs = self.model(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
return_dict=True,
cache_position=cache_position,
logits_to_keep=logits_to_keep,
**lm_kwargs,
image_sizes=image_sizes,
**kwargs,
)
logits = outputs[0]
hidden_states = outputs[0]
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = self.lm_head(hidden_states[:, slice_indices, :])
loss = None
if labels is not None:
# Shift so that tokens < n predict n
if attention_mask is not None:
# we use the input attention mask to shift the logits and labels, because it is 2D.
# we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
else:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
loss = self.loss_function(
logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return AyaVisionCausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
image_hidden_states=outputs.image_hidden_states,
)
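# A compact sketch (toy sizes, random weights) of the two steps in the forward pass above:
# slicing the hidden states with `logits_to_keep` before applying the lm_head, and the shifted
# cross-entropy that `self.loss_function` is expected to perform so that tokens < n predict
# token n, with -100 labels ignored.
import torch
from torch import nn

hidden_states = torch.randn(2, 6, 8)                  # (batch, seq_len, hidden)
lm_head = nn.Linear(8, 32, bias=False)                # toy vocabulary of 32

logits_to_keep = 0                                    # 0 -> keep logits for every position
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = lm_head(hidden_states[:, slice_indices, :])  # (2, 6, 32); with logits_to_keep=1 -> (2, 1, 32)

labels = torch.randint(0, 32, (2, 6))
labels[:, :2] = -100                                  # e.g. prompt tokens are not supervised
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = nn.CrossEntropyLoss(ignore_index=-100)(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
print(logits.shape, loss.item())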
def prepare_inputs_for_generation(
@ -515,7 +516,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
):
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
model_inputs = self.language_model.prepare_inputs_for_generation(
model_inputs = super().prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
@ -532,15 +533,60 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
return model_inputs
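# The override above exists because image inputs only need to be forwarded on the prefill step;
# later decoding steps reuse the image information already merged into the cache. The exact
# condition is not visible in this hunk, so the check below is an assumed illustration.
import torch

def attach_pixel_values(model_inputs: dict, pixel_values, cache_position):
    if cache_position is not None and cache_position[0] == 0:  # first forward pass only (assumed)
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

print(attach_pixel_values({}, torch.zeros(1, 3, 8, 8), torch.arange(4)))    # pixel_values attached
print(attach_pixel_values({}, torch.zeros(1, 3, 8, 8), torch.tensor([4])))  # pixel_values omitted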
def tie_weights(self):
return self.language_model.tie_weights()
@staticmethod
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
target_length: int,
dtype: torch.dtype,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
# update vocab size
self.config.text_config.vocab_size = model_embeds.num_embeddings
self.vocab_size = model_embeds.num_embeddings
return model_embeds
Args:
attention_mask (`torch.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
`(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache,
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`int`):
Batch size.
"""
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
causal_mask.device
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
return causal_mask
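# A runnable miniature of the helper above, with toy sizes, showing the resulting mask shape and
# the effect of a padding mask (torch.float32 stands in for whatever dtype the model runs in).
import torch

def toy_causal_mask(attention_mask, sequence_length, target_length, dtype, cache_position, batch_size):
    min_dtype = torch.finfo(dtype).min
    mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    if sequence_length != 1:
        mask = torch.triu(mask, diagonal=1)
    mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
    mask = mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()
    if attention_mask is not None:
        mask_length = attention_mask.shape[-1]
        padding_mask = (mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]) == 0
        mask[:, :, :, :mask_length] = mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
    return mask

mask = toy_causal_mask(torch.tensor([[0, 1, 1]]), 3, 3, torch.float32, torch.arange(3), 1)
print(mask.shape)             # torch.Size([1, 1, 3, 3])
print((mask == 0).squeeze())  # True where a query position may attend to a key position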
__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel"]
__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel", "AyaVisionModel"]

View File

@ -20,23 +20,21 @@ import torch
from torch import nn
from transformers.models.llava.modeling_llava import (
KwargsForCausalLM,
LlavaCausalLMOutputWithPast,
LlavaForConditionalGeneration,
LlavaModel,
LlavaPreTrainedModel,
)
from ...activations import ACT2FN
from ...utils import (
add_start_docstrings,
logging,
)
from ...processing_utils import Unpack
from ...utils import logging
from .configuration_aya_vision import AyaVisionConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "AyaVisionConfig"
class AyaVisionMultiModalProjector(nn.Module):
def __init__(self, config: AyaVisionConfig):
@ -88,27 +86,6 @@ class AyaVisionMultiModalProjector(nn.Module):
return image_features
AYA_VISION_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`AyaVisionConfig`] or [`AyaVisionVisionConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Aya Vision Model outputting raw hidden-states without any specific head on top.",
AYA_VISION_START_DOCSTRING,
)
class AyaVisionPreTrainedModel(LlavaPreTrainedModel):
_supports_quantized_cache = False
_supports_static_cache = False
@ -133,98 +110,11 @@ class AyaVisionCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
pass
AYA_VISION_INPUTS_DOCSTRING = """
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The tensors corresponding to the input images. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`AyaVisionProcessor`] uses
[`GotOcr2ImageProcessor`] for processing images.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
The index of the layer to select the vision feature. If multiple indices are provided,
the vision feature of the corresponding indices will be concatenated to form the
vision features.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
class AyaVisionModel(LlavaModel):
pass
@add_start_docstrings(
"""The AyaVision model which consists of a vision backbone and a language model.""",
AYA_VISION_START_DOCSTRING,
)
class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
def tie_weights(self):
return self.language_model.tie_weights()
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
# update vocab size
self.config.text_config.vocab_size = model_embeds.num_embeddings
self.vocab_size = model_embeds.num_embeddings
return model_embeds
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -243,23 +133,13 @@ class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
image_sizes: Optional[torch.Tensor] = None,
**lm_kwargs,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, AyaVisionCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
@ -308,8 +188,8 @@ class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration):
cache_position=cache_position,
logits_to_keep=logits_to_keep,
image_sizes=image_sizes,
**lm_kwargs,
**kwargs,
)
__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel"]
__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel", "AyaVisionModel"]

View File

@ -41,13 +41,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
logging,
replace_return_docstrings,
)
from ...utils import auto_docstring, can_return_tuple, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_bamba import BambaConfig
@ -66,8 +60,6 @@ else:
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BambaConfig"
# Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer
class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache):
@ -854,6 +846,7 @@ class BambaMixer(nn.Module):
# Init cache
if ssm_state is not None and cache_params is not None:
cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
cache_params.has_previous_state = True
scan_output = self.norm(y, gate)
@ -1013,27 +1006,7 @@ class BambaDecoderLayer(nn.Module):
return outputs
BAMBA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BambaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare BambaModel outputting raw hidden-states without any specific head on top.",
BAMBA_START_DOCSTRING,
)
@auto_docstring
class BambaPreTrainedModel(PreTrainedModel):
config_class = BambaConfig
base_model_prefix = "model"
@ -1063,91 +1036,8 @@ class BambaPreTrainedModel(PreTrainedModel):
module.D.data.fill_(1.0)
BAMBA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the
self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`.
Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and
`(batch_size, d_inner, d_state)` respectively.
See the `HybridMambaAttentionDynamicCache` class for more details.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
output_router_logits (`bool`, *optional*):
Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
should not be returned during inference.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"The bare Bamba Model outputting raw hidden-states without any specific head on top.",
BAMBA_START_DOCSTRING,
)
# Adapted from transformers.models.jamba.modeling_jamba.JambaModel
@auto_docstring
class BambaModel(BambaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BambaDecoderLayer`]
Args:
config: BambaConfig
"""
def __init__(self, config: BambaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
@ -1174,7 +1064,7 @@ class BambaModel(BambaPreTrainedModel):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1413,6 +1303,7 @@ class BambaModel(BambaPreTrainedModel):
return mamba_mask
@auto_docstring
class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
@ -1446,8 +1337,7 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
return self.model
@can_return_tuple
@add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1464,19 +1354,10 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
**kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:

View File

@ -43,23 +43,11 @@ from transformers.models.mamba2.modeling_mamba2 import (
segment_sum,
)
from ...modeling_attn_mask_utils import (
AttentionMaskConverter,
)
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
can_return_tuple,
logging,
replace_return_docstrings,
)
from ...utils.import_utils import (
is_causal_conv1d_available,
is_flash_attn_2_available,
is_mamba_2_ssm_available,
)
from ...utils import auto_docstring, can_return_tuple, logging
from ...utils.import_utils import is_causal_conv1d_available, is_flash_attn_2_available, is_mamba_2_ssm_available
from .configuration_bamba import BambaConfig
@ -82,8 +70,6 @@ is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_c
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BambaConfig"
# Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer
class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache):
@ -651,6 +637,7 @@ class BambaMixer(nn.Module):
# Init cache
if ssm_state is not None and cache_params is not None:
cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
cache_params.has_previous_state = True
scan_output = self.norm(y, gate)
@ -781,27 +768,7 @@ class BambaDecoderLayer(JambaAttentionDecoderLayer):
return outputs
BAMBA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BambaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare BambaModel outputting raw hidden-states without any specific head on top.",
BAMBA_START_DOCSTRING,
)
@auto_docstring
class BambaPreTrainedModel(PreTrainedModel):
config_class = BambaConfig
base_model_prefix = "model"
@ -831,91 +798,8 @@ class BambaPreTrainedModel(PreTrainedModel):
module.D.data.fill_(1.0)
BAMBA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the
self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`.
Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and
`(batch_size, d_inner, d_state)` respectively.
See the `HybridMambaAttentionDynamicCache` class for more details.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
output_router_logits (`bool`, *optional*):
Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
should not be returned during inference.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"The bare Bamba Model outputting raw hidden-states without any specific head on top.",
BAMBA_START_DOCSTRING,
)
# Adapted from transformers.models.jamba.modeling_jamba.JambaModel
@auto_docstring
class BambaModel(BambaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BambaDecoderLayer`]
Args:
config: BambaConfig
"""
def __init__(self, config: BambaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
@ -942,7 +826,7 @@ class BambaModel(BambaPreTrainedModel):
self.embed_tokens = value
@can_return_tuple
@add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1182,9 +1066,6 @@ class BambaModel(BambaPreTrainedModel):
class BambaForCausalLM(LlamaForCausalLM):
@can_return_tuple
@add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1201,19 +1082,10 @@ class BambaForCausalLM(LlamaForCausalLM):
**kwargs,
) -> CausalLMOutputWithPast:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:

View File

@ -34,8 +34,7 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
from ...modeling_utils import PreTrainedModel, get_parameter_device
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
auto_docstring,
is_accelerate_available,
is_torch_accelerator_available,
logging,
@ -62,10 +61,6 @@ if is_flash_attn_available():
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "suno/bark-small"
_CONFIG_FOR_DOC = "BarkConfig"
class BarkSelfAttention(nn.Module):
# adapted from GPTNeoSelfAttention and Bark code
# BarkSelfAttention can have two attention types, i.e. full attention or causal attention
@ -368,12 +363,8 @@ class BarkBlock(nn.Module):
return outputs # hidden_states, ((present), attentions)
@auto_docstring
class BarkPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BarkConfig
supports_gradient_checkpointing = False
_supports_flash_attn_2 = True
@ -418,134 +409,6 @@ class BarkPreTrainedModel(PreTrainedModel):
return get_parameter_device(self)
BARK_MODEL_START_DOCSTRING = """
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`{config}`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BARK_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BarkConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BARK_FINE_INPUTS_DOCSTRING = r"""
Args:
codebook_idx (`int`):
Index of the codebook that will be predicted.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is
predicted recursively by attending the previously predicted channels. The model predicts on windows of
length 1024.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET.
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
`past_key_values` is used, optionally only the last `input_embeds` have to be input (see
`past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
associated vectors than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`input_ids` of shape `(batch_size, sequence_length)`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
is used in priority instead of `input_ids`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
config_class = BarkSubModelConfig
@ -639,7 +502,7 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
"attention_mask": attention_mask,
}
@add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -654,6 +517,13 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
r"""
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
takes priority over `input_ids`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -803,10 +673,11 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
)
@add_start_docstrings(
"""Bark semantic (or text) model. It shares the same architecture as the coarse model.
It is a GPT-2 like autoregressive model with a language modeling head on top.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"),
@auto_docstring(
custom_intro="""
Bark semantic (or text) model. It shares the same architecture as the coarse model.
It is a GPT-2 like autoregressive model with a language modeling head on top.
"""
)
class BarkSemanticModel(BarkCausalModel):
base_model_prefix = "semantic"
@ -912,11 +783,12 @@ class BarkSemanticModel(BarkCausalModel):
return semantic_output
@add_start_docstrings(
"""Bark coarse acoustics model.
@auto_docstring(
custom_intro="""
Bark coarse acoustics model.
It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
language modeling head on top.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"),
language modeling head on top.
"""
)
class BarkCoarseModel(BarkCausalModel):
base_model_prefix = "coarse_acoustics"
@ -1133,10 +1005,11 @@ class BarkCoarseModel(BarkCausalModel):
return coarse_output
@add_start_docstrings(
"""Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
language modeling heads, one for each codebook.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"),
@auto_docstring(
custom_intro="""
Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
language modeling heads, one for each codebook.
"""
)
class BarkFineModel(BarkPreTrainedModel):
base_model_prefix = "fine_acoustics"
@ -1293,7 +1166,7 @@ class BarkFineModel(BarkPreTrainedModel):
if hasattr(module, "_tie_weights"):
module._tie_weights()
@add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
@auto_docstring
def forward(
self,
codebook_idx: int, # an additional idx corresponding to the id of the codebook that will be predicted
@ -1307,6 +1180,17 @@ class BarkFineModel(BarkPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
codebook_idx (`int`):
Index of the codebook that will be predicted.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
NOT IMPLEMENTED YET.
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
`past_key_values` is used, optionally only the last `input_embeds` have to be input (see
`past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
associated vectors than the model's internal embedding lookup matrix.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -1541,8 +1425,8 @@ class BarkFineModel(BarkPreTrainedModel):
return fine_input
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
The full Bark model, a text-to-speech model composed of 4 sub-models:
- [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
takes
@ -1557,8 +1441,7 @@ class BarkFineModel(BarkPreTrainedModel):
It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
output sound according to a specific predefined voice.
""",
BARK_START_DOCSTRING,
"""
)
class BarkModel(BarkPreTrainedModel):
config_class = BarkConfig
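As a hedged usage sketch (not part of this diff), the snippet below shows how the four sub-models above are typically driven end to end through `generate`; the `suno/bark-small` checkpoint and the voice preset are illustrative assumptions rather than values taken from this file.
```python
# Hedged sketch: end-to-end Bark text-to-speech. Checkpoint and voice preset are assumptions.
from transformers import AutoProcessor, BarkModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")

inputs = processor("Hello, this is a test.", voice_preset="v2/en_speaker_6")
audio_array = model.generate(**inputs)
audio_array = audio_array.cpu().numpy().squeeze()
sample_rate = model.generation_config.sample_rate  # needed to play or save the waveform
```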

View File

@ -43,14 +43,7 @@ from ...modeling_outputs import (
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils import auto_docstring, logging
from .configuration_bart import BartConfig
@ -60,22 +53,6 @@ if is_flash_attn_available():
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/bart-base"
_CONFIG_FOR_DOC = "BartConfig"
# Base model docstring
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
# SequenceClassification docstring
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2"
_SEQ_CLASS_EXPECTED_LOSS = 0.0
_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'"
# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1"
_QA_EXPECTED_LOSS = 0.59
_QA_EXPECTED_OUTPUT = "' nice puppet'"
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
@ -739,6 +716,7 @@ class BartClassificationHead(nn.Module):
return hidden_states
@auto_docstring
class BartPreTrainedModel(PreTrainedModel):
config_class = BartConfig
base_model_prefix = "model"
@ -787,163 +765,6 @@ class BartPretrainedModel(BartPreTrainedModel):
)
BART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example:
```python
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
>>> ARTICLE_TO_SUMMARIZE = (
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
```
Mask filling example:
```python
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
>>> TXT = "My friends are <mask> but they eat too many carbs."
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
['not', 'good', 'healthy', 'great', 'very']
```
"""
BART_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
input (see `past_key_values`). This is useful if you want more control over how to convert
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
of `inputs_embeds`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class BartEncoder(BartPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@ -1419,10 +1240,7 @@ class BartDecoder(BartPreTrainedModel):
)
@add_start_docstrings(
"The bare BART Model outputting raw hidden-states without any specific head on top.",
BART_START_DOCSTRING,
)
@auto_docstring
class BartModel(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -1465,13 +1283,7 @@ class BartModel(BartPreTrainedModel):
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1490,6 +1302,35 @@ class BartModel(BartPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqModelOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
"""
# different to other models, Bart automatically creates decoder_input_ids from
# input_ids if no decoder_input_ids are provided
if decoder_input_ids is None and decoder_inputs_embeds is None:
@ -1560,8 +1401,10 @@ class BartModel(BartPreTrainedModel):
)
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
@auto_docstring(
custom_intro="""
The BART Model with a language modeling head. Can be used for summarization.
"""
)
class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
@ -1610,9 +1453,7 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
self.model._tie_weights()
self._tie_or_clone_weights(self.lm_head, self.model.shared)
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1633,12 +1474,78 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example summarization:
```python
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
>>> ARTICLE_TO_SUMMARIZE = (
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
```
Mask filling example:
```python
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
>>> TXT = "My friends are <mask> but they eat too many carbs."
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
['not', 'good', 'healthy', 'great', 'very']
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1709,12 +1616,11 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
return reordered_past
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Bart model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
tasks.
""",
BART_START_DOCSTRING,
"""
)
class BartForSequenceClassification(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -1732,14 +1638,7 @@ class BartForSequenceClassification(BartPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1759,6 +1658,33 @@ class BartForSequenceClassification(BartPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
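As a hedged sketch of the classification head documented above, the snippet below uses the `valhalla/bart-large-sst2` checkpoint referenced by the removed docstring constants; the exact prediction depends on the input text.
```python
import torch
from transformers import AutoTokenizer, BartForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-sst2")
model = BartForSequenceClassification.from_pretrained("valhalla/bart-large-sst2").eval()

inputs = tokenizer("This movie was surprisingly good!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

print(model.config.id2label[logits.argmax(-1).item()])  # likely 'POSITIVE' for this input
```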
@ -1839,13 +1765,7 @@ class BartForSequenceClassification(BartPreTrainedModel):
)
@add_start_docstrings(
"""
BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BART_START_DOCSTRING,
)
@auto_docstring
class BartForQuestionAnswering(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -1861,14 +1781,7 @@ class BartForQuestionAnswering(BartPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_QA,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=_QA_EXPECTED_LOSS,
expected_output=_QA_EXPECTED_OUTPUT,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1889,14 +1802,33 @@ class BartForQuestionAnswering(BartPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
@ -1978,11 +1910,10 @@ class BartDecoderWrapper(BartPreTrainedModel):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
BART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
BART_START_DOCSTRING,
"""
)
class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
@ -2017,7 +1948,7 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2035,72 +1966,15 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:

View File

@ -21,6 +21,7 @@ if TYPE_CHECKING:
from .configuration_beit import *
from .feature_extraction_beit import *
from .image_processing_beit import *
from .image_processing_beit_fast import *
from .modeling_beit import *
from .modeling_flax_beit import *
else:

View File

@ -0,0 +1,287 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for Beit."""
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torchvision.transforms import functional as F
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
SizeDict,
is_torch_tensor,
make_list_of_images,
pil_torch_interpolation_mapping,
validate_kwargs,
)
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring
class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g.
ADE20k). The background label will be replaced by 255.
"""
do_reduce_labels: Optional[bool]
@auto_docstring
class BeitImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BICUBIC
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
size = {"height": 224, "width": 224}
default_to_square = True
crop_size = {"height": 224, "width": 224}
do_resize = True
do_center_crop = False
do_rescale = True
do_normalize = True
do_reduce_labels = False
valid_kwargs = BeitFastImageProcessorKwargs
def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]):
super().__init__(**kwargs)
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to preserve support for the deprecated `reduce_labels` key in old configs.
"""
image_processor_dict = image_processor_dict.copy()
if "reduce_labels" in image_processor_dict:
image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels")
return super().from_dict(image_processor_dict, **kwargs)
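A small sketch of the backwards-compatibility path above (an assumption about typical usage, not taken from the diff): an old-style dict that still uses `reduce_labels` is mapped onto `do_reduce_labels`.
```python
from transformers import BeitImageProcessorFast

# Old-style config key `reduce_labels` is transparently renamed to `do_reduce_labels`.
processor = BeitImageProcessorFast.from_dict({"reduce_labels": True})
print(processor.do_reduce_labels)  # True
```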
def reduce_label(self, labels: list["torch.Tensor"]):
for idx in range(len(labels)):
label = labels[idx]
label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label)
label = label - 1
label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label)
labels[idx] = label
return labels
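For reference, a torch-only illustration of the mapping applied by `reduce_label`, assuming class id 0 is the background to be ignored: 0 becomes 255 and every other id is shifted down by one.
```python
import torch

label = torch.tensor([[0, 1, 2], [3, 0, 150]])
label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label)  # background -> ignore index
label = label - 1                                                              # shift remaining ids down
label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label) # keep ignore index at 255
print(label)  # tensor([[255,   0,   1], [  2, 255, 149]])
```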
def _preprocess(
self,
images: list["torch.Tensor"],
do_reduce_labels: bool,
do_resize: bool,
size: SizeDict,
interpolation: Optional["F.InterpolationMode"],
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
return_tensors: Optional[Union[str, TensorType]],
**kwargs,
) -> BatchFeature:
if do_reduce_labels:
images = self.reduce_label(images)
# Group images by size for batched resizing
grouped_images, grouped_images_index = group_images_by_shape(images)
resized_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_resize:
stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
resized_images_grouped[shape] = stacked_images
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
# Group images by size for further processing
# Needed in case do_resize is False, or resize returns images with different sizes
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
processed_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_center_crop:
stacked_images = self.center_crop(stacked_images, crop_size)
# Fused rescale and normalize
stacked_images = self.rescale_and_normalize(
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_images_grouped[shape] = stacked_images
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return processed_images
def _preprocess_images(
self,
images,
**kwargs,
):
"""Preprocesses images."""
kwargs["do_reduce_labels"] = False
processed_images = self._preprocess(images=images, **kwargs)
return processed_images
def _preprocess_segmentation_maps(
self,
segmentation_maps,
**kwargs,
):
"""Preprocesses segmentation maps."""
processed_segmentation_maps = []
for segmentation_map in segmentation_maps:
segmentation_map = self._process_image(
segmentation_map, do_convert_rgb=False, input_data_format=ChannelDimension.FIRST
)
if segmentation_map.ndim == 2:
segmentation_map = segmentation_map[None, ...]
processed_segmentation_maps.append(segmentation_map)
kwargs["do_normalize"] = False
kwargs["do_rescale"] = False
kwargs["input_data_format"] = ChannelDimension.FIRST
processed_segmentation_maps = self._preprocess(images=processed_segmentation_maps, **kwargs)
processed_segmentation_maps = processed_segmentation_maps.squeeze(1)
processed_segmentation_maps = processed_segmentation_maps.to(torch.int64)
return processed_segmentation_maps
def __call__(self, images, segmentation_maps=None, **kwargs):
# Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both
# be passed in as positional arguments.
return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
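A hedged usage sketch of the positional `__call__` above; the image and map shapes are illustrative, and the expected output shapes assume the default 224x224 size.
```python
import torch
from transformers import BeitImageProcessorFast

processor = BeitImageProcessorFast(do_reduce_labels=True)
images = [torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)]
segmentation_maps = [torch.randint(0, 150, (480, 640), dtype=torch.uint8)]

# Images and segmentation maps can both be passed positionally.
batch = processor(images, segmentation_maps, return_tensors="pt")
print(batch["pixel_values"].shape, batch["labels"].shape)  # (1, 3, 224, 224) and (1, 224, 224)
```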
@auto_docstring
def preprocess(
self,
images: ImageInput,
segmentation_maps: Optional[ImageInput] = None,
**kwargs: Unpack[BeitFastImageProcessorKwargs],
) -> BatchFeature:
r"""
segmentation_maps (`ImageInput`, *optional*):
The segmentation maps to preprocess.
"""
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self.valid_kwargs.__annotations__:
kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
# Extract parameters that are only used for preparing the input images
do_convert_rgb = kwargs.pop("do_convert_rgb")
input_data_format = kwargs.pop("input_data_format")
device = kwargs.pop("device")
# Prepare input images
images = self._prepare_input_images(
images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
)
# Prepare segmentation maps
if segmentation_maps is not None:
segmentation_maps = make_list_of_images(images=segmentation_maps, expected_ndims=2)
# Update kwargs that need further processing before being validated
kwargs = self._further_process_kwargs(**kwargs)
# Validate kwargs
self._validate_preprocess_kwargs(**kwargs)
# torch resize uses interpolation instead of resample
resample = kwargs.pop("resample")
kwargs["interpolation"] = (
pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
)
# Pop kwargs that are not needed in _preprocess
kwargs.pop("default_to_square")
kwargs.pop("data_format")
images = self._preprocess_images(
images=images,
**kwargs,
)
data = {"pixel_values": images}
if segmentation_maps is not None:
segmentation_maps = self._preprocess_segmentation_maps(
segmentation_maps=segmentation_maps,
**kwargs,
)
data["labels"] = segmentation_maps
return BatchFeature(data=data)
def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[List[Tuple]] = None):
"""
Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`BeitForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
predictions will not be resized.
Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` corresponds to a semantic class id.
"""
# TODO: add support for other frameworks
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
if is_torch_tensor(target_sizes):
target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
__all__ = ["BeitImageProcessorFast"]
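A self-contained sketch of what `post_process_semantic_segmentation` does (not from the diff): per-class logits are upsampled to each requested `(height, width)` and argmaxed into a class-id map. The logit shape below is a made-up stand-in for real model output.
```python
import torch
from types import SimpleNamespace
from transformers import BeitImageProcessorFast

processor = BeitImageProcessorFast()
outputs = SimpleNamespace(logits=torch.randn(1, 150, 56, 56))  # fake model output, 150 classes

maps = processor.post_process_semantic_segmentation(outputs, target_sizes=[(480, 640)])
print(maps[0].shape)  # torch.Size([480, 640]): one class id per pixel
```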

View File

@ -36,14 +36,7 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import compile_compatible_method_lru_cache, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
torch_int,
)
from ...utils import auto_docstring, logging, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_beit import BeitConfig
@ -51,10 +44,8 @@ from .configuration_beit import BeitConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "BeitConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
# Image classification docstring
@ -741,12 +732,8 @@ class BeitEncoder(nn.Module):
)
@auto_docstring
class BeitPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BeitConfig
base_model_prefix = "beit"
main_input_name = "pixel_values"
@ -784,48 +771,13 @@ class BeitPreTrainedModel(PreTrainedModel):
module.lambda_2.data.fill_(self.config.layer_scale_init_value)
BEIT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BEIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`BeitImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Beit Model transformer outputting raw hidden-states without any specific head on top.",
BEIT_START_DOCSTRING,
)
@auto_docstring
class BeitModel(BeitPreTrainedModel):
def __init__(self, config: BeitConfig, add_pooling_layer: bool = True) -> None:
r"""
add_pooling_layer (`bool`, *optional*, defaults to `True`):
Whether to add a pooling layer.
"""
super().__init__(config)
self.config = config
@ -851,14 +803,7 @@ class BeitModel(BeitPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BeitModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
@auto_docstring
def forward(
self,
pixel_values: torch.Tensor,
@ -933,12 +878,13 @@ class BeitPooler(nn.Module):
return pooled_output
@add_start_docstrings(
"""Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
@auto_docstring(
custom_intro="""
Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.""",
BEIT_START_DOCSTRING,
will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
"""
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
def __init__(self, config: BeitConfig) -> None:
@ -954,8 +900,7 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
@ -970,14 +915,11 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
Examples:
```python
@ -1035,12 +977,11 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
hidden states of the patch tokens) e.g. for ImageNet.
""",
BEIT_START_DOCSTRING,
"""
)
class BeitForImageClassification(BeitPreTrainedModel):
def __init__(self, config: BeitConfig) -> None:
@ -1055,13 +996,7 @@ class BeitForImageClassification(BeitPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
@ -1361,12 +1296,7 @@ class BeitFCNHead(nn.Module):
return output
@add_start_docstrings(
"""
Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
""",
BEIT_START_DOCSTRING,
)
@auto_docstring
class BeitForSemanticSegmentation(BeitPreTrainedModel):
def __init__(self, config: BeitConfig) -> None:
super().__init__(config)
@ -1419,8 +1349,7 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
return loss
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
@ -1436,8 +1365,6 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
Returns:
Examples:
```python
@ -1514,11 +1441,10 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
BEiT backbone, to be used with frameworks like DETR and MaskFormer.
""",
BEIT_START_DOCSTRING,
"""
)
class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
def __init__(self, config):
@ -1554,8 +1480,7 @@ class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
pixel_values: Tensor,
@ -1563,9 +1488,7 @@ class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
Returns:
r"""
Examples:
```python

View File

@ -29,10 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask_for_sdpa,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
@ -46,42 +43,12 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
get_torch_version,
logging,
replace_return_docstrings,
)
from ...utils import ModelOutput, auto_docstring, get_torch_version, logging
from .configuration_bert import BertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
_CONFIG_FOR_DOC = "BertConfig"
# TokenClassification docstring
_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english"
_TOKEN_CLASS_EXPECTED_OUTPUT = (
"['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] "
)
_TOKEN_CLASS_EXPECTED_LOSS = 0.01
# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2"
_QA_EXPECTED_OUTPUT = "'a nice puppet'"
_QA_EXPECTED_LOSS = 7.41
_QA_TARGET_START_INDEX = 14
_QA_TARGET_END_INDEX = 15
# SequenceClassification docstring
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity"
_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
_SEQ_CLASS_EXPECTED_LOSS = 0.01
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model."""
@ -821,12 +788,8 @@ class BertPreTrainingHeads(nn.Module):
return prediction_scores, seq_relationship_score
@auto_docstring
class BertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertConfig
load_tf_weights = load_tf_weights_in_bert
base_model_prefix = "bert"
@ -886,79 +849,8 @@ class BertForPreTrainingOutput(ModelOutput):
attentions: Optional[Tuple[torch.FloatTensor]] = None
BERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
"""
@auto_docstring(
custom_intro="""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
@ -968,10 +860,15 @@ class BertModel(BertPreTrainedModel):
to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
"""
)
class BertModel(BertPreTrainedModel):
_no_split_modules = ["BertEmbeddings", "BertLayer"]
def __init__(self, config, add_pooling_layer=True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.config = config
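A minimal sketch of the decoder configuration described in the intro above, combined with the documented `add_pooling_layer` argument; the checkpoint is only used to load a config, the weights here are freshly initialized:

```python
from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained("google-bert/bert-base-uncased")
config.is_decoder = True           # causal self-attention
config.add_cross_attention = True  # insert cross-attention layers for Seq2Seq use

decoder = BertModel(config, add_pooling_layer=False)  # skip the pooling head on top of [CLS]
```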
@ -1000,12 +897,7 @@ class BertModel(BertPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
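A minimal sketch of the pruning interface this method backs, via the public `prune_heads` entry point; the layer and head indices are arbitrary illustrations:

```python
from transformers import BertModel

model = BertModel.from_pretrained("google-bert/bert-base-uncased")
model.prune_heads({0: [0, 1], 2: [2, 3]})  # {layer_index: [head_indices_to_prune]}
print(model.config.pruned_heads)           # pruned heads are recorded in the config
```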
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1022,26 +914,6 @@ class BertModel(BertPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
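A hedged sketch of the cache behaviour described in the docstring above: with `is_decoder=True` and `use_cache=True`, the returned `past_key_values` can be fed back so that the next call only needs the newest token.

```python
import torch
from transformers import AutoTokenizer, BertConfig, BertModel

name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(name)
config = BertConfig.from_pretrained(name, is_decoder=True)
model = BertModel.from_pretrained(name, config=config)

ids = tokenizer("Hello, my dog", return_tensors="pt").input_ids
out = model(input_ids=ids, use_cache=True)

next_token = ids[:, -1:]  # stand-in for a freshly generated token
out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
```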
@ -1169,12 +1041,11 @@ class BertModel(BertPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
BERT_START_DOCSTRING,
"""
)
class BertForPreTraining(BertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
@ -1195,8 +1066,7 @@ class BertForPreTraining(BertPreTrainedModel):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1212,20 +1082,16 @@ class BertForPreTraining(BertPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Example:
@ -1280,8 +1146,10 @@ class BertForPreTraining(BertPreTrainedModel):
)
@add_start_docstrings(
"""Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
@auto_docstring(
custom_intro="""
Bert Model with a `language modeling` head on top for CLM fine-tuning.
"""
)
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
@ -1305,12 +1173,7 @@ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1330,28 +1193,10 @@ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
**loss_kwargs,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
`[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
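A minimal sketch of the causal-LM usage this head is for: labels are simply the input ids, and the model shifts them internally for next-token prediction (the documented `-100` convention still applies to ignored positions).

```python
from transformers import AutoTokenizer, BertConfig, BertLMHeadModel

name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(name)
config = BertConfig.from_pretrained(name, is_decoder=True)
model = BertLMHeadModel.from_pretrained(name, config=config)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss, outputs.logits.shape)
```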
@ -1402,7 +1247,7 @@ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
return reordered_past
@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
@auto_docstring
class BertForMaskedLM(BertPreTrainedModel):
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
@ -1428,14 +1273,7 @@ class BertForMaskedLM(BertPreTrainedModel):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'paris'",
expected_loss=0.88,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1518,9 +1356,10 @@ class BertForMaskedLM(BertPreTrainedModel):
return False
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top.""",
BERT_START_DOCSTRING,
@auto_docstring(
custom_intro="""
Bert Model with a `next sentence prediction (classification)` head on top.
"""
)
class BertForNextSentencePrediction(BertPreTrainedModel):
def __init__(self, config):
@ -1532,8 +1371,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1556,8 +1394,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
Example:
```python
@ -1620,12 +1456,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
BERT_START_DOCSTRING,
"""
)
class BertForSequenceClassification(BertPreTrainedModel):
def __init__(self, config):
@ -1643,14 +1478,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1723,13 +1551,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
)
@add_start_docstrings(
"""
Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
BERT_START_DOCSTRING,
)
@auto_docstring
class BertForMultipleChoice(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -1744,12 +1566,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1764,6 +1581,30 @@ class BertForMultipleChoice(BertPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
@ -1817,13 +1658,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
)
@add_start_docstrings(
"""
Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BERT_START_DOCSTRING,
)
@auto_docstring
class BertForTokenClassification(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -1839,14 +1674,7 @@ class BertForTokenClassification(BertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1900,13 +1728,7 @@ class BertForTokenClassification(BertPreTrainedModel):
)
@add_start_docstrings(
"""
Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BERT_START_DOCSTRING,
)
@auto_docstring
class BertForQuestionAnswering(BertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -1918,16 +1740,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_QA,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=_QA_TARGET_START_INDEX,
qa_target_end_index=_QA_TARGET_END_INDEX,
expected_output=_QA_EXPECTED_OUTPUT,
expected_loss=_QA_EXPECTED_LOSS,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -1942,16 +1755,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
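A hedged sketch of the extractive-QA usage that `_CHECKPOINT_FOR_QA` and the `_QA_*` constants earlier in the file describe; the expected answer span is the one those constants record.

```python
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering

name = "deepset/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(name)
model = BertForQuestionAnswering.from_pretrained(name)

question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = int(outputs.start_logits.argmax())
end = int(outputs.end_logits.argmax())
answer = tokenizer.decode(inputs.input_ids[0, start : end + 1])  # expected: "a nice puppet"
```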

View File

@ -27,20 +27,14 @@ from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, Causa
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
auto_docstring,
logging,
replace_return_docstrings,
)
from .configuration_bert_generation import BertGenerationConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/bert_for_seq_generation_L-24_bbc_encoder"
_CONFIG_FOR_DOC = "BertGenerationConfig"
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BertGeneration
class BertGenerationSelfOutput(nn.Module):
@ -583,12 +577,8 @@ class BertGenerationEmbeddings(nn.Module):
return embeddings
@auto_docstring
class BertGenerationPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BertGenerationConfig
base_model_prefix = "bert"
supports_gradient_checkpointing = True
@ -612,67 +602,10 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
BERT_GENERATION_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BertGenerationConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BERT_GENERATION_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare BertGeneration model transformer outputting raw hidden-states without any specific head on top.",
BERT_GENERATION_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The bare BertGeneration model transformer outputting raw hidden-states without any specific head on top.
"""
)
class BertGenerationEncoder(BertGenerationPreTrainedModel):
"""
@ -715,12 +648,7 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -737,24 +665,6 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
return_dict: Optional[bool] = None,
**kwargs, # NOOP kwargs, for now
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: `1` for
tokens that are NOT MASKED, `0` for MASKED tokens.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -860,9 +770,10 @@ class BertGenerationOnlyLMHead(nn.Module):
self.bias = self.decoder.bias
@add_start_docstrings(
"""BertGeneration Model with a `language modeling` head on top for CLM fine-tuning.""",
BERT_GENERATION_START_DOCSTRING,
@auto_docstring(
custom_intro="""
BertGeneration Model with a `language modeling` head on top for CLM fine-tuning.
"""
)
class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
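A hedged sketch, following the published bert-generation recipe, of pairing the encoder and decoder classes in an [`EncoderDecoderModel`]; the checkpoint is the one referenced by `_CHECKPOINT_FOR_DOC` above, and the BOS/EOS ids are the standard BERT special-token ids.

```python
from transformers import BertGenerationDecoder, BertGenerationEncoder, EncoderDecoderModel

ckpt = "google/bert_for_seq_generation_L-24_bbc_encoder"
encoder = BertGenerationEncoder.from_pretrained(ckpt, bos_token_id=101, eos_token_id=102)
decoder = BertGenerationDecoder.from_pretrained(
    ckpt, add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
```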
@ -886,8 +797,7 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin):
self.lm_head.decoder = new_embeddings
self.lm_head.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -906,30 +816,10 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin):
**kwargs,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
`[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
Returns:
Example:

View File

@ -38,22 +38,12 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_big_bird import BigBirdConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base"
_CONFIG_FOR_DOC = "BigBirdConfig"
_TRIVIA_QA_MAPPING = {
"big_bird_attention": "attention/self",
@ -1743,12 +1733,8 @@ class BigBirdPreTrainingHeads(nn.Module):
return prediction_scores, seq_relationship_score
@auto_docstring
class BigBirdPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BigBirdConfig
load_tf_weights = load_tf_weights_in_big_bird
base_model_prefix = "bert"
@ -1773,67 +1759,6 @@ class BigBirdPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
BIG_BIRD_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIG_BIRD_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
@ -1903,10 +1828,7 @@ class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
attentions: Optional[Tuple[torch.FloatTensor]] = None
@add_start_docstrings(
"The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.",
BIG_BIRD_START_DOCSTRING,
)
@auto_docstring
class BigBirdModel(BigBirdPreTrainedModel):
"""
@ -1921,6 +1843,10 @@ class BigBirdModel(BigBirdPreTrainedModel):
"""
def __init__(self, config, add_pooling_layer=True):
r"""
add_pooling_layer (bool, *optional*, defaults to `True`):
Whether to add a pooling layer
"""
super().__init__(config)
self.attention_type = self.config.attention_type
self.config = config
@ -1964,12 +1890,7 @@ class BigBirdModel(BigBirdPreTrainedModel):
self.attention_type = value
self.encoder.set_attention_type(value)
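A minimal sketch of the attention-type switch implemented above; `"block_sparse"` is the default sparse attention and `"original_full"` falls back to full quadratic attention (useful for short sequences).

```python
from transformers import BigBirdConfig, BigBirdModel

config = BigBirdConfig.from_pretrained(
    "google/bigbird-roberta-base", attention_type="block_sparse", block_size=64, num_random_blocks=3
)
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base", config=config)

model.set_attention_type("original_full")  # switch to full attention at runtime
```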
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -1987,25 +1908,6 @@ class BigBirdModel(BigBirdPreTrainedModel):
return_dict: Optional[bool] = None,
**kwargs, # NOOP kwargs, for now
) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple[torch.FloatTensor]]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -2266,8 +2168,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2294,10 +2195,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
Example:
@ -2353,7 +2250,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
)
@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING)
@auto_docstring
class BigBirdForMaskedLM(BigBirdPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
@ -2379,8 +2276,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2402,8 +2298,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
@ -2496,8 +2390,10 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
return {"input_ids": input_ids, "attention_mask": attention_mask}
@add_start_docstrings(
"""BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING
@auto_docstring(
custom_intro="""
BigBird Model with a `language modeling` head on top for CLM fine-tuning.
"""
)
class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
@ -2521,12 +2417,7 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
self.cls.predictions.decoder = new_embeddings
self.cls.predictions.bias = new_embeddings.bias
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2546,27 +2437,10 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
**kwargs,
) -> Union[CausalLMOutputWithCrossAttentions, Tuple[torch.FloatTensor]]:
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
`[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -2646,12 +2520,11 @@ class BigBirdClassificationHead(nn.Module):
return x
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
BIG_BIRD_START_DOCSTRING,
"""
)
class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
def __init__(self, config):
@ -2664,8 +2537,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2685,8 +2557,6 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
Example:
```python
@ -2774,13 +2644,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
)
@add_start_docstrings(
"""
BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
BIG_BIRD_START_DOCSTRING,
)
@auto_docstring
class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -2792,14 +2656,7 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2814,6 +2671,30 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]:
r"""
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
@ -2867,13 +2748,7 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
)
@add_start_docstrings(
"""
BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BIG_BIRD_START_DOCSTRING,
)
@auto_docstring
class BigBirdForTokenClassification(BigBirdPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -2889,12 +2764,7 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2966,15 +2836,13 @@ class BigBirdForQuestionAnsweringHead(nn.Module):
return hidden_states
@add_start_docstrings(
"""
BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BIG_BIRD_START_DOCSTRING,
)
@auto_docstring
class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
def __init__(self, config, add_pooling_layer=False):
r"""
add_pooling_layer (bool, *optional*, defaults to `False`):
Whether to add a pooling layer
"""
super().__init__(config)
config.num_labels = 2
@ -2987,13 +2855,12 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=BigBirdForQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
question_lengths: Optional[torch.Tensor] = None,
question_lengths: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
@ -3005,16 +2872,8 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[BigBirdForQuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
Returns:
question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
The lengths of the questions in the batch.
Example:

View File

@ -36,21 +36,12 @@ from ...modeling_outputs import (
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils import auto_docstring, logging
from .configuration_bigbird_pegasus import BigBirdPegasusConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/bigbird-pegasus-large-arxiv"
_CONFIG_FOR_DOC = "BigBirdPegasusConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 1024]
@ -1564,6 +1555,7 @@ class BigBirdPegasusClassificationHead(nn.Module):
return hidden_states
@auto_docstring
class BigBirdPegasusPreTrainedModel(PreTrainedModel):
config_class = BigBirdPegasusConfig
base_model_prefix = "model"
@ -1594,149 +1586,6 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel):
return dummy_inputs
BIGBIRD_PEGASUS_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BigBirdPegasusConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r"""
Summarization example:
```python
>>> from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration
>>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> ARTICLE_TO_SUMMARIZE = (
... "The dominant sequence transduction models are based on complex recurrent or convolutional neural "
... "networks in an encoder-decoder configuration. The best performing models also connect the encoder "
... "and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, "
... "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
... "Experiments on two machine translation tasks show these models to be superior in quality "
... "while being more parallelizable and requiring significantly less time to train."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True)
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=15)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'dominant sequence models are based on recurrent or convolutional neural networks .'
```
"""
BIGBIRD_PEGASUS_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the `input_ids` to the right, following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read
[`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in
[the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
input (see `past_key_values`). This is useful if you want more control over how to convert
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
of `inputs_embeds`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BIGBIRD_PEGASUS_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@ -2293,10 +2142,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
)
@add_start_docstrings(
"The bare BigBirdPegasus Model outputting raw hidden-states without any specific head on top.",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
@auto_docstring
class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -2334,14 +2180,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
# Copied from transformers.models.bart.modeling_bart.BartModel.forward with Bart->BigBirdPegasus
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2360,6 +2199,23 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqModelOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the `input_ids` to the right, following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read
[`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in
[the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
"""
# different to other models, BigBirdPegasus automatically creates decoder_input_ids from
# input_ids if no decoder_input_ids are provided
if decoder_input_ids is None and decoder_inputs_embeds is None:
@ -2430,9 +2286,10 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
)
@add_start_docstrings(
"The BigBirdPegasus Model with a language modeling head. Can be used for summarization.",
BIGBIRD_PEGASUS_START_DOCSTRING,
@auto_docstring(
custom_intro="""
The BigBirdPegasus Model with a language modeling head. Can be used for summarization.
"""
)
# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS
class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin):
@ -2482,9 +2339,8 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
self.model._tie_weights()
self._tie_or_clone_weights(self.lm_head, self.model.shared)
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BIGBIRD_PEGASUS_GENERATION_EXAMPLE)
@auto_docstring
# Ignore copy
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2505,12 +2361,49 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the `input_ids` to the right, following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read
[`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in
[the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example summarization:
```python
>>> from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration
>>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
>>> ARTICLE_TO_SUMMARIZE = (
... "The dominant sequence transduction models are based on complex recurrent or convolutional neural "
... "networks in an encoder-decoder configuration. The best performing models also connect the encoder "
... "and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, "
... "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. "
... "Experiments on two machine translation tasks show these models to be superior in quality "
... "while being more parallelizable and requiring significantly less time to train."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True)
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=15)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'dominant sequence models are based on recurrent or convolutional neural networks .'
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -2581,12 +2474,11 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
return reordered_past
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
BigBirdPegasus model with a sequence classification head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
BIGBIRD_PEGASUS_START_DOCSTRING,
"""
)
class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -2604,13 +2496,7 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2630,6 +2516,21 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the `input_ids` to the right, following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read
[`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in
[the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
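An illustrative usage sketch for the `labels` argument documented above; `num_labels=2` is an assumption, so the classification head comes out randomly initialised on top of the pretrained checkpoint:

```python
# Sketch: single-label classification loss (labels in [0, config.num_labels - 1]).
import torch
from transformers import AutoTokenizer, BigBirdPegasusForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
model = BigBirdPegasusForSequenceClassification.from_pretrained(
    "google/bigbird-pegasus-large-arxiv", num_labels=2
)

enc = tokenizer("A document whose class we want to predict.", return_tensors="pt")
labels = torch.tensor([1])
outputs = model(**enc, labels=labels)
print(outputs.loss, outputs.logits.shape)  # cross-entropy loss and (1, 2) logits
```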
@ -2710,13 +2611,7 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
)
@add_start_docstrings(
"""
BigBirdPegasus Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BIGBIRD_PEGASUS_START_DOCSTRING,
)
@auto_docstring
class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
@ -2732,13 +2627,7 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@ -2759,14 +2648,21 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
are not taken into account for computing the loss.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the `input_ids` to the right, following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read
[`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in
[the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
@ -2882,7 +2778,7 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -2900,72 +2796,15 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Example:
View File
@ -33,9 +33,7 @@ from ...modeling_outputs import (
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
auto_docstring,
logging,
)
from .configuration_biogpt import BioGptConfig
@ -43,9 +41,6 @@ from .configuration_biogpt import BioGptConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"
# copied from transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding with OPT->BioGpt
# TODO @ArthurZucker bring copied from back
@ -444,12 +439,8 @@ class BioGptDecoderLayer(nn.Module):
return outputs
@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BioGptConfig
base_model_prefix = "biogpt"
supports_gradient_checkpointing = True
@ -472,76 +463,7 @@ class BioGptPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0)
BIOGPT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIOGPT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
BIOGPT_START_DOCSTRING,
)
@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
super().__init__(config)
@ -571,12 +493,7 @@ class BioGptModel(BioGptPreTrainedModel):
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -719,8 +636,10 @@ class BioGptModel(BioGptPreTrainedModel):
)
@add_start_docstrings(
"""BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
@auto_docstring(
custom_intro="""
BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
"""
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["output_projection.weight"]
@ -740,12 +659,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
def set_output_embeddings(self, new_embeddings):
self.output_projection = new_embeddings
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -815,13 +729,7 @@ class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
return reordered_past
@add_start_docstrings(
"""
BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BIOGPT_START_DOCSTRING,
)
@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@ -837,12 +745,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -907,8 +810,8 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
)
@add_start_docstrings(
"""
@auto_docstring(
custom_intro="""
The BioGpt Model transformer with a sequence classification head on top (linear layer).
[`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
@ -919,8 +822,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
BIOGPT_START_DOCSTRING,
"""
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
@ -932,12 +834,7 @@ class BioGptForSequenceClassification(BioGptPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
View File
@ -14,18 +14,12 @@
# limitations under the License.
"""Fast Image processor class for BiT."""
from ...image_processing_utils_fast import (
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
BaseImageProcessorFast,
)
from ...image_processing_utils_fast import BaseImageProcessorFast
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
from ...utils import add_start_docstrings
from ...utils import auto_docstring
@add_start_docstrings(
"Constructs a fast Bit image processor.",
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
)
@auto_docstring
class BitImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BICUBIC
image_mean = OPENAI_CLIP_MEAN
Some files were not shown because too many files have changed in this diff.