No more Tuple, List, Dict (#38797)

* No more Tuple, List, Dict

* make fixup

* More style fixes

* Docstring fixes with regex replacement

* Trigger tests

* Redo fixes after rebase

* Fix copies

* [test all]

* update

* [test all]

* update

* [test all]

* make style after rebase

* Patch the hf_argparser test

* Patch the hf_argparser test

* style fixes

* style fixes

* style fixes

* Fix docstrings in Cohere test

* [test all]

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Matt 2025-06-17 19:37:18 +01:00 committed by GitHub
parent a396f4324b
commit 508a704055
1291 changed files with 14906 additions and 14941 deletions
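For context, the change applied across these files is purely an annotation update: the deprecated `typing.Dict`, `typing.List` and `typing.Tuple` generics are replaced by the builtin `dict`, `list` and `tuple` (PEP 585, usable on Python 3.9+). A minimal sketch of the pattern, with a hypothetical helper name not taken from the diff:

```python
from typing import Optional  # Dict, List, Tuple no longer need to be imported

# Before: def lookup_ids(tokens: List[str], vocab: Dict[str, int]) -> Tuple[int, ...]:
def lookup_ids(tokens: list[str], vocab: dict[str, int], unk_id: Optional[int] = 0) -> tuple[int, ...]:
    # Builtin generics replace the typing aliases; runtime behaviour is unchanged.
    return tuple(vocab.get(t, unk_id) for t in tokens)


print(lookup_ids(["a", "b"], {"a": 1}))  # (1, 0)
```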

View File

@@ -28,7 +28,7 @@ class MetricsRecorder:
        self.commit_id = commit_id
        self.commit_msg = commit_msg

-    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+    def initialise_benchmark(self, metadata: dict[str, str]) -> int:
        """
        Creates a new benchmark, returns the benchmark id
        """
@@ -55,7 +55,7 @@ class MetricsRecorder:
            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
        )

-    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+    def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
        with self.conn.cursor() as cur:
            cur.execute(
                """
@@ -85,7 +85,7 @@ handler.setFormatter(formatter)
logger.addHandler(handler)

-def parse_arguments() -> Tuple[str, str, str, str]:
+def parse_arguments() -> tuple[str, str, str, str]:
    """
    Parse command line arguments for the benchmarking CLI.
    """

View File

@@ -278,7 +278,7 @@ Here's an example of a single value return:

```python
Returns:
-    `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+    `list[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```

Here's an example of a tuple return, comprising several objects:

View File

@@ -30,7 +30,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -571,7 +571,7 @@ The processor should call the appropriate modality-specific processors within it
    def __call__(
        self,
        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[YourModelProcessorKwargs],

View File

@@ -92,7 +92,7 @@ def custom_attention(
    a_new_kwargs = None,  # You can now add as many kwargs as you need
    another_new_kwargs = None,  # You can now add as many kwargs as you need
    **kwargs,  # You need to accept **kwargs as models will pass other args
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]
    ...  # do your magic!
    return attn_output, attn_weights  # attn_weights are optional here

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -152,7 +152,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
-| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |

## Pitfalls

View File

@@ -62,11 +62,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-    # encode(tokenize) each word from words (List[str])
-    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+    # encode(tokenize) each word from words (list[str])
+    input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

    # get the length of each box
-    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+    tokens_length_list: list[int] = [len(l) for l in input_ids_list]

    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)

View File

@@ -149,7 +149,7 @@ As a summary, consider the following table:
| **Description** | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as "stuff" (i.e. background things like trees and roads) in an image |
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

View File

@@ -83,7 +83,7 @@ def read_video_pyav(container, indices):
    Decode the video with PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
-        indices (`List[int]`): List of frame indices to decode.
+        indices (`list[int]`): List of frame indices to decode.

    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''

View File

@@ -216,12 +216,12 @@ class Olmo2Attention(OlmoAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -294,9 +294,9 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
@@ -494,7 +494,7 @@ class LlamaForCausalLM(nn.Module):
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
@@ -520,7 +520,7 @@ class NewModelForCausalLM(LlamaForCausalLM): | class LlamaForCausalLM(nn.M
| input_ids: torch.LongTensor = None,
| attention_mask: Optional[torch.Tensor] = None,
| position_ids: Optional[torch.LongTensor] = None,
-| past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None,
+| past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = |None,
| inputs_embeds: Optional[torch.FloatTensor] = None,
| labels: Optional[torch.LongTensor] = None,
| use_cache: Optional[bool] = None,

View File

@@ -170,7 +170,7 @@ Unlike other data collators, this specific data collator needs to apply a differ
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -243,7 +243,7 @@ and it uses the exact same dataset as an example. Apply some geometric and color
... )
```

-The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`,
+The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': list[Dict]}`,
where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example:

```py
@@ -252,9 +252,9 @@ The `image_processor` expects the annotations to be in the following format: `{'
...     Args:
...         image_id (str): image id. e.g. "0001"
-...         categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
-...         areas (List[float]): list of corresponding areas to provided bounding boxes
-...         bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
+...         categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
+...         areas (list[float]): list of corresponding areas to provided bounding boxes
+...         bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
...             ([center_x, center_y, width, height] in absolute coordinates)
...     Returns:
@@ -397,7 +397,7 @@ Intermediate format of boxes used for training is `YOLO` (normalized) but we wil
...     Args:
...         boxes (torch.Tensor): Bounding boxes in YOLO format
-...         image_size (Tuple[int, int]): Image size in format (height, width)
+...         image_size (tuple[int, int]): Image size in format (height, width)
...     Returns:
...         torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)

View File

@@ -408,7 +408,7 @@ instructs the model to ignore that part of the spectrogram when calculating the
... class TTSDataCollatorWithPadding:
...     processor: Any

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
...         label_features = [{"input_values": feature["labels"]} for feature in features]
...         speaker_features = [feature["speaker_embeddings"] for feature in features]

View File

@@ -48,7 +48,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -166,7 +166,7 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y
...         # requieren métodos de padding diferentes
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -56,7 +56,7 @@ Optunaに関しては、[object_parameter](https://optuna.readthedocs.io/en/stab
... }
```

-Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` に `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`List[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。
+Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` に `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`list[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。

```py

View File

@@ -57,11 +57,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-    # encode(tokenize) each word from words (List[str])
-    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+    # encode(tokenize) each word from words (list[str])
+    input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

    # get the length of each box
-    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+    tokens_length_list: list[int] = [len(l) for l in input_ids_list]

    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)

View File

@@ -149,7 +149,7 @@ DETR モデルをインスタンス化するには 3 つの方法があります
| **Description** | 画像内のオブジェクトの周囲の境界ボックスとクラス ラベルを予測する | 画像内のオブジェクト (つまりインスタンス) の周囲のマスクを予測する | 画像内のオブジェクト (インスタンス) と「もの」 (木や道路などの背景) の両方の周囲のマスクを予測します |
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

View File

@@ -170,7 +170,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です (
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -208,7 +208,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
... )
```

-`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': List[Dict]}`,
+`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': list[Dict]}`,
ここで、各辞書は COCO オブジェクトの注釈です。 1 つの例として、注釈を再フォーマットする関数を追加してみましょう。

```py

View File

@@ -408,7 +408,7 @@ Y 軸が反転され、スペクトログラムが上下逆に表示されます
... class TTSDataCollatorWithPadding:
...     processor: Any

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
...         label_features = [{"input_values": feature["labels"]} for feature in features]
...         speaker_features = [feature["speaker_embeddings"] for feature in features]

View File

@@ -46,7 +46,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -172,7 +172,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # 입력과 레이블을 분할합니다
...         # 길이가 다르고, 각각 다른 패딩 방법을 사용해야 하기 때문입니다
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -201,7 +201,7 @@ DatasetDict({
... )
```

-이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': List[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:
+이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': list[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:

```py
>>> def formatted_anns(image_id, category, area, bbox):

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -56,7 +56,7 @@ pip install optuna/sigopt/wandb/ray[tune]
... }
```

-Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`List[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:
+Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`list[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:

```py
>>> best_trials = trainer.hyperparameter_search(

View File

@@ -181,7 +181,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -47,7 +47,7 @@ def postprocess_qa_predictions(
    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
-        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+        predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
-        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+        predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):

View File

@@ -184,7 +184,7 @@ class Seq2SeqTrainer(Trainer):
        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (:obj:`dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
@@ -193,7 +193,7 @@ class Seq2SeqTrainer(Trainer):
                Whether or not to return the loss only.

        Return:
-            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+            tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
            A tuple with the loss, logits and labels (each being optional).
        """
        inputs = self._prepare_inputs(inputs)

View File

@@ -530,7 +530,7 @@ def calculate_rouge(
        on multi sentence summaries (CNN/DM dataset).

    Returns:
-        Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
+        dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
    """
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)

View File

@@ -91,11 +91,11 @@ class MyNewModelConfig(PretrainedConfig):
        `beta_slow` (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
-        `short_factor` (`List[float]`, *optional*):
+        `short_factor` (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
-        `long_factor` (`List[float]`, *optional*):
+        `long_factor` (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_new_imgproc_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union

import numpy as np
import torch
@@ -57,11 +57,11 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
-        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
@@ -74,13 +74,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
    def __init__(
        self,
        do_resize: bool = True,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
@@ -101,7 +101,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
    def resize(
        self,
        image: np.ndarray,
-        size: Dict[str, int],
+        size: dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -113,7 +113,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
-            size (`Dict[str, int]`):
+            size (`dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
@@ -151,13 +151,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
@@ -172,7 +172,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
@@ -185,9 +185,9 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.

View File

@@ -5,7 +5,7 @@
# modular_add_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that zamba does not have the `apply_rotary_pos_emb` function!
-from typing import Optional, Tuple
+from typing import Optional

import torch
from torch import nn
@@ -62,5 +62,5 @@ class TestAttention(nn.Module):
    def __init__(self):
        pass

-    def forward(self) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    def forward(self) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        _ = apply_rotary_pos_emb(1, 1, 1, 1)

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_dummy.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -210,12 +210,12 @@ class DummyAttention(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -278,9 +278,9 @@ class DummyDecoderLayer(GradientCheckpointingLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

View File

@@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
-from typing import Optional, Tuple, Union
+from typing import Optional, Union

import torch
from packaging import version
@@ -136,9 +136,9 @@ class DummyBertSelfAttention(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
@@ -245,9 +245,9 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
            logger.warning_once(
@@ -386,9 +386,9 @@ class DummyBertAttention(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
@@ -454,9 +454,9 @@ class DummyBertLayer(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
@@ -532,12 +532,12 @@ class DummyBertEncoder(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
-    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_from_uppercase_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -71,7 +71,7 @@ class FromUppercaseModelAttention(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape
@@ -153,7 +153,7 @@ class FromUppercaseModelEncoderLayer(nn.Module):
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_multimodal1.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -210,12 +210,12 @@ class Multimodal1TextAttention(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -278,9 +278,9 @@ class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

View File

@ -5,7 +5,7 @@
# modular_multimodal2.py file directly. One of our CI enforces this. # modular_multimodal2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -81,7 +81,7 @@ class Multimodal2VisionAttention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape batch_size, seq_length, embed_dim = hidden_states.shape
@ -177,7 +177,7 @@ class Multimodal2Attention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape batch_size, seq_length, embed_dim = hidden_states.shape
@ -244,7 +244,7 @@ class Multimodal2VisionEncoderLayer(nn.Module):
attention_mask: torch.Tensor, attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]: ) -> tuple[torch.FloatTensor]:
""" """
Args: Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the # the file from the modular. If any change should be done, please apply the change to the
# modular_my_new_model2.py file directly. One of our CI enforces this. # modular_my_new_model2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, List, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -208,12 +208,12 @@ class MyNewModel2Attention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)
@ -276,9 +276,9 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False, use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states residual = hidden_states
hidden_states = self.input_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states)
@ -469,7 +469,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
input_ids: Optional[torch.LongTensor] = None, input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,

View File

@ -5,7 +5,7 @@
# modular_new_task_model.py file directly. One of our CI enforces this. # modular_new_task_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from dataclasses import dataclass from dataclasses import dataclass
from typing import ClassVar, List, Optional, Tuple, Union from typing import ClassVar, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -88,9 +88,9 @@ class NewTaskModelCausalLMOutputWithPast(ModelOutput):
loss: Optional[torch.FloatTensor] = None loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[torch.FloatTensor] = None image_hidden_states: Optional[torch.FloatTensor] = None
@ -249,7 +249,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
pixel_values: torch.FloatTensor = None, pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
token_type_ids: Optional[torch.LongTensor] = None, token_type_ids: Optional[torch.LongTensor] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None,
@ -259,7 +259,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, NewTaskModelModelOutputWithPast]: ) -> Union[tuple, NewTaskModelModelOutputWithPast]:
r""" r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@ -442,7 +442,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
num_logits_to_keep: int = 0, num_logits_to_keep: int = 0,
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]: ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
r""" r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

View File

@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math import math
import os import os
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
@ -139,9 +139,9 @@ class RobertaSelfAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
mixed_query_layer = self.query(hidden_states) mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys # If this is instantiated as a cross-attention module, the keys
@ -248,9 +248,9 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
# TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
logger.warning_once( logger.warning_once(
@ -389,9 +389,9 @@ class RobertaAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
self_outputs = self.self( self_outputs = self.self(
hidden_states, hidden_states,
attention_mask, attention_mask,
@ -457,9 +457,9 @@ class RobertaLayer(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention( self_attention_outputs = self.attention(
@ -535,12 +535,12 @@ class RobertaEncoder(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False, output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True, return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@ -903,12 +903,12 @@ class RobertaModel(RobertaPreTrainedModel):
inputs_embeds: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[list[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r""" r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the # the file from the modular. If any change should be done, please apply the change to the
# modular_super.py file directly. One of our CI enforces this. # modular_super.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -211,12 +211,12 @@ class SuperAttention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)
@ -279,9 +279,9 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False, use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states residual = hidden_states
hidden_states = self.input_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states)

View File

@ -5,7 +5,7 @@
# modular_switch_function.py file directly. One of our CI enforces this. # modular_switch_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that llama and cohere have different definitions for rotate_half # Note that llama and cohere have different definitions for rotate_half
from typing import Callable, Optional, Tuple from typing import Callable, Optional
import torch import torch
from torch import nn from torch import nn
@ -123,12 +123,12 @@ class SwitchFunctionAttention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)

View File

@ -7,7 +7,7 @@
import math import math
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@ -43,7 +43,7 @@ class MultiScaleDeformableAttention(nn.Module):
self, self,
value: Tensor, value: Tensor,
value_spatial_shapes: Tensor, value_spatial_shapes: Tensor,
value_spatial_shapes_list: List[Tuple], value_spatial_shapes_list: list[tuple],
level_start_index: Tensor, level_start_index: Tensor,
sampling_locations: Tensor, sampling_locations: Tensor,
attention_weights: Tensor, attention_weights: Tensor,
@ -124,9 +124,9 @@ class TestDetrDecoderOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[tuple[torch.FloatTensor]] = None
@dataclass @dataclass
@ -177,12 +177,12 @@ class TestDetrModelOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
@ -557,7 +557,7 @@ class TestDetrMultiheadAttention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None, position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False, output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, target_len, embed_dim = hidden_states.size() batch_size, target_len, embed_dim = hidden_states.size()
@ -1431,7 +1431,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
Args: Args:
enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
Returns: Returns:
`tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@ -1499,7 +1499,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], TestDetrModelOutput]: ) -> Union[tuple[torch.FloatTensor], TestDetrModelOutput]:
r""" r"""
Returns: Returns:

View File

@ -33,7 +33,7 @@ import logging
import os import os
from collections.abc import Iterable from collections.abc import Iterable
from contextlib import nullcontext from contextlib import nullcontext
from typing import Dict, Optional from typing import Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@ -589,7 +589,7 @@ class ContextParallelCollator:
def __init__(self, cp_mesh: Optional[DeviceMesh] = None): def __init__(self, cp_mesh: Optional[DeviceMesh] = None):
self.cp_mesh = cp_mesh self.cp_mesh = cp_mesh
def __call__(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: def __call__(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
batch = default_collate(batch) batch = default_collate(batch)
if self.cp_mesh is not None and self.cp_mesh.size() > 1: if self.cp_mesh is not None and self.cp_mesh.size() > 1:
# Get sequence length from the input batch # Get sequence length from the input batch
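For context, the collator fragment above only needs two pieces: PyTorch's `default_collate` and, when a device mesh is present, a sharding step along the sequence dimension. A reduced sketch of just the collate part (the class name is illustrative and the context-parallel sharding is deliberately elided):

```python
import torch
from torch.utils.data import default_collate


class PassthroughCollator:
    """Reduced sketch: collate only, no context-parallel sharding."""

    def __call__(self, batch: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
        # default_collate stacks per-sample dicts into a single dict of batched tensors;
        # the real collator would then shard the sequence dimension across the CP mesh.
        return default_collate(batch)
```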

View File

@ -66,9 +66,9 @@ def format_image_annotations_as_coco(
Args: Args:
image_id (str): image id. e.g. "0001" image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates) ([center_x, center_y, width, height] in absolute coordinates)
Returns: Returns:
@ -101,7 +101,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args: Args:
boxes (torch.Tensor): Bounding boxes in YOLO format boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width) image_size (tuple[int, int]): Image size in format (height, width)
Returns: Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
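The conversion described in that docstring is the standard center-to-corners transform followed by rescaling to absolute pixel coordinates. A self-contained sketch, assuming normalized YOLO boxes (the helper name is illustrative):

```python
import torch


def yolo_to_pascal_voc(boxes: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
    """Convert normalized (center_x, center_y, width, height) boxes to absolute corner coordinates."""
    height, width = image_size
    cx, cy, w, h = boxes.unbind(-1)
    corners = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)
    # Scale from normalized [0, 1] coordinates to pixels.
    return corners * torch.tensor([width, height, width, height], dtype=corners.dtype)
```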

View File

@ -67,9 +67,9 @@ def format_image_annotations_as_coco(
Args: Args:
image_id (str): image id. e.g. "0001" image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates) ([center_x, center_y, width, height] in absolute coordinates)
Returns: Returns:
@ -103,7 +103,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args: Args:
boxes (torch.Tensor): Bounding boxes in YOLO format boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width) image_size (tuple[int, int]): Image size in format (height, width)
Returns: Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)

View File

@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
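The `predictions` tuple documented above pairs start and end logits per feature, and the heart of the post-processing is scoring candidate (start, end) spans. A stripped-down sketch for a single feature, ignoring offset mapping, n-best lists and null answers:

```python
import numpy as np


def best_span(start_logits: np.ndarray, end_logits: np.ndarray, max_answer_length: int = 30) -> tuple[int, int]:
    """Return the (start, end) token indices with the highest combined logit score."""
    best, best_score = (0, 0), -np.inf
    for start in np.argsort(start_logits)[-20:]:      # top-20 start candidates
        for end in np.argsort(end_logits)[-20:]:      # top-20 end candidates
            if start <= end < start + max_answer_length:
                score = start_logits[start] + end_logits[end]
                if score > best_score:
                    best_score, best = score, (int(start), int(end))
    return best
```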

View File

@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):

View File

@ -22,7 +22,8 @@ line-length = 119
ignore = ["C901", "E501", "E741", "F402", "F823" ] ignore = ["C901", "E501", "E741", "F402", "F823" ]
# RUF013: Checks for the use of implicit Optional # RUF013: Checks for the use of implicit Optional
# in type annotations when the default parameter value is None. # in type annotations when the default parameter value is None.
select = ["C", "E", "F", "I", "W", "RUF013"] select = ["C", "E", "F", "I", "W", "RUF013", "UP006"]
extend-safe-fixes = ["UP006"]
# Ignore import violations in all `__init__.py` files. # Ignore import violations in all `__init__.py` files.
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
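UP006 is ruff's pyupgrade rule for non-PEP 585 annotations; listing it under `extend-safe-fixes` lets `ruff check --fix` rewrite the old `typing` aliases automatically. Roughly what the autofix does (illustrative snippet, not taken from the repo):

```python
# Before the fix (flagged by UP006):
#     from typing import Dict, List
#     def count_tokens(texts: List[str]) -> Dict[str, int]: ...
#
# After `ruff check --fix`:
def count_tokens(texts: list[str]) -> dict[str, int]:
    counts: dict[str, int] = {}
    for text in texts:
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
```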

View File

@ -19,7 +19,7 @@ and remove unnecessary dependencies.
import os import os
import warnings import warnings
from io import BytesIO from io import BytesIO
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import numpy as np import numpy as np
import requests import requests
@ -70,7 +70,7 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
AudioInput = Union[ AudioInput = Union[
np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821 np.ndarray, "torch.Tensor", list[np.ndarray], tuple[np.ndarray], list["torch.Tensor"], tuple["torch.Tensor"] # noqa: F821
] ]
@ -88,7 +88,7 @@ def make_list_of_audio(
""" """
Ensure that the output is a list of audio. Ensure that the output is a list of audio.
Args: Args:
audio (`Union[List[AudioInput], AudioInput]`): audio (`Union[list[AudioInput], AudioInput]`):
The input audio. The input audio.
Returns: Returns:
list: A list of audio. list: A list of audio.
@ -246,7 +246,7 @@ def chroma_filter_bank(
Tuning deviation from A440 in fractions of a chroma bin. Tuning deviation from A440 in fractions of a chroma bin.
power (`float`, *optional*, defaults to 2.0): power (`float`, *optional*, defaults to 2.0):
If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm. If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`): weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
the second element being the Gaussian half-width. the second element being the Gaussian half-width.
start_at_c_chroma (`float`, *optional*, defaults to `True`): start_at_c_chroma (`float`, *optional*, defaults to `True`):
@ -733,7 +733,7 @@ def spectrogram_batch(
Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
Args: Args:
waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`): waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
The list of input waveforms, each a single-channel (mono) signal. The list of input waveforms, each a single-channel (mono) signal.
window (`np.ndarray` of shape `(frame_length,)`): window (`np.ndarray` of shape `(frame_length,)`):
The windowing function to apply, including zero-padding if necessary. The windowing function to apply, including zero-padding if necessary.
@ -775,7 +775,7 @@ def spectrogram_batch(
Data type of the output spectrogram. Data type of the output spectrogram.
Returns: Returns:
List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
""" """
window_length = len(window) window_length = len(window)

View File

@ -4,7 +4,7 @@ import json
import os import os
from collections.abc import Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Optional, Union
import torch import torch
from packaging import version from packaging import version
@ -28,7 +28,7 @@ def _static_cache_update(
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the static cache tensors in place. Updates the static cache tensors in place.
@ -41,7 +41,7 @@ def _static_cache_update(
If None, the entire cache is overwritten (prefill). If None, the entire cache is overwritten (prefill).
Returns: Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place). tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place).
""" """
if cache_position is None: if cache_position is None:
# Prefill phase where seq_len potentially equals max_cache_len. Directly copy. # Prefill phase where seq_len potentially equals max_cache_len. Directly copy.
@ -67,7 +67,7 @@ def _sliding_cache_update(
value_states: torch.Tensor, value_states: torch.Tensor,
cache_position: torch.LongTensor, cache_position: torch.LongTensor,
max_cache_len: int, max_cache_len: int,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the sliding window cache tensors, returning the potentially modified tensors. Updates the sliding window cache tensors, returning the potentially modified tensors.
@ -80,7 +80,7 @@ def _sliding_cache_update(
max_cache_len (`int`): The maximum length of the sliding window cache. max_cache_len (`int`): The maximum length of the sliding window cache.
Returns: Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update. tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update.
For prefill > window, these are the full input states. For prefill > window, these are the full input states.
Otherwise, they are the updated cache tensors. Otherwise, they are the updated cache tensors.
""" """
@ -134,8 +134,8 @@ class Cache:
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -146,7 +146,7 @@ class Cache:
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. These are specific to each subclass and allow new types of Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
cache to be created. cache to be created.
@ -222,7 +222,7 @@ class CacheConfig:
""" """
Constructs a CacheConfig instance from a dictionary of parameters. Constructs a CacheConfig instance from a dictionary of parameters.
Args: Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters. config_dict (dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values. **kwargs: Additional keyword arguments to override dictionary values.
Returns: Returns:
@ -257,10 +257,10 @@ class CacheConfig:
writer.write(json_string) writer.write(json_string)
# Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Returns: Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
return copy.deepcopy(self.__dict__) return copy.deepcopy(self.__dict__)
@ -289,11 +289,11 @@ class CacheConfig:
returning all the unused kwargs. returning all the unused kwargs.
Args: Args:
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Dictionary of attributes to tentatively update this class. Dictionary of attributes to tentatively update this class.
Returns: Returns:
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
""" """
to_remove = [] to_remove = []
for key, value in kwargs.items(): for key, value in kwargs.items():
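A small usage sketch of the `to_dict`/`update` helpers documented above, using the quantized config as the concrete subclass; the constructor keywords are assumed from the attributes read elsewhere in this diff (`nbits`, `residual_length`):

```python
from transformers import QuantizedCacheConfig

config = QuantizedCacheConfig(nbits=4, residual_length=128)
as_dict = config.to_dict()                         # deep copy of the instance attributes
unused = config.update(residual_length=64, not_an_attribute=1)
print(config.residual_length, unused)              # 64 {'not_an_attribute': 1}
```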
@ -473,8 +473,8 @@ class DynamicCache(Cache):
def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None: def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__() super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
# `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121 # `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121
# and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the # and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the
@ -487,7 +487,7 @@ class DynamicCache(Cache):
self.key_cache.append(key_states) self.key_cache.append(key_states)
self.value_cache.append(value_states) self.value_cache.append(value_states)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length. sequence length.
@ -517,8 +517,8 @@ class DynamicCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -529,7 +529,7 @@ class DynamicCache(Cache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
Return: Return:
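A minimal usage sketch of the `update` API documented above (the shapes are illustrative: batch, num_heads, seq_len, head_dim):

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
key_states = torch.randn(1, 8, 5, 64)
value_states = torch.randn(1, 8, 5, 64)

keys, values = cache.update(key_states, value_states, layer_idx=0)
print(keys.shape)               # torch.Size([1, 8, 5, 64])
print(cache.get_seq_length(0))  # 5
```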
@ -574,7 +574,7 @@ class DynamicCache(Cache):
"""Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length.""" """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
return None return None
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
"""Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for
backward compatibility.""" backward compatibility."""
legacy_cache = () legacy_cache = ()
@ -584,7 +584,7 @@ class DynamicCache(Cache):
@classmethod @classmethod
def from_legacy_cache( def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor, torch.FloatTensor]]] = None cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor, torch.FloatTensor]]] = None
) -> "DynamicCache": ) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
backward compatibility.""" backward compatibility."""
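The legacy format referenced here is the old tuple-of-tuples cache, one `(key, value)` pair per layer; `to_legacy_cache` and `from_legacy_cache` round-trip between the two representations. A short sketch:

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(1, 8, 5, 64), torch.randn(1, 8, 5, 64), layer_idx=0)

legacy = cache.to_legacy_cache()                # tuple of (key, value) pairs, one per layer
restored = DynamicCache.from_legacy_cache(legacy)
print(len(legacy), restored.get_seq_length(0))  # 1 5
```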
@ -611,7 +611,7 @@ class DynamicCache(Cache):
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]: def batch_split(self, full_batch_size: int, split_size: int) -> list["DynamicCache"]:
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`""" `_split_model_inputs()` in `generation.utils`"""
out = [] out = []
@ -624,7 +624,7 @@ class DynamicCache(Cache):
return out return out
@classmethod @classmethod
def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache": def from_batch_splits(cls, splits: list["DynamicCache"]) -> "DynamicCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`""" `generation.utils`"""
cache = cls() cache = cls()
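`batch_split` and `from_batch_splits` are inverses used by the generation utilities; splitting along the batch dimension and re-merging is expected to recover the original cache. A compact sketch continuing the example above:

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(4, 8, 5, 64), torch.randn(4, 8, 5, 64), layer_idx=0)

splits = cache.batch_split(full_batch_size=4, split_size=2)   # -> list[DynamicCache] of length 2
merged = DynamicCache.from_batch_splits(splits)
print(len(splits), merged.key_cache[0].shape)                 # 2 torch.Size([4, 8, 5, 64])
```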
@ -762,7 +762,7 @@ class OffloadedCache(DynamicCache):
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True) self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True) self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer." "Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self): if layer_idx < len(self):
# Evict the previous layer if necessary # Evict the previous layer if necessary
@ -799,8 +799,8 @@ class OffloadedCache(DynamicCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters: Parameters:
@ -810,7 +810,7 @@ class OffloadedCache(DynamicCache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`. Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`.
Return: Return:
A tuple containing the updated key and value states. A tuple containing the updated key and value states.
@ -857,8 +857,8 @@ class QuantizedCache(DynamicCache):
def __init__(self, cache_config: QuantizedCacheConfig) -> None: def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__() super().__init__()
self._quantized_key_cache: List[torch.Tensor] = [] self._quantized_key_cache: list[torch.Tensor] = []
self._quantized_value_cache: List[torch.Tensor] = [] self._quantized_value_cache: list[torch.Tensor] = []
self.nbits = cache_config.nbits self.nbits = cache_config.nbits
self.residual_length = cache_config.residual_length self.residual_length = cache_config.residual_length
@ -875,8 +875,8 @@ class QuantizedCache(DynamicCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# Update the number of seen tokens # Update the number of seen tokens
if layer_idx == 0: if layer_idx == 0:
self._seen_tokens += key_states.shape[-2] self._seen_tokens += key_states.shape[-2]
@ -1094,7 +1094,7 @@ class StaticCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1129,7 +1129,7 @@ class StaticCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.max_batch_size = max_batch_size self.max_batch_size = max_batch_size
@ -1145,8 +1145,8 @@ class StaticCache(Cache):
else config.num_key_value_heads else config.num_key_value_heads
) )
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
# Note: There will be significant perf decrease if switching to use 5D tensors instead. # Note: There will be significant perf decrease if switching to use 5D tensors instead.
cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
device = torch.device(device) if device is not None else None device = torch.device(device) if device is not None else None
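Based on the constructor keywords visible in this hunk, a `StaticCache` pre-allocates one key and one value tensor per layer with shape `(max_batch_size, num_key_value_heads, max_cache_len, head_dim)`. A small sketch using a toy config (the keyword names are assumed from the hunk above):

```python
import torch
from transformers import LlamaConfig, StaticCache

# Tiny illustrative config: head_dim = hidden_size / num_attention_heads = 32.
config = LlamaConfig(num_hidden_layers=2, hidden_size=256, num_attention_heads=8, num_key_value_heads=8)
cache = StaticCache(config=config, max_batch_size=1, max_cache_len=256, device="cpu", dtype=torch.float16)

print(len(cache.key_cache))      # one pre-allocated tensor per layer -> 2
print(cache.key_cache[0].shape)  # torch.Size([1, 8, 256, 32])
```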
@ -1169,8 +1169,8 @@ class StaticCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device. It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -1182,7 +1182,7 @@ class StaticCache(Cache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
to know how where to write in the cache. to know how where to write in the cache.
@ -1260,7 +1260,7 @@ class SlidingWindowCache(StaticCache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1294,7 +1294,7 @@ class SlidingWindowCache(StaticCache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
raise ValueError( raise ValueError(
@ -1318,8 +1318,8 @@ class SlidingWindowCache(StaticCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
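For orientation, the sliding-window behaviour this class implements can be illustrated with a toy helper: once more tokens arrive than the window holds, only the most recent positions are kept. This is a deliberate simplification, not the in-place roll/scatter logic of `SlidingWindowCache` itself:

```python
# Toy illustration of sliding-window semantics (simplified; the real cache
# updates a fixed-size buffer in place instead of concatenating).
import torch

def sliding_update(cache: torch.Tensor, new_kv: torch.Tensor, window: int) -> torch.Tensor:
    # cache, new_kv: (batch, heads, seq, head_dim); keep only the last `window` positions
    full = torch.cat([cache, new_kv], dim=-2)
    return full[..., -window:, :]

cache = torch.zeros(1, 2, 4, 8)
cache = sliding_update(cache, torch.randn(1, 2, 3, 8), window=4)  # shape stays (1, 2, 4, 8)
```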
@ -1400,7 +1400,7 @@ class EncoderDecoderCache(Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)): for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0) self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length. sequence length.
@ -1422,7 +1422,7 @@ class EncoderDecoderCache(Cache):
""" """
return len(self.self_attention_cache) return len(self.self_attention_cache)
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor]]: def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]:
"""Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format."""
legacy_cache = () legacy_cache = ()
if len(self.cross_attention_cache) > 0: if len(self.cross_attention_cache) > 0:
@ -1436,7 +1436,7 @@ class EncoderDecoderCache(Cache):
@classmethod @classmethod
def from_legacy_cache( def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
) -> "EncoderDecoderCache": ) -> "EncoderDecoderCache":
"""Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`."""
cache = cls( cache = cls(
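The legacy format converted here is a per-layer tuple of tensors; for encoder-decoder models each layer carries self-attention and cross-attention key/value pairs. A hedged sketch of the round trip (shapes are arbitrary):

```python
import torch
from transformers import EncoderDecoderCache

# One entry per layer: (self_attn_key, self_attn_value, cross_attn_key, cross_attn_value)
legacy = tuple(tuple(torch.zeros(1, 2, 3, 4) for _ in range(4)) for _ in range(2))

cache = EncoderDecoderCache.from_legacy_cache(legacy)
k, v, cross_k, cross_v = cache[0]    # backwards-compatible per-layer indexing
roundtrip = cache.to_legacy_cache()  # back to the tuple-of-tuples layout
```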
@ -1495,7 +1495,7 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__) self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length) self.self_attention_cache.crop(maximum_length)
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": def batch_split(self, full_batch_size: int, split_size: int) -> "list[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`""" `_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__) self.check_dynamic_cache(self.batch_split.__name__)
@ -1508,7 +1508,7 @@ class EncoderDecoderCache(Cache):
return out return out
@classmethod @classmethod
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": def from_batch_splits(cls, splits: list["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`""" `generation.utils`"""
self_attention_cache = DynamicCache() self_attention_cache = DynamicCache()
@ -1569,7 +1569,7 @@ class HybridCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.float32`): dtype (torch.dtype, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1603,7 +1603,7 @@ class HybridCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1634,8 +1634,8 @@ class HybridCache(Cache):
else: else:
self.is_sliding = [False] * config.num_hidden_layers self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim) sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim)
self.sliding_window = min(config.sliding_window, max_cache_len) self.sliding_window = min(config.sliding_window, max_cache_len)
@ -1660,8 +1660,8 @@ class HybridCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
@ -1757,7 +1757,7 @@ class HybridChunkedCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`): dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1791,7 +1791,7 @@ class HybridChunkedCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1811,8 +1811,8 @@ class HybridChunkedCache(Cache):
else: else:
self.is_sliding = [False] * config.num_hidden_layers self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
self.cumulative_length = [0 for _ in range(config.num_hidden_layers)] self.cumulative_length = [0 for _ in range(config.num_hidden_layers)]
def initialise_cache_layer(self, layer_idx, key_states): def initialise_cache_layer(self, layer_idx, key_states):
@ -1880,8 +1880,8 @@ class HybridChunkedCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
@ -1968,7 +1968,7 @@ class OffloadedHybridCache(HybridChunkedCache):
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
offload_device: Union[str, torch.device] = torch.device("cpu"), offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
): ):
super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map) super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map)
@ -2121,8 +2121,8 @@ class MambaCache:
self.ssm_state_size = config.state_size self.ssm_state_size = config.state_size
self.conv_kernel_size = config.conv_kernel self.conv_kernel_size = config.conv_kernel
self.conv_states: List[torch.Tensor] = [] self.conv_states: list[torch.Tensor] = []
self.ssm_states: List[torch.Tensor] = [] self.ssm_states: list[torch.Tensor] = []
device = torch.device(device) if device is not None else None device = torch.device(device) if device is not None else None
for _ in range(config.num_hidden_layers): for _ in range(config.num_hidden_layers):
conv_state: torch.Tensor = torch.zeros( conv_state: torch.Tensor = torch.zeros(
@ -2193,7 +2193,7 @@ class OffloadedStaticCache(StaticCache):
The default `dtype` to use when initializing the cache. The default `dtype` to use when initializing the cache.
offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`): offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`):
The device to offload to. Defaults to CPU. The device to offload to. Defaults to CPU.
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*): layer_device_map (`dict[int, Union[str, torch.device, int]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -2227,7 +2227,7 @@ class OffloadedStaticCache(StaticCache):
device: Union[str, torch.device], device: Union[str, torch.device],
dtype: Optional[torch.dtype] = None, dtype: Optional[torch.dtype] = None,
offload_device: Union[str, torch.device] = torch.device("cpu"), offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super(Cache, self).__init__() super(Cache, self).__init__()
@ -2255,8 +2255,8 @@ class OffloadedStaticCache(StaticCache):
cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim) cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim)
# Create offloaded CPU tensors. # Create offloaded CPU tensors.
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
for i in range(config.num_hidden_layers): for i in range(config.num_hidden_layers):
# First layer is always on-device. # First layer is always on-device.
@ -2268,8 +2268,8 @@ class OffloadedStaticCache(StaticCache):
self.value_cache.append(value_cache) self.value_cache.append(value_cache)
# Create device tensors. # Create device tensors.
self._device_key_cache: List[torch.Tensor] = [] self._device_key_cache: list[torch.Tensor] = []
self._device_value_cache: List[torch.Tensor] = [] self._device_value_cache: list[torch.Tensor] = []
for i in range(2): for i in range(2):
key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device) key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device)
@ -2289,8 +2289,8 @@ class OffloadedStaticCache(StaticCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device. It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -2302,7 +2302,7 @@ class OffloadedStaticCache(StaticCache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, *optional*): cache_kwargs (`dict[str, Any]`, *optional*):
Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the
`cache_position` input to know where to write in the cache. `cache_position` input to know where to write in the cache.
@ -2401,13 +2401,13 @@ class OffloadedStaticCache(StaticCache):
return self._seen_tokens return self._seen_tokens
def _create_key_value_cache_tensors( def _create_key_value_cache_tensors(
self, shape: Tuple[int, ...], device: torch.device self, shape: tuple[int, ...], device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
"""Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static """Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static
addresses for non-CPU tensors. addresses for non-CPU tensors.
Args: Args:
shape (`Tuple[int, ...]`): Shape. shape (`tuple[int, ...]`): Shape.
device (`torch.device`): Device. device (`torch.device`): Device.
Returns: Returns:
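The `_create_key_value_cache_tensors` docstring is cut off at this file boundary, but the behaviour it names (pinning CPU tensors so host-to-device copies can run asynchronously) can be sketched with plain `torch`; the shape is a placeholder:

```python
# Hedged sketch: a CPU-side cache tensor is pinned (when an accelerator is
# available) so later non_blocking copies can overlap with compute.
import torch

shape = (1, 8, 128, 64)
cpu_key_cache = torch.zeros(shape, pin_memory=torch.cuda.is_available())

if torch.cuda.is_available():
    device_key_cache = torch.zeros(shape, device="cuda")
    device_key_cache.copy_(cpu_key_cache, non_blocking=True)  # asynchronous H2D copy
```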

View File

@ -23,7 +23,7 @@ from datetime import date
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from re import Pattern from re import Pattern
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Optional, Union
import yaml import yaml
@ -148,7 +148,7 @@ def find_indent(line: str) -> int:
return len(search.groups()[0]) return len(search.groups()[0])
def parse_module_content(content: str) -> List[str]: def parse_module_content(content: str) -> list[str]:
""" """
Parse the content of a module into the list of objects it defines. Parse the content of a module into the list of objects it defines.
@ -156,7 +156,7 @@ def parse_module_content(content: str) -> List[str]:
content (`str`): The content to parse content (`str`): The content to parse
Returns: Returns:
`List[str]`: The list of objects defined in the module. `list[str]`: The list of objects defined in the module.
""" """
objects = [] objects = []
current_object = [] current_object = []
@ -336,7 +336,7 @@ def add_content_to_file(
def replace_model_patterns( def replace_model_patterns(
text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns
) -> Tuple[str, str]: ) -> tuple[str, str]:
""" """
Replace all patterns present in a given text. Replace all patterns present in a given text.
@ -414,10 +414,10 @@ def simplify_replacements(replacements):
"BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed. "BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed.
Args: Args:
replacements (`List[Tuple[str, str]]`): List of patterns (old, new) replacements (`list[tuple[str, str]]`): List of patterns (old, new)
Returns: Returns:
`List[Tuple[str, str]]`: The list of patterns simplified. `list[tuple[str, str]]`: The list of patterns simplified.
""" """
if len(replacements) <= 1: if len(replacements) <= 1:
# Nothing to simplify # Nothing to simplify
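Using the names from the docstring above, a quick illustration of what "simplified" means; the result shown is inferred from that docstring rather than from running the function:

```python
# Illustration only: the second pair is implied by the first, so it can be dropped.
replacements = [("Bert", "BertNew"), ("BertConfig", "BertNewConfig")]
simplified = [("Bert", "BertNew")]  # what simplify_replacements is described as returning
```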
@ -519,7 +519,7 @@ def duplicate_module(
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
dest_file: Optional[str] = None, dest_file: Optional[str] = None,
add_copied_from: bool = True, add_copied_from: bool = True,
attrs_to_remove: Optional[List[str]] = None, attrs_to_remove: Optional[list[str]] = None,
): ):
""" """
Create a new module from an existing one, adapting all function and class names from old patterns to new ones. Create a new module from an existing one, adapting all function and class names from old patterns to new ones.
@ -585,17 +585,17 @@ def duplicate_module(
def filter_framework_files( def filter_framework_files(
files: List[Union[str, os.PathLike]], frameworks: Optional[List[str]] = None files: list[Union[str, os.PathLike]], frameworks: Optional[list[str]] = None
) -> List[Union[str, os.PathLike]]: ) -> list[Union[str, os.PathLike]]:
""" """
Filter a list of files to only keep the ones corresponding to a list of frameworks. Filter a list of files to only keep the ones corresponding to a list of frameworks.
Args: Args:
files (`List[Union[str, os.PathLike]]`): The list of files to filter. files (`list[Union[str, os.PathLike]]`): The list of files to filter.
frameworks (`List[str]`, *optional*): The list of allowed frameworks. frameworks (`list[str]`, *optional*): The list of allowed frameworks.
Returns: Returns:
`List[Union[str, os.PathLike]]`: The list of filtered files. `list[Union[str, os.PathLike]]`: The list of filtered files.
""" """
if frameworks is None: if frameworks is None:
frameworks = get_default_frameworks() frameworks = get_default_frameworks()
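A small usage sketch for `filter_framework_files`; the file names are invented and the expected result is read off the docstring rather than executed:

```python
# Hypothetical file list: only the PyTorch-specific file plus framework-agnostic
# files should survive filtering with frameworks=["pt"].
files = [
    "modeling_bert.py",       # pt
    "modeling_tf_bert.py",    # tf
    "modeling_flax_bert.py",  # flax
    "configuration_bert.py",  # framework-agnostic
]
# filter_framework_files(files, frameworks=["pt"])
# -> ["modeling_bert.py", "configuration_bert.py"] (ordering may differ)
```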
@ -617,17 +617,17 @@ def filter_framework_files(
return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others
def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, Union[Path, List[Path]]]: def get_model_files(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, Union[Path, list[Path]]]:
""" """
Retrieves all the files associated to a model. Retrieves all the files associated to a model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the model files corresponding to the passed frameworks. If passed, will only keep the model files corresponding to the passed frameworks.
Returns: Returns:
`Dict[str, Union[Path, List[Path]]]`: A dictionary with the following keys: `dict[str, Union[Path, list[Path]]]`: A dictionary with the following keys:
- **doc_file** -- The documentation file for the model. - **doc_file** -- The documentation file for the model.
- **model_files** -- All the files in the model module. - **model_files** -- All the files in the model module.
- **test_files** -- The test files for the model. - **test_files** -- The test files for the model.
@ -663,14 +663,14 @@ _re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", fla
def find_base_model_checkpoint( def find_base_model_checkpoint(
model_type: str, model_files: Optional[Dict[str, Union[Path, List[Path]]]] = None model_type: str, model_files: Optional[dict[str, Union[Path, list[Path]]]] = None
) -> str: ) -> str:
""" """
Finds the model checkpoint used in the docstrings for a given model. Finds the model checkpoint used in the docstrings for a given model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
model_files (`Dict[str, Union[Path, List[Path]]]`, *optional*): model_files (`dict[str, Union[Path, list[Path]]]`, *optional*):
The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed. The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed.
Returns: Returns:
@ -713,18 +713,18 @@ def get_default_frameworks():
_re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES") _re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES")
def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, List[str]]: def retrieve_model_classes(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, list[str]]:
""" """
Retrieve the model classes associated to a given model. Retrieve the model classes associated to a given model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict
the classes returned. the classes returned.
Returns: Returns:
`Dict[str, List[str]]`: A dictionary with one key per framework and the list of model classes associated to `dict[str, list[str]]`: A dictionary with one key per framework and the list of model classes associated to
that framework as values. that framework as values.
""" """
if frameworks is None: if frameworks is None:
@ -754,20 +754,20 @@ def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = No
return model_classes return model_classes
def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None): def retrieve_info_for_model(model_type, frameworks: Optional[list[str]] = None):
""" """
Retrieves all the information from a given model_type. Retrieves all the information from a given model_type.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the info corresponding to the passed frameworks. If passed, will only keep the info corresponding to the passed frameworks.
Returns: Returns:
`Dict`: A dictionary with the following keys: `Dict`: A dictionary with the following keys:
- **frameworks** (`List[str]`): The list of frameworks that back this model type. - **frameworks** (`list[str]`): The list of frameworks that back this model type.
- **model_classes** (`Dict[str, List[str]]`): The model classes implemented for that model type. - **model_classes** (`dict[str, list[str]]`): The model classes implemented for that model type.
- **model_files** (`Dict[str, Union[Path, List[Path]]]`): The files associated with that model type. - **model_files** (`dict[str, Union[Path, list[Path]]]`): The files associated with that model type.
- **model_patterns** (`ModelPatterns`): The various patterns for the model. - **model_patterns** (`ModelPatterns`): The various patterns for the model.
""" """
if model_type not in auto_module.MODEL_NAMES_MAPPING: if model_type not in auto_module.MODEL_NAMES_MAPPING:
@ -833,7 +833,7 @@ def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None):
def clean_frameworks_in_init( def clean_frameworks_in_init(
init_file: Union[str, os.PathLike], frameworks: Optional[List[str]] = None, keep_processing: bool = True init_file: Union[str, os.PathLike], frameworks: Optional[list[str]] = None, keep_processing: bool = True
): ):
""" """
Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature
@ -841,7 +841,7 @@ def clean_frameworks_in_init(
Args: Args:
init_file (`str` or `os.PathLike`): The path to the init to treat. init_file (`str` or `os.PathLike`): The path to the init to treat.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, this will remove all imports that are subject to a framework not in frameworks If passed, this will remove all imports that are subject to a framework not in frameworks
keep_processing (`bool`, *optional*, defaults to `True`): keep_processing (`bool`, *optional*, defaults to `True`):
Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports
@ -914,7 +914,7 @@ def clean_frameworks_in_init(
def add_model_to_main_init( def add_model_to_main_init(
old_model_patterns: ModelPatterns, old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
with_processing: bool = True, with_processing: bool = True,
): ):
""" """
@ -923,7 +923,7 @@ def add_model_to_main_init(
Args: Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model. old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If specified, only the models implemented in those frameworks will be added. If specified, only the models implemented in those frameworks will be added.
with_processing (`bool`, *optional*, defaults to `True`): with_processing (`bool`, *optional*, defaults to `True`):
Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not. Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
@ -1068,7 +1068,7 @@ AUTO_CLASSES_PATTERNS = {
def add_model_to_auto_classes( def add_model_to_auto_classes(
old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: Dict[str, List[str]] old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: dict[str, list[str]]
): ):
""" """
Add a model to the relevant mappings in the auto module. Add a model to the relevant mappings in the auto module.
@ -1076,7 +1076,7 @@ def add_model_to_auto_classes(
Args: Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model. old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
model_classes (`Dict[str, List[str]]`): A dictionary mapping each framework to the list of model classes implemented for it. model_classes (`dict[str, list[str]]`): A dictionary mapping each framework to the list of model classes implemented for it.
""" """
for filename in AUTO_CLASSES_PATTERNS: for filename in AUTO_CLASSES_PATTERNS:
# Extend patterns with all model classes if necessary # Extend patterns with all model classes if necessary
@ -1169,7 +1169,7 @@ def duplicate_doc_file(
old_model_patterns: ModelPatterns, old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
dest_file: Optional[Union[str, os.PathLike]] = None, dest_file: Optional[Union[str, os.PathLike]] = None,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
): ):
""" """
Duplicate a documentation file and adapt it for a new model. Duplicate a documentation file and adapt it for a new model.
@ -1180,7 +1180,7 @@ def duplicate_doc_file(
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file. dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file.
Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`. Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file. If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file.
""" """
with open(doc_file, "r", encoding="utf-8") as f: with open(doc_file, "r", encoding="utf-8") as f:
@ -1320,7 +1320,7 @@ def create_new_model_like(
model_type: str, model_type: str,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
add_copied_from: bool = True, add_copied_from: bool = True,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
old_checkpoint: Optional[str] = None, old_checkpoint: Optional[str] = None,
create_fast_image_processor: bool = False, create_fast_image_processor: bool = False,
): ):
@ -1332,7 +1332,7 @@ def create_new_model_like(
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
add_copied_from (`bool`, *optional*, defaults to `True`): add_copied_from (`bool`, *optional*, defaults to `True`):
Whether or not to add "Copied from" statements to all classes in the new model modeling files. Whether or not to add "Copied from" statements to all classes in the new model modeling files.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will limit the duplicate to the frameworks specified. If passed, will limit the duplicate to the frameworks specified.
old_checkpoint (`str`, *optional*): old_checkpoint (`str`, *optional*):
The name of the base checkpoint for the old model. Should be passed along when it can't be automatically The name of the base checkpoint for the old model. Should be passed along when it can't be automatically

View File

@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from argparse import ArgumentParser, Namespace from argparse import ArgumentParser, Namespace
from typing import Any, List, Optional from typing import Any, Optional
from ..pipelines import Pipeline, get_supported_tasks, pipeline from ..pipelines import Pipeline, get_supported_tasks, pipeline
from ..utils import logging from ..utils import logging
@ -69,8 +69,8 @@ class ServeTokenizeResult(BaseModel):
Tokenize result model Tokenize result model
""" """
tokens: List[str] tokens: list[str]
tokens_ids: Optional[List[int]] tokens_ids: Optional[list[int]]
class ServeDeTokenizeResult(BaseModel): class ServeDeTokenizeResult(BaseModel):
@ -196,7 +196,7 @@ class ServeCommand(BaseTransformersCLICommand):
def detokenize( def detokenize(
self, self,
tokens_ids: List[int] = Body(None, embed=True), tokens_ids: list[int] = Body(None, embed=True),
skip_special_tokens: bool = Body(False, embed=True), skip_special_tokens: bool = Body(False, embed=True),
cleanup_tokenization_spaces: bool = Body(True, embed=True), cleanup_tokenization_spaces: bool = Body(True, embed=True),
): ):

View File

@ -63,13 +63,13 @@ class PretrainedConfig(PushToHubMixin):
Some configurations require inputs to be defined at init and have no default values; these are usually (but not necessarily) Some configurations require inputs to be defined at init and have no default values; these are usually (but not necessarily)
composite configs, such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from composite configs, such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from
two or more configs of type [`~transformers.PretrainedConfig`]. two or more configs of type [`~transformers.PretrainedConfig`].
- **keys_to_ignore_at_inference** (`List[str]`) -- A list of keys to ignore by default when looking at dictionary - **keys_to_ignore_at_inference** (`list[str]`) -- A list of keys to ignore by default when looking at dictionary
outputs of the model during inference. outputs of the model during inference.
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized - **attribute_map** (`dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
naming of attributes. naming of attributes.
- **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor - **base_model_tp_plan** (`dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor
parallel plan applied to the sub-module when `model.tensor_parallel` is called. parallel plan applied to the sub-module when `model.tensor_parallel` is called.
- **base_model_pp_plan** (`Dict[str, Tuple[List[str]]]`) -- A dict that maps child-modules of a base model to a - **base_model_pp_plan** (`dict[str, tuple[list[str]]]`) -- A dict that maps child-modules of a base model to a
pipeline parallel plan that enables users to place the child-module on the appropriate device. pipeline parallel plan that enables users to place the child-module on the appropriate device.
Common attributes (present in all subclasses): Common attributes (present in all subclasses):
@ -115,7 +115,7 @@ class PretrainedConfig(PushToHubMixin):
tie_encoder_decoder (`bool`, *optional*, defaults to `False`): tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
and decoder model to have the exact same parameter names. and decoder model to have the exact same parameter names.
prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`): prune_heads (`dict[int, list[int]]`, *optional*, defaults to `{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
heads to prune in said layer. heads to prune in said layer.
@ -128,17 +128,17 @@ class PretrainedConfig(PushToHubMixin):
> Parameters for fine-tuning tasks > Parameters for fine-tuning tasks
architectures (`List[str]`, *optional*): architectures (`list[str]`, *optional*):
Model architectures that can be used with the model pretrained weights. Model architectures that can be used with the model pretrained weights.
finetuning_task (`str`, *optional*): finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
or PyTorch) checkpoint. or PyTorch) checkpoint.
id2label (`Dict[int, str]`, *optional*): id2label (`dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label. A map from index (for instance prediction index, or target index) to label.
label2id (`Dict[str, int]`, *optional*): A map from label to index for the model. label2id (`dict[str, int]`, *optional*): A map from label to index for the model.
num_labels (`int`, *optional*): num_labels (`int`, *optional*):
Number of labels to use in the last layer added to the model, typically for a classification task. Number of labels to use in the last layer added to the model, typically for a classification task.
task_specific_params (`Dict[str, Any]`, *optional*): task_specific_params (`dict[str, Any]`, *optional*):
Additional keyword arguments to store for the current task. Additional keyword arguments to store for the current task.
problem_type (`str`, *optional*): problem_type (`str`, *optional*):
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`, Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
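A minimal example of the fine-tuning label mappings described in this hunk, instantiating the base `PretrainedConfig` directly just for illustration:

```python
from transformers import PretrainedConfig

config = PretrainedConfig(
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
print(config.num_labels)  # 2, derived from id2label
```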
@ -394,7 +394,7 @@ class PretrainedConfig(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
self._set_token_in_kwargs(kwargs) self._set_token_in_kwargs(kwargs)
@ -505,7 +505,7 @@ class PretrainedConfig(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -531,7 +531,7 @@ class PretrainedConfig(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter. by the `return_unused_kwargs` keyword parameter.
@ -599,7 +599,7 @@ class PretrainedConfig(PushToHubMixin):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns: Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
""" """
cls._set_token_in_kwargs(kwargs) cls._set_token_in_kwargs(kwargs)
@ -723,10 +723,10 @@ class PretrainedConfig(PushToHubMixin):
Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters. Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
Args: Args:
config_dict (`Dict[str, Any]`): config_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method. retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the configuration object. Additional parameters from which to initialize the configuration object.
Returns: Returns:
@ -816,7 +816,7 @@ class PretrainedConfig(PushToHubMixin):
Python dictionary. Python dictionary.
Returns: Returns:
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance. dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
""" """
config_dict = self.to_dict() config_dict = self.to_dict()
@ -874,7 +874,7 @@ class PretrainedConfig(PushToHubMixin):
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"): if hasattr(self.__class__, "model_type"):
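For context, the serialization round trip these methods support, sketched on the base class with an arbitrary attribute:

```python
from transformers import PretrainedConfig

config = PretrainedConfig(hidden_size=128)
as_dict = config.to_dict()                      # plain dict[str, Any], JSON-serializable
restored = PretrainedConfig.from_dict(as_dict)  # equivalent config instance
```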
@ -940,7 +940,7 @@ class PretrainedConfig(PushToHubMixin):
Updates attributes of this class with attributes from `config_dict`. Updates attributes of this class with attributes from `config_dict`.
Args: Args:
config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. config_dict (`dict[str, Any]`): Dictionary of attributes that should be updated for this class.
""" """
for key, value in config_dict.items(): for key, value in config_dict.items():
setattr(self, key, value) setattr(self, key, value)
@ -1163,7 +1163,7 @@ def get_configuration_file(configuration_files: list[str]) -> str:
Get the configuration file to use for this version of transformers. Get the configuration file to use for this version of transformers.
Args: Args:
configuration_files (`List[str]`): The list of available configuration files. configuration_files (`list[str]`): The list of available configuration files.
Returns: Returns:
`str`: The configuration file to use. `str`: The configuration file to use.

View File

@ -18,7 +18,7 @@ import warnings
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from random import randint from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union from typing import Any, Callable, NewType, Optional, Union
import numpy as np import numpy as np
@ -33,7 +33,7 @@ InputDataClass = NewType("InputDataClass", Any)
A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays. of PyTorch/TensorFlow tensors or NumPy arrays.
""" """
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]]) DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])
class DataCollatorMixin: class DataCollatorMixin:
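The `DataCollator` alias above names a contract rather than a class: any callable from a list of per-example dicts to a batched dict of tensors qualifies. A toy example that satisfies it:

```python
import torch

def toy_collator(features: list[dict]) -> dict:
    # Stack one key across examples into a single batch tensor.
    return {"input_ids": torch.tensor([f["input_ids"] for f in features])}

batch = toy_collator([{"input_ids": [1, 2]}, {"input_ids": [3, 4]}])
print(batch["input_ids"].shape)  # torch.Size([2, 2])
```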
@ -72,7 +72,7 @@ def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
return padded return padded
def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]: def default_data_collator(features: list[InputDataClass], return_tensors="pt") -> dict[str, Any]:
""" """
Very simple data collator that simply collates batches of dict-like objects and performs special handling for Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named: potential keys named:
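A short usage sketch for `default_data_collator`; the special handling mentioned above includes turning a per-example `label` key into a batched `labels` tensor:

```python
from transformers import default_data_collator

features = [
    {"input_ids": [1, 2, 3], "label": 0},
    {"input_ids": [4, 5, 6], "label": 1},
]
batch = default_data_collator(features)
# batch["input_ids"] -> tensor of shape (2, 3); batch["labels"] -> tensor([0, 1])
```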
@ -119,13 +119,13 @@ class DefaultDataCollator(DataCollatorMixin):
return_tensors: str = "pt" return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]: def __call__(self, features: list[dict[str, Any]], return_tensors=None) -> dict[str, Any]:
if return_tensors is None: if return_tensors is None:
return_tensors = self.return_tensors return_tensors = self.return_tensors
return default_data_collator(features, return_tensors) return default_data_collator(features, return_tensors)
def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def torch_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
import torch import torch
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
@ -161,7 +161,7 @@ def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any
return batch return batch
def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def tf_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
@ -202,7 +202,7 @@ def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
return batch return batch
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def numpy_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
features = [vars(f) for f in features] features = [vars(f) for f in features]
first = features[0] first = features[0]
@ -268,7 +268,7 @@ class DataCollatorWithPadding:
pad_to_multiple_of: Optional[int] = None pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt" return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
batch = pad_without_fast_tokenizer_warning( batch = pad_without_fast_tokenizer_warning(
self.tokenizer, self.tokenizer,
features, features,
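A usage sketch for `DataCollatorWithPadding`; the checkpoint name is only an example:

```python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

features = [tokenizer("short"), tokenizer("a noticeably longer example sentence")]
batch = collator(features)  # input_ids / attention_mask padded to a common length
```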
@ -569,7 +569,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
pad_to_multiple_of: Optional[int] = None pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt" return_tensors: str = "pt"
def torch_call(self, examples: List[Dict[str, Any]]): # Refactored implementation from the docs. def torch_call(self, examples: list[dict[str, Any]]): # Refactored implementation from the docs.
import torch import torch
# Take labels out of the examples beforehand, because they aren't nested. # Take labels out of the examples beforehand, because they aren't nested.
@ -911,7 +911,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
def tf_mask_tokens( def tf_mask_tokens(
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
) -> Tuple[Any, Any]: ) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -956,7 +956,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1002,7 +1002,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor. # Handle dict or lists with proper padding and conversion to tensor.
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1032,7 +1032,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -1081,7 +1081,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor. # Handle dict or lists with proper padding and conversion to tensor.
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1111,7 +1111,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -1193,7 +1193,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
</Tip>""" </Tip>"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if self.seed and self.generator is None: if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator. # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG # If no seed supplied, we will use the global RNG
@ -1226,7 +1226,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask) inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels} return {"input_ids": inputs, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1261,7 +1261,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask) inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
return {"input_ids": inputs, "labels": labels} return {"input_ids": inputs, "labels": labels}
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if self.seed and self.generator is None: if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator. # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG # If no seed supplied, we will use the global RNG
@ -1318,7 +1318,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
self.generator.shuffle(cand_indexes) self.generator.shuffle(cand_indexes)
return cand_indexes return cand_indexes
def _whole_word_mask(self, input_tokens: List[str], max_predictions=512): def _whole_word_mask(self, input_tokens: list[str], max_predictions=512):
""" """
Get 0/1 labels for masked tokens with whole word mask proxy Get 0/1 labels for masked tokens with whole word mask proxy
""" """
@ -1358,7 +1358,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))] mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
return mask_labels return mask_labels
def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting
'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref. 'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref.
@ -1414,7 +1414,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
@ -1474,7 +1474,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
@ -1564,7 +1564,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
FutureWarning, FutureWarning,
) )
def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]: def __call__(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
import torch import torch
from torch.nn.utils.rnn import pad_sequence from torch.nn.utils.rnn import pad_sequence
@ -1587,7 +1587,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
"sentence_order_label": sentence_order_label, "sentence_order_label": sentence_order_label,
} }
def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]: def mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any]:
""" """
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet. original. N-gram not applied yet.
@ -1645,28 +1645,28 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
max_span_length: int = 5 # maximum length of a span of masked tokens max_span_length: int = 5 # maximum length of a span of masked tokens
return_tensors: str = "pt" return_tensors: str = "pt"
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _torch_collate_batch(examples, self.tokenizer) batch = _torch_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _tf_collate_batch(examples, self.tokenizer) batch = _tf_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _numpy_collate_batch(examples, self.tokenizer) batch = _numpy_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def torch_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
@ -1765,7 +1765,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
return inputs.long(), perm_mask, target_mapping, labels.long() return inputs.long(), perm_mask, target_mapping, labels.long()
def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def tf_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
@ -1872,7 +1872,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64) return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def numpy_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:


@@ -17,7 +17,7 @@ import time
import warnings
from dataclasses import dataclass, field
from enum import Enum
-from typing import List, Optional, Union
+from typing import Optional, Union
import torch
from filelock import FileLock
@@ -75,7 +75,7 @@ class GlueDataset(Dataset):
args: GlueDataTrainingArguments
output_mode: str
-features: List[InputFeatures]
+features: list[InputFeatures]
def __init__(
self,


@@ -18,7 +18,7 @@ import pickle
import random
import time
import warnings
-from typing import Dict, List, Optional
+from typing import Optional
import torch
from filelock import FileLock
@@ -139,7 +139,7 @@ class LineByLineTextDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -187,7 +187,7 @@ class LineByLineWithRefDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -339,7 +339,7 @@ class LineByLineWithSOPTextDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -433,7 +433,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
)
-def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
+def create_examples_from_document(self, document: list[list[int]], doc_index: int, block_size: int):
"""Creates examples for a single document."""
max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)


@@ -16,7 +16,7 @@ import os
import time
from dataclasses import dataclass, field
from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
import torch
from filelock import FileLock
@@ -112,7 +112,7 @@ class SquadDataset(Dataset):
"""
args: SquadDataTrainingArguments
-features: List[SquadFeatures]
+features: list[SquadFeatures]
mode: Split
is_language_sensitive: bool
@@ -195,7 +195,7 @@ class SquadDataset(Dataset):
def __len__(self):
return len(self.features)
-def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+def __getitem__(self, i) -> dict[str, torch.Tensor]:
# Convert to Tensors and build dataset
feature = self.features[i]


@@ -19,7 +19,7 @@ import os
import warnings
from dataclasses import asdict
from enum import Enum
-from typing import List, Optional, Union
+from typing import Optional, Union
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_tf_available, logging
@@ -39,7 +39,7 @@ DEPRECATION_WARNING = (
def glue_convert_examples_to_features(
-examples: Union[List[InputExample], "tf.data.Dataset"],
+examples: Union[list[InputExample], "tf.data.Dataset"],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
@@ -107,7 +107,7 @@ if is_tf_available():
def _glue_convert_examples_to_features(
-examples: List[InputExample],
+examples: list[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,


@@ -18,7 +18,7 @@ import csv
import dataclasses
import json
from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import Optional, Union
from ...utils import is_tf_available, is_torch_available, logging
@@ -67,9 +67,9 @@ class InputFeatures:
float for regression problems.
"""
-input_ids: List[int]
-attention_mask: Optional[List[int]] = None
-token_type_ids: Optional[List[int]] = None
+input_ids: list[int]
+attention_mask: Optional[list[int]] = None
+token_type_ids: Optional[list[int]] = None
label: Optional[Union[int, float]] = None
def to_json_string(self):


@@ -136,7 +136,7 @@ class DebugUnderflowOverflow:
The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
-trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
+trace_batch_nums(`list[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
abort_after_batch_num (`int``, *optional*):
Whether to abort after a certain batch number has finished


@@ -317,7 +317,7 @@ def get_cached_module_file(
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
@@ -507,7 +507,7 @@ def get_class_from_dynamic_module(
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*):
@@ -593,7 +593,7 @@ def custom_object_save(obj: Any, folder: Union[str, os.PathLike], config: Option
A config in which to register the auto_map corresponding to this custom object.
Returns:
-`List[str]`: The list of files saved.
+`list[str]`: The list of files saved.
"""
if obj.__module__ == "__main__":
logger.warning(
@@ -762,7 +762,7 @@ def check_python_requirements(path_or_repo_id, requirements_file="requirements.t
This can be either:
- a string, the *model id* of a model repo on huggingface.co.
- a path to a *directory* potentially containing the file.
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
Additional arguments to pass to `cached_file`.
"""
failed = [] # error messages regarding requirements


@@ -81,13 +81,13 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
</Tip>
Args:
-processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`):
-Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of
-input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str,
-List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
+Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
+input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
+list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
collate function.
-Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
+Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
@@ -235,9 +235,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
Pad inputs (on left/right and up to predefined length or max length in the batch)
Args:
-processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
-Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
-of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
+processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
+Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
+of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see below)
padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
@@ -306,9 +306,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
Truncate inputs to predefined length or max length in the batch
Args:
-processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`):
-Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
-of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
+processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
+Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
+of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
max_length (`int`, *optional*):
maximum length of the returned list and optionally padding length (see below)
pad_to_multiple_of (`int`, *optional*) :


@@ -303,7 +303,7 @@ class FeatureExtractionMixin(PushToHubMixin):
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*):
@@ -326,7 +326,7 @@ class FeatureExtractionMixin(PushToHubMixin):
functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
`kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
controlled by the `return_unused_kwargs` keyword parameter.
@@ -392,7 +392,7 @@ class FeatureExtractionMixin(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
use_auth_token = kwargs.pop("use_auth_token", None)
@@ -454,7 +454,7 @@ class FeatureExtractionMixin(PushToHubMixin):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
-`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
+`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
@@ -555,11 +555,11 @@ class FeatureExtractionMixin(PushToHubMixin):
parameters.
Args:
-feature_extractor_dict (`Dict[str, Any]`):
+feature_extractor_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
[`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
-kwargs (`Dict[str, Any]`):
+kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the feature extractor object.
Returns:
@@ -588,7 +588,7 @@ class FeatureExtractionMixin(PushToHubMixin):
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary. Returns:
-`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__


@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import Optional
class Constraint(ABC):
@@ -51,7 +51,7 @@ class Constraint(ABC):
When called, returns the token(s) that would take this constraint one step closer to being fulfilled.
Return:
-token_ids (Union[int, List[int], None]):
+token_ids (Union[int, list[int], None]):
- A single token ID (int) that advances the constraint, or
- A list of token IDs that could advance the constraint
- None if the constraint is completed or cannot be advanced
@@ -134,11 +134,11 @@ class PhrasalConstraint(Constraint):
[`Constraint`] enforcing that an ordered sequence of tokens is included in the output.
Args:
-token_ids (`List[int]`):
+token_ids (`list[int]`):
The id of the token that must be generated by the output.
"""
-def __init__(self, token_ids: List[int]):
+def __init__(self, token_ids: list[int]):
super(Constraint, self).__init__()
if not isinstance(token_ids, list) or len(token_ids) == 0:
@@ -205,7 +205,7 @@ class PhrasalConstraint(Constraint):
class DisjunctiveTrie:
-def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
+def __init__(self, nested_token_ids: list[list[int]], no_subsets=True):
r"""
A helper class that builds a trie with the words represented in `nested_token_ids`.
"""
@@ -266,12 +266,12 @@ class DisjunctiveConstraint(Constraint):
A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints.
Args:
-nested_token_ids (`List[List[int]]`):
+nested_token_ids (`list[list[int]]`):
A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from
the list of words.
"""
-def __init__(self, nested_token_ids: List[List[int]]):
+def __init__(self, nested_token_ids: list[list[int]]):
super(Constraint, self).__init__()
if not isinstance(nested_token_ids, list) or len(nested_token_ids) == 0:
@@ -356,11 +356,11 @@ class ConstraintListState:
A class for beam scorers to track its progress through a list of constraints.
Args:
-constraints (`List[Constraint]`):
+constraints (`list[Constraint]`):
A list of [`Constraint`] objects that must be fulfilled by the beam scorer.
"""
-def __init__(self, constraints: List[Constraint]):
+def __init__(self, constraints: list[Constraint]):
self.constraints = constraints
# max # of steps required to fulfill a given constraint
@@ -418,7 +418,7 @@ class ConstraintListState:
else:
return token_list
-def reset(self, token_ids: Optional[List[int]]):
+def reset(self, token_ids: Optional[list[int]]):
"""
token_ids: the tokens generated thus far to reset the state of the progress through constraints.
"""


@@ -15,7 +15,7 @@
from abc import ABC, abstractmethod
from collections import UserDict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
import numpy as np
import torch
@@ -41,7 +41,7 @@ PROCESS_INPUTS_DOCSTRING = r"""
Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
beam_indices (`torch.LongTensor`, *optional*):
Beam indices indicating to which beam hypothesis each token correspond.
@@ -77,7 +77,7 @@ FINALIZE_INPUTS_DOCSTRING = r"""
The beam indices indicating to which beam the `final_beam_tokens` shall be added.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
Return:
@@ -103,7 +103,7 @@ class BeamScorer(ABC):
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
**kwargs,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
raise NotImplementedError("This is an abstract method.")
@abstractmethod
@@ -219,11 +219,11 @@ class BeamSearchScorer(BeamScorer):
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
group_index: Optional[int] = 0,
decoder_prompt_len: Optional[int] = 0,
-) -> Dict[str, torch.Tensor]:
+) -> dict[str, torch.Tensor]:
# add up to the length which the next_scores is calculated on (including decoder prompt)
cur_len = input_ids.shape[-1] + 1
batch_size = len(self._beam_hyps) // self.num_beam_groups
@@ -325,10 +325,10 @@ class BeamSearchScorer(BeamScorer):
final_beam_indices: torch.LongTensor,
max_length: int,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.LongTensor]:
+) -> tuple[torch.LongTensor]:
batch_size = len(self._beam_hyps) // self.num_beam_groups
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
@@ -426,7 +426,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
num_beams (`int`):
Number of beams for beam search.
-constraints (`List[Constraint]`):
+constraints (`list[Constraint]`):
A list of positive constraints represented as `Constraint` objects that must be fulfilled in the generation
output. For more information, the documentation of [`Constraint`] should be read.
device (`torch.device`):
@@ -457,7 +457,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
self,
batch_size: int,
num_beams: int,
-constraints: List[Constraint],
+constraints: list[Constraint],
device: torch.device,
length_penalty: Optional[float] = 1.0,
do_early_stopping: Optional[Union[bool, str]] = False,
@@ -518,10 +518,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
next_indices: torch.LongTensor,
scores_for_all_vocab: torch.FloatTensor,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
@@ -541,7 +541,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
The scores of all tokens in the vocabulary for each of the beam hypotheses.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
beam_indices (`torch.LongTensor`, *optional*):
Beam indices indicating to which beam hypothesis each token correspond.
@@ -818,10 +818,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
final_beam_indices: torch.LongTensor,
max_length: int,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.LongTensor]:
+) -> tuple[torch.LongTensor]:
batch_size = len(self._beam_hyps)
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):


@@ -15,7 +15,7 @@
import copy
import weakref
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional
import numpy as np
import torch
@@ -44,7 +44,7 @@ from ..utils.deprecation import deprecate_kwarg
class CandidateGenerator:
"""Abstract base class for all candidate generators that can be applied during assisted generation."""
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -108,7 +108,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -198,7 +198,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
self.probs = []
self.matches = []
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -281,7 +281,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold
-def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]:
+def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> tuple[int, int]:
"""Calculate the minimum and maximum number of new tokens to generate."""
new_cur_len = input_ids.shape[-1]
max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
@@ -305,7 +305,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
return has_past_key_values
-def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict:
+def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> dict:
"""Prepare arguments for the generation call."""
return {
self.input_ids_key: input_ids,
@@ -315,7 +315,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
"logits_processor": self.logits_processor,
}
-def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def _generate_candidates(self, generation_args: dict) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""Generate candidate sequences using the assistant model."""
assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
@@ -374,7 +374,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -495,7 +495,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
return dest_ids.to(input_ids.device)
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -537,7 +537,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
return new_target_ids, None
-def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]:
+def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, int]:
"""Converts target input IDs to assistant input IDs, handling discrepancies."""
convert_kwargs = {
"source_tokenizer": self.target_tokenizer,
@@ -782,7 +782,7 @@ class AssistantToTargetTranslator:
max_assistant_index = max(assistant_vocab.values())
assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.SUPPRESS_TOKEN_ID, dtype=int)
-target_to_assistant_input_ids: Dict[int, int] = {}
+target_to_assistant_input_ids: dict[int, int] = {}
for tok, assistant_id in assistant_vocab.items():
target_id = target_vocab.get(tok)
if target_id is not None:
@@ -909,7 +909,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
atm_translator: AssistantToTargetTranslator,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
@@ -930,7 +930,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
self._target_seq_len_with_candidates: int = 0
self._prev_assistant_ids: Optional[torch.LongTensor] = None
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Simplified version of get_candidates that uses the translator cache for token conversion.
"""
@@ -1043,7 +1043,7 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -1153,7 +1153,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -1170,7 +1170,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
self.assistant_early_exit = self.generation_config.assistant_early_exit
self.generation_config.assistant_early_exit = None
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
# Temporarily sets the number of hidden layers to the early exit value
base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
original_num_hidden_layers = base_model.config.num_hidden_layers
@@ -1221,7 +1221,7 @@ def _crop_past_key_values(model, past_key_values, max_length):
return past_key_values
-def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_encoder_decoder: bool) -> Dict[str, Any]:
+def _prepare_attention_mask(model_kwargs: dict[str, Any], new_length: int, is_encoder_decoder: bool) -> dict[str, Any]:
"""Expands or crops the model's mask for decoding purposes, to the defined length"""
mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
@@ -1257,7 +1257,7 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en
return model_kwargs
-def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
+def _prepare_token_type_ids(model_kwargs: dict[str, Any], new_length: int) -> dict[str, Any]:
"""Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
return model_kwargs


@ -20,7 +20,7 @@ import os
import warnings import warnings
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass, is_dataclass from dataclasses import dataclass, is_dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from typing import TYPE_CHECKING, Any, Callable, Optional, Union
from .. import __version__ from .. import __version__
from ..configuration_utils import PretrainedConfig from ..configuration_utils import PretrainedConfig
@ -149,7 +149,7 @@ class GenerationConfig(PushToHubMixin):
max_time (`float`, *optional*): max_time (`float`, *optional*):
The maximum amount of time you allow the computation to run for in seconds. generation will still finish The maximum amount of time you allow the computation to run for in seconds. generation will still finish
the current pass after allocated time has been passed. the current pass after allocated time has been passed.
stop_strings (`str or List[str]`, *optional*): stop_strings (`str or list[str]`, *optional*):
A string or a list of strings that should terminate generation if the model outputs them. A string or a list of strings that should terminate generation if the model outputs them.
> Parameters that control the generation strategy used > Parameters that control the generation strategy used
@ -163,7 +163,7 @@ class GenerationConfig(PushToHubMixin):
[this paper](https://huggingface.co/papers/1610.02424) for more details. [this paper](https://huggingface.co/papers/1610.02424) for more details.
penalty_alpha (`float`, *optional*): penalty_alpha (`float`, *optional*):
The values balance the model confidence and the degeneration penalty in contrastive search decoding. The values balance the model confidence and the degeneration penalty in contrastive search decoding.
dola_layers (`str` or `List[int]`, *optional*): dola_layers (`str` or `list[int]`, *optional*):
The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must
be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively. be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively.
"low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the "low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the
@ -245,26 +245,26 @@ class GenerationConfig(PushToHubMixin):
`length_penalty` < 0.0 encourages shorter sequences. `length_penalty` < 0.0 encourages shorter sequences.
no_repeat_ngram_size (`int`, *optional*, defaults to 0): no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once. If set to int > 0, all ngrams of that size can only occur once.
bad_words_ids (`List[List[int]]`, *optional*): bad_words_ids (`list[list[int]]`, *optional*):
List of list of token ids that are not allowed to be generated. Check List of list of token ids that are not allowed to be generated. Check
[`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples. [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*): force_words_ids (`list[list[int]]` or `list[list[list[int]]]`, *optional*):
List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of List of token ids that must be generated. If given a `list[list[int]]`, this is treated as a simple list of
words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this words that must be included, the opposite to `bad_words_ids`. If given `list[list[list[int]]]`, this
triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
can allow different forms of each word. can allow different forms of each word.
renormalize_logits (`bool`, *optional*, defaults to `False`): renormalize_logits (`bool`, *optional*, defaults to `False`):
Whether to renormalize the logits after applying all the logits processors (including the custom Whether to renormalize the logits after applying all the logits processors (including the custom
ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
are normalized but some logit processors break the normalization. are normalized but some logit processors break the normalization.
constraints (`List[Constraint]`, *optional*): constraints (`list[Constraint]`, *optional*):
Custom constraints that can be added to the generation to ensure that the output will contain the use of Custom constraints that can be added to the generation to ensure that the output will contain the use of
certain tokens as defined by `Constraint` objects, in the most sensible way possible. certain tokens as defined by `Constraint` objects, in the most sensible way possible.
forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`): forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
language token. language token.
forced_eos_token_id (`int` or List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`): forced_eos_token_id (`int` or list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
list to set multiple *end-of-sequence* tokens. list to set multiple *end-of-sequence* tokens.
remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`): remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
@ -274,13 +274,13 @@ class GenerationConfig(PushToHubMixin):
This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
penalty starts and `decay_factor` represents the factor of exponential decay penalty starts and `decay_factor` represents the factor of exponential decay
suppress_tokens (`List[int]`, *optional*): suppress_tokens (`list[int]`, *optional*):
A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their
log probs to `-inf` so that they are not sampled. log probs to `-inf` so that they are not sampled.
begin_suppress_tokens (`List[int]`, *optional*): begin_suppress_tokens (`list[int]`, *optional*):
A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit
processor will set their log probs to `-inf` so that they are not sampled. processor will set their log probs to `-inf` so that they are not sampled.
sequence_bias (`Dict[Tuple[int], float]`, *optional*): sequence_bias (`dict[tuple[int], float]`, *optional*):
Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
sequence being selected, while negative biases do the opposite. Check sequence being selected, while negative biases do the opposite. Check
[`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples. [`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
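As a hedged illustration of the `sequence_bias` mapping described above (the token ids below are made up; in practice they would come from the model's tokenizer with `add_special_tokens=False`):

```python
# Illustrative only: keys are tuples of token ids, values are additive bias terms.
sequence_bias = {
    (1042, 318): -10.0,  # strongly discourage this two-token sequence
    (7643,): 2.5,        # mildly encourage this single token
}
# The mapping can then be passed to `model.generate(..., sequence_bias=sequence_bias)` or stored on a
# `GenerationConfig`, where it is consumed by `SequenceBiasLogitsProcessor`.
```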
@ -325,7 +325,7 @@ class GenerationConfig(PushToHubMixin):
The id of the *padding* token. The id of the *padding* token.
bos_token_id (`int`, *optional*): bos_token_id (`int`, *optional*):
The id of the *beginning-of-sequence* token. The id of the *beginning-of-sequence* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
> Generation parameters exclusive to encoder-decoder models > Generation parameters exclusive to encoder-decoder models
@ -333,7 +333,7 @@ class GenerationConfig(PushToHubMixin):
encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0): encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
`decoder_input_ids`. `decoder_input_ids`.
decoder_start_token_id (`int` or `List[int]`, *optional*): decoder_start_token_id (`int` or `list[int]`, *optional*):
If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length
`batch_size`. Passing a list enables different start ids for each element in the batch `batch_size`. Passing a list enables different start ids for each element in the batch
(e.g. multilingual models with different target languages in one batch) (e.g. multilingual models with different target languages in one batch)
@ -846,7 +846,7 @@ class GenerationConfig(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
@ -933,7 +933,7 @@ class GenerationConfig(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -959,7 +959,7 @@ class GenerationConfig(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter. by the `return_unused_kwargs` keyword parameter.
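A short usage sketch of the loading path documented above, using the `openai-community/gpt2` repo as in the library's own examples; `return_unused_kwargs=True` splits the overrides into recognized attributes and leftovers:

```python
from transformers import GenerationConfig

generation_config, unused_kwargs = GenerationConfig.from_pretrained(
    "openai-community/gpt2",
    top_k=1,                      # recognized attribute: applied to the loaded config
    foo="bar",                    # not a config attribute: returned separately
    return_unused_kwargs=True,
)
print(generation_config.top_k)    # 1
print(unused_kwargs)              # {'foo': 'bar'}
```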
@ -1090,14 +1090,14 @@ class GenerationConfig(PushToHubMixin):
return json.loads(text) return json.loads(text)
@classmethod @classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig": def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "GenerationConfig":
""" """
Instantiates a [`GenerationConfig`] from a Python dictionary of parameters. Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.
Args: Args:
config_dict (`Dict[str, Any]`): config_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Dictionary that will be used to instantiate the configuration object.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the configuration object. Additional parameters from which to initialize the configuration object.
Returns: Returns:
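For example (values are illustrative), the dictionary supplies the base attributes and the extra kwargs are applied on top:

```python
from transformers import GenerationConfig

config_dict = {"max_new_tokens": 32, "do_sample": True}
generation_config = GenerationConfig.from_dict(config_dict, temperature=0.7)  # kwargs override/extend the dict
print(generation_config.max_new_tokens, generation_config.temperature)  # 32 0.7
```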
@ -1123,7 +1123,7 @@ class GenerationConfig(PushToHubMixin):
else: else:
return config return config
def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: def dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None:
""" """
Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None, Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into *"float32"* converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into *"float32"*
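A standalone sketch of that conversion (not the library code itself): walk the dict recursively and replace any non-string `torch_dtype` value with the name part of its string form.

```python
import torch

def dtype_to_str(d: dict) -> None:
    # Replace a torch.dtype stored under "torch_dtype" with its short name, e.g. torch.float32 -> "float32".
    if d.get("torch_dtype") is not None and not isinstance(d["torch_dtype"], str):
        d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
    for value in d.values():
        if isinstance(value, dict):
            dtype_to_str(value)  # recurse into nested dicts

cfg = {"torch_dtype": torch.float16, "sub_config": {"torch_dtype": torch.bfloat16}}
dtype_to_str(cfg)
print(cfg)  # {'torch_dtype': 'float16', 'sub_config': {'torch_dtype': 'bfloat16'}}
```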
@ -1135,13 +1135,13 @@ class GenerationConfig(PushToHubMixin):
if isinstance(value, dict): if isinstance(value, dict):
self.dict_torch_dtype_to_str(value) self.dict_torch_dtype_to_str(value)
def to_diff_dict(self) -> Dict[str, Any]: def to_diff_dict(self) -> dict[str, Any]:
""" """
Removes all attributes from config which correspond to the default config attributes for better readability and Removes all attributes from config which correspond to the default config attributes for better readability and
serializes to a Python dictionary. serializes to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
config_dict = self.to_dict() config_dict = self.to_dict()
@ -1158,12 +1158,12 @@ class GenerationConfig(PushToHubMixin):
self.dict_torch_dtype_to_str(serializable_config_dict) self.dict_torch_dtype_to_str(serializable_config_dict)
return serializable_config_dict return serializable_config_dict
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
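The difference between the two serializers above is easiest to see side by side; a minimal sketch (the exact keys kept in the diff dict depend on the installed version):

```python
from transformers import GenerationConfig

config = GenerationConfig(max_new_tokens=64)
full = config.to_dict()        # every attribute, defaults included
diff = config.to_diff_dict()   # only what differs from a freshly constructed default config
print(len(full) > len(diff))         # True
print(diff.get("max_new_tokens"))    # 64
```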
@ -1289,11 +1289,11 @@ class GenerationConfig(PushToHubMixin):
returning all the unused kwargs. returning all the unused kwargs.
Args: Args:
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Dictionary of attributes to tentatively update this class. Dictionary of attributes to tentatively update this class.
Returns: Returns:
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
""" """
to_remove = [] to_remove = []
for key, value in kwargs.items(): for key, value in kwargs.items():
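A minimal sketch of the update contract described above (the unknown key name is illustrative): matching keys are applied in place, everything else comes back untouched.

```python
from transformers import GenerationConfig

config = GenerationConfig()
leftover = config.update(max_new_tokens=128, not_a_generation_field=1)
print(config.max_new_tokens)  # 128
print(leftover)               # {'not_a_generation_field': 1}
```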
@ -1319,7 +1319,7 @@ class BaseWatermarkingConfig(ABC):
Constructs a BaseWatermarkingConfig instance from a dictionary of parameters. Constructs a BaseWatermarkingConfig instance from a dictionary of parameters.
Args: Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters. config_dict (dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values. **kwargs: Additional keyword arguments to override dictionary values.
Returns: Returns:
@ -1348,12 +1348,12 @@ class BaseWatermarkingConfig(ABC):
writer.write(json_string) writer.write(json_string)
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance. dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
return output return output
@ -1479,7 +1479,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
Args: Args:
ngram_len (`int`): ngram_len (`int`):
Ngram length. Ngram length.
keys (`List[int]`): keys (`list[int]`):
A sequence of watermarking keys, one for each depth. A sequence of watermarking keys, one for each depth.
context_history_size (`int`, *optional*, defaults to 1024): context_history_size (`int`, *optional*, defaults to 1024):
Size of the tensor to keep track of seen contexts. Size of the tensor to keep track of seen contexts.
@ -1518,7 +1518,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
def __init__( def __init__(
self, self,
ngram_len: int, ngram_len: int,
keys: List[int], keys: list[int],
context_history_size: int = 1024, context_history_size: int = 1024,
sampling_table_seed: int = 0, sampling_table_seed: int = 0,
sampling_table_size: int = 2**16, sampling_table_size: int = 2**16,
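A hedged end-to-end sketch of wiring this config into `generate`; the checkpoint and key values are placeholders, and watermarking requires sampling to be enabled:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

watermarking_config = SynthIDTextWatermarkingConfig(
    ngram_len=5,                                              # context window used for hashing
    keys=[654, 400, 836, 123, 340, 443, 597, 160, 57, 29],    # one placeholder key per depth
)

inputs = tokenizer(["A short prompt"], return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=16,
    watermarking_config=watermarking_config,
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```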
@ -1605,6 +1605,6 @@ class CompileConfig:
# Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes. # Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
_compile_all_devices = None _compile_all_devices = None
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
"""Serializes this instance to a Python dictionary.""" """Serializes this instance to a Python dictionary."""
return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"}) return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})

View File

@ -23,7 +23,7 @@ from collections import deque
from dataclasses import dataclass, field from dataclasses import dataclass, field
from enum import Enum from enum import Enum
from functools import partial from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
@ -59,16 +59,16 @@ class GenerationOutput:
Attributes: Attributes:
request_id (str): The ID of the generation request. request_id (str): The ID of the generation request.
prompt_ids (List[int]): The IDs of the prompt tokens. prompt_ids (list[int]): The IDs of the prompt tokens.
generated_tokens (List[int]): The generated tokens. generated_tokens (list[int]): The generated tokens.
logprobs (List[float]): The log probabilities of the generated tokens. logprobs (list[float]): The log probabilities of the generated tokens.
error (Optional[str]): Any error message associated with the request. When None, the request was successful. error (Optional[str]): Any error message associated with the request. When None, the request was successful.
""" """
request_id: str request_id: str
prompt_ids: List[int] = field(default_factory=list) prompt_ids: list[int] = field(default_factory=list)
generated_tokens: List[int] = field(default_factory=list) generated_tokens: list[int] = field(default_factory=list)
logprobs: List[float] = field(default_factory=list) logprobs: list[float] = field(default_factory=list)
error: Optional[str] = None error: Optional[str] = None
status: RequestStatus = RequestStatus.PENDING status: RequestStatus = RequestStatus.PENDING
created_time: float = field(default_factory=time.time) created_time: float = field(default_factory=time.time)
@ -85,11 +85,11 @@ class RequestState:
# Required fields # Required fields
request_id: str request_id: str
prompt_ids: Optional[List[int]] = None # the one being processed prompt_ids: Optional[list[int]] = None # the one being processed
full_prompt_ids: Optional[List[int]] = None # the full prompt full_prompt_ids: Optional[list[int]] = None # the full prompt
remaining_prompt_ids: List[int] = field(default_factory=list) # For split requests remaining_prompt_ids: list[int] = field(default_factory=list) # For split requests
static_outputs: List[int] = field(default_factory=list) static_outputs: list[int] = field(default_factory=list)
allocated_blocks: List[int] = field(default_factory=list) allocated_blocks: list[int] = field(default_factory=list)
position_offset: int = 0 # Current position in the sequence for position_ids position_offset: int = 0 # Current position in the sequence for position_ids
status: RequestStatus = RequestStatus.PENDING status: RequestStatus = RequestStatus.PENDING
max_new_tokens: int = 20 max_new_tokens: int = 20
@ -150,8 +150,8 @@ class PagedAttentionCache(Cache):
generation_config: GenerationConfig, generation_config: GenerationConfig,
device: torch.device, device: torch.device,
dtype: torch.dtype = torch.float16, dtype: torch.dtype = torch.float16,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
initial_prompt_shapes: Optional[List[List[int]]] = None, initial_prompt_shapes: Optional[list[list[int]]] = None,
) -> None: ) -> None:
"""Initialize a paged attention cache for efficient memory usage. """Initialize a paged attention cache for efficient memory usage.
@ -191,8 +191,8 @@ class PagedAttentionCache(Cache):
self.dtype = dtype self.dtype = dtype
self.device = device self.device = device
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
for idx in range(config.num_hidden_layers): for idx in range(config.num_hidden_layers):
layer_device = layer_device_map[idx] if layer_device_map is not None else device layer_device = layer_device_map[idx] if layer_device_map is not None else device
new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device) new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device)
@ -206,10 +206,10 @@ class PagedAttentionCache(Cache):
# Block management data structures # Block management data structures
self._free_blocks = deque(range(num_blocks)) self._free_blocks = deque(range(num_blocks))
self._block_tables: Dict[str, List[int]] = {} self._block_tables: dict[str, list[int]] = {}
@traced @traced
def allocate_blocks(self, n_blocks: int, request_id: str) -> List[int]: def allocate_blocks(self, n_blocks: int, request_id: str) -> list[int]:
"""Allocates n_blocks for a given request_id.""" """Allocates n_blocks for a given request_id."""
if len(self._free_blocks) < n_blocks: if len(self._free_blocks) < n_blocks:
return False return False
@ -236,12 +236,12 @@ class PagedAttentionCache(Cache):
"""Returns the number of free blocks available.""" """Returns the number of free blocks available."""
return len(self._free_blocks) return len(self._free_blocks)
def get_block_table(self, request_id: str) -> List[int]: def get_block_table(self, request_id: str) -> list[int]:
"""Returns the block table for a request.""" """Returns the block table for a request."""
return self._block_tables.get(request_id, []) return self._block_tables.get(request_id, [])
@traced @traced
def _get_physical_indices(self, state: RequestState, logical_indices: List[int]) -> List[int]: def _get_physical_indices(self, state: RequestState, logical_indices: list[int]) -> list[int]:
""" """
Maps logical sequence indices to physical cache indices using the block table, using PyTorch. Maps logical sequence indices to physical cache indices using the block table, using PyTorch.
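The logical-to-physical mapping is just block-table arithmetic; a standalone sketch of the idea (not the exact implementation in this file):

```python
def physical_indices(block_table: list[int], logical_indices: list[int], block_size: int) -> list[int]:
    # Each logical position falls in block `i // block_size` at offset `i % block_size`;
    # the block table says which physical block that logical block was allocated.
    return [block_table[i // block_size] * block_size + (i % block_size) for i in logical_indices]

# The request owns physical blocks 7 and 2; with block_size=4, logical token 5 maps to slot 9.
print(physical_indices([7, 2], [0, 5], block_size=4))  # [28, 9]
```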
@ -289,7 +289,7 @@ class PagedAttentionCache(Cache):
read_index, read_index,
write_index, write_index,
**kwargs, **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# Reshape cache for easier indexing # Reshape cache for easier indexing
total_slots = self.num_blocks * self.block_size total_slots = self.num_blocks * self.block_size
k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim) k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim)
@ -306,9 +306,9 @@ class Scheduler(ABC):
""" """
def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False): def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False):
self.active_requests: Dict[str, RequestState] = {} self.active_requests: dict[str, RequestState] = {}
self.waiting_requests: Dict[str, RequestState] = {} self.waiting_requests: dict[str, RequestState] = {}
self.waiting_requests_order: Deque[str] = deque() self.waiting_requests_order: deque[str] = deque()
self.cache = cache self.cache = cache
self.retain_cache_on_finish = retain_cache_on_finish self.retain_cache_on_finish = retain_cache_on_finish
@ -318,7 +318,7 @@ class Scheduler(ABC):
pass pass
@abstractmethod @abstractmethod
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
pass pass
@traced @traced
@ -332,7 +332,7 @@ class Scheduler(ABC):
pass pass
@traced @traced
def get_active_request_static_outputs(self, request_id: str) -> List[int]: def get_active_request_static_outputs(self, request_id: str) -> list[int]:
if request_id in self.active_requests: if request_id in self.active_requests:
return self.active_requests[request_id].static_outputs return self.active_requests[request_id].static_outputs
return [] return []
@ -356,7 +356,7 @@ class FIFOScheduler(Scheduler):
@traced(span_name="prepare_request") @traced(span_name="prepare_request")
def _prepare_request_for_processing( def _prepare_request_for_processing(
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str] self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
): ):
"""Prepare a request for processing in the current batch.""" """Prepare a request for processing in the current batch."""
request_tokens = ( request_tokens = (
@ -395,9 +395,9 @@ class FIFOScheduler(Scheduler):
self.waiting_requests_order.append(state.request_id) self.waiting_requests_order.append(state.request_id)
@traced @traced
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
priority_states: List[RequestState] = [] priority_states: list[RequestState] = []
second_priority_states: List[RequestState] = [] second_priority_states: list[RequestState] = []
scheduled_requests = [] scheduled_requests = []
for state in self.active_requests.values(): for state in self.active_requests.values():
@ -475,7 +475,7 @@ class PrefillFirstScheduler(Scheduler):
@traced(span_name="prepare_request") @traced(span_name="prepare_request")
def _prepare_request_for_processing( def _prepare_request_for_processing(
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str] self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
): ):
"""Prepare a request for processing in the current batch.""" """Prepare a request for processing in the current batch."""
request_tokens = ( request_tokens = (
@ -514,9 +514,9 @@ class PrefillFirstScheduler(Scheduler):
self.waiting_requests_order.append(state.request_id) self.waiting_requests_order.append(state.request_id)
@traced @traced
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
priority_states: List[RequestState] = [] priority_states: list[RequestState] = []
second_priority_states: List[RequestState] = [] second_priority_states: list[RequestState] = []
scheduled_requests = [] scheduled_requests = []
for state in self.active_requests.values(): for state in self.active_requests.values():
@ -581,7 +581,7 @@ def compute_optimal_blocks(
device: torch.device, device: torch.device,
config: PretrainedConfig, config: PretrainedConfig,
generation_config: GenerationConfig, generation_config: GenerationConfig,
inputs: List[List[int]], inputs: list[list[int]],
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
safety_margin: float = 0.9, safety_margin: float = 0.9,
median_prefill_length: Optional[int] = None, median_prefill_length: Optional[int] = None,
@ -678,7 +678,7 @@ class PagedAttentionArgs:
write_index: torch.Tensor write_index: torch.Tensor
read_index: torch.Tensor read_index: torch.Tensor
logits_indices: torch.Tensor logits_indices: torch.Tensor
block_tables: Dict[str, List[int]] block_tables: dict[str, list[int]]
cache: PagedAttentionCache cache: PagedAttentionCache
use_cache: bool = False use_cache: bool = False
@ -754,7 +754,7 @@ class ContinuousBatchProcessor:
self.streaming = streaming self.streaming = streaming
self.manual_eviction = manual_eviction self.manual_eviction = manual_eviction
self.requests_in_batch: List[RequestState] = [] self.requests_in_batch: list[RequestState] = []
# Get batch size parameters from generation config # Get batch size parameters from generation config
self._configure_batch_parameters() self._configure_batch_parameters()
@ -1152,7 +1152,7 @@ class ContinuousBatchingManager:
self._generation_thread = None self._generation_thread = None
def add_request( def add_request(
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None self, input_ids: list[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
) -> str: ) -> str:
"""Add a new generation request to the queue. """Add a new generation request to the queue.
@ -1184,7 +1184,7 @@ class ContinuousBatchingManager:
logger.debug(f"Added request {request_id} to queue.") logger.debug(f"Added request {request_id} to queue.")
return request_id return request_id
def add_requests(self, inputs: List[List[int]], **kwargs): def add_requests(self, inputs: list[list[int]], **kwargs):
for i, input_ids in enumerate(inputs): for i, input_ids in enumerate(inputs):
# Assign a predictable request ID for ordering results later # Assign a predictable request ID for ordering results later
req_id = f"batch_req_{i}" req_id = f"batch_req_{i}"
@ -1428,11 +1428,11 @@ class ContinuousMixin:
@torch.inference_mode() @torch.inference_mode()
def generate_batch( def generate_batch(
self, self,
inputs: List[List[int]], inputs: list[list[int]],
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
progress_bar: bool = True, progress_bar: bool = True,
**kwargs, **kwargs,
) -> List[List[int]]: ) -> list[list[int]]:
"""Generate sequences for a batch of prompts using continuous batching. """Generate sequences for a batch of prompts using continuous batching.
Args: Args:
@ -1441,7 +1441,7 @@ class ContinuousMixin:
**kwargs: Additional generation parameters **kwargs: Additional generation parameters
Returns: Returns:
`List[List[int]]`: A list containing the generated sequences (including prompt tokens `list[list[int]]`: A list containing the generated sequences (including prompt tokens
if not handled otherwise) for each input prompt, in the same order. if not handled otherwise) for each input prompt, in the same order.
Returns an empty list `[]` for requests that failed. Returns an empty list `[]` for requests that failed.
""" """

View File

@ -39,7 +39,7 @@ LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`): scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search search or log softmax for each vocabulary token when using beam search
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs. Additional logits processor specific kwargs.
Return: Return:
@ -276,7 +276,7 @@ class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
beginning of the generation. beginning of the generation.
Args: Args:
begin_suppress_tokens (`List[int]`): begin_suppress_tokens (`list[int]`):
Tokens to not sample. Tokens to not sample.
begin_index (`int`): begin_index (`int`):
Index where the tokens are suppressed. Index where the tokens are suppressed.

View File

@ -19,7 +19,7 @@ import copy
import inspect import inspect
import warnings import warnings
from functools import partial from functools import partial
from typing import Any, Dict, Optional, Union from typing import Any, Optional, Union
import flax import flax
import jax import jax
@ -103,7 +103,7 @@ class GreedyState:
sequences: jnp.ndarray sequences: jnp.ndarray
running_token: jnp.ndarray running_token: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
@flax.struct.dataclass @flax.struct.dataclass
@ -113,7 +113,7 @@ class SampleState:
running_token: jnp.ndarray running_token: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
prng_key: jnp.ndarray prng_key: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
@flax.struct.dataclass @flax.struct.dataclass
@ -124,7 +124,7 @@ class BeamSearchState:
sequences: jnp.ndarray sequences: jnp.ndarray
scores: jnp.ndarray scores: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
class FlaxGenerationMixin: class FlaxGenerationMixin:
@ -173,7 +173,7 @@ class FlaxGenerationMixin:
batch_size: int, batch_size: int,
decoder_start_token_id: Optional[int] = None, decoder_start_token_id: Optional[int] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
) -> jnp.ndarray: ) -> jnp.ndarray:
if model_kwargs is not None and "decoder_input_ids" in model_kwargs: if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
# Only use this arg if not None, otherwise just remove from model_kwargs # Only use this arg if not None, otherwise just remove from model_kwargs
@ -249,7 +249,7 @@ class FlaxGenerationMixin:
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}" exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
raise TypeError(exception_message) raise TypeError(exception_message)
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
unused_model_args = [] unused_model_args = []
model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters) model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
@ -273,7 +273,7 @@ class FlaxGenerationMixin:
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
prng_key: Optional[jnp.ndarray] = None, prng_key: Optional[jnp.ndarray] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
**kwargs, **kwargs,
): ):
@ -293,13 +293,13 @@ class FlaxGenerationMixin:
trace (`bool`, *optional*, defaults to `True`): trace (`bool`, *optional*, defaults to `True`):
Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
considerably slower runtime. considerably slower runtime.
params (`Dict[str, jnp.ndarray]`, *optional*): params (`dict[str, jnp.ndarray]`, *optional*):
Optionally the model parameters can be passed. Can be useful for parallelized generation. Optionally the model parameters can be passed. Can be useful for parallelized generation.
logits_processor (`FlaxLogitsProcessorList`, *optional*): logits_processor (`FlaxLogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users. generation config an error is thrown. This feature is intended for advanced users.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
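A hedged Flax sketch, assuming `openai-community/gpt2` Flax weights as a placeholder; here `params` is simply the model's own parameters, but the same hook is what parallelized generation relies on:

```python
from transformers import AutoTokenizer, FlaxAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = FlaxAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["Hello"], return_tensors="np")
out = model.generate(inputs.input_ids, params=model.params, max_length=16)
print(tokenizer.batch_decode(out.sequences, skip_special_tokens=True))
```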
@ -580,8 +580,8 @@ class FlaxGenerationMixin:
eos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
# init values # init values
max_length = max_length if max_length is not None else self.generation_config.max_length max_length = max_length if max_length is not None else self.generation_config.max_length
@ -668,8 +668,8 @@ class FlaxGenerationMixin:
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
logits_warper: Optional[FlaxLogitsProcessorList] = None, logits_warper: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
# init values # init values
max_length = max_length if max_length is not None else self.generation_config.max_length max_length = max_length if max_length is not None else self.generation_config.max_length
@ -765,9 +765,9 @@ class FlaxGenerationMixin:
early_stopping: Optional[Union[bool, str]] = None, early_stopping: Optional[Union[bool, str]] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
num_return_sequences: Optional[int] = None, num_return_sequences: Optional[int] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
""" """
This beam search function is heavily inspired by Flax's official example: This beam search function is heavily inspired by Flax's official example:

View File

@ -16,7 +16,7 @@
import inspect import inspect
import math import math
from collections.abc import Iterable from collections.abc import Iterable
from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Callable, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -72,7 +72,7 @@ class LogitsProcessorList(list):
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`): scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
beam search or log softmax for each vocabulary token when using beam search beam search or log softmax for each vocabulary token when using beam search
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional kwargs that are specific to a logits processor. Additional kwargs that are specific to a logits processor.
Return: Return:
@ -103,7 +103,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
Args: Args:
min_length (`int`): min_length (`int`):
The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`. The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
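A small runnable sketch of the behaviour these arguments describe (the vocabulary size and token ids are made up): while fewer than `min_length` tokens exist, the EOS logit is forced to `-inf`.

```python
import torch
from transformers import MinLengthLogitsProcessor

processor = MinLengthLogitsProcessor(min_length=5, eos_token_id=2)

input_ids = torch.tensor([[0, 10, 11]])  # only 3 tokens so far, below min_length
scores = torch.zeros(1, 32)              # fake logits over a 32-token vocabulary
out = processor(input_ids, scores)
print(out[0, 2])  # tensor(-inf) -> EOS cannot be sampled yet
```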
@ -134,7 +134,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, min_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"): def __init__(self, min_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
if not isinstance(min_length, int) or min_length < 0: if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}") raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")
@ -167,7 +167,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
input length. input length.
min_new_tokens (`int`): min_new_tokens (`int`):
The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`. The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
@ -197,7 +197,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
self, self,
prompt_length_to_skip: int, prompt_length_to_skip: int,
min_new_tokens: int, min_new_tokens: int,
eos_token_id: Union[int, List[int], torch.Tensor], eos_token_id: Union[int, list[int], torch.Tensor],
device: str = "cpu", device: str = "cpu",
): ):
for arg_name, arg_value in [ for arg_name, arg_value in [
@ -917,7 +917,7 @@ def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
def _calc_banned_ngram_tokens( def _calc_banned_ngram_tokens(
ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int
) -> List[Iterable[int]]: ) -> list[Iterable[int]]:
"""Copied from fairseq for no_repeat_ngram in beam_search""" """Copied from fairseq for no_repeat_ngram in beam_search"""
if cur_len + 1 < ngram_size: if cur_len + 1 < ngram_size:
# return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
@ -1074,7 +1074,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
</Tip> </Tip>
Args: Args:
sequence_bias (`List[List[Union[List[int], float]]]`): sequence_bias (`list[list[Union[list[int], float]]]`):
List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0], List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0],
[[64], -7.5]]`). Positive biases increase the odds of the [[64], -7.5]]`). Positive biases increase the odds of the
sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias
@ -1123,7 +1123,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, sequence_bias: List[List[Union[List[int], float]]]): def __init__(self, sequence_bias: list[list[Union[list[int], float]]]):
self.sequence_bias = sequence_bias self.sequence_bias = sequence_bias
self._validate_arguments() self._validate_arguments()
self._convert_list_arguments_into_dict() self._convert_list_arguments_into_dict()
@ -1250,9 +1250,9 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
</Tip> </Tip>
Args: Args:
bad_words_ids (`List[List[int]]`): bad_words_ids (`list[list[int]]`):
List of list of token ids that are not allowed to be generated. List of list of token ids that are not allowed to be generated.
eos_token_id (`Union[int, List[int], torch.Tensor]`, *optional*): eos_token_id (`Union[int, list[int], torch.Tensor]`, *optional*):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
Examples: Examples:
@ -1291,7 +1291,7 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
""" """
def __init__( def __init__(
self, bad_words_ids: List[List[int]], eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None self, bad_words_ids: list[list[int]], eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None
): ):
self.bad_word_ids = bad_words_ids self.bad_word_ids = bad_words_ids
self._validate_arguments() self._validate_arguments()
@ -1332,7 +1332,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
generation. See [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904) for more information. generation. See [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904) for more information.
Args: Args:
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`):
This function constrains the beam search to allowed tokens only at each step. This function takes 2 This function constrains the beam search to allowed tokens only at each step. This function takes 2
arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
@ -1373,7 +1373,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int): def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]], num_beams: int):
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self._num_beams = num_beams self._num_beams = num_beams
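A hedged sketch of a constraint function with this signature (placeholder checkpoint; a realistic function would inspect `input_ids` rather than return a fixed set):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

allowed_ids = tokenizer(" yes no maybe", add_special_tokens=False).input_ids

def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor) -> list[int]:
    # `input_ids` holds the tokens generated so far for this beam; ignored in this toy constraint.
    return allowed_ids

inputs = tokenizer(["The answer is"], return_tensors="pt")
out = model.generate(**inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_new_tokens=3)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```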
@ -1586,7 +1586,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
Args: Args:
max_length (`int`): max_length (`int`):
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
@ -1613,7 +1613,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, max_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"): def __init__(self, max_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
self.max_length = max_length self.max_length = max_length
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
@ -1666,7 +1666,7 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
exponential_decay_length_penalty (`tuple(int, float)`): exponential_decay_length_penalty (`tuple(int, float)`):
This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty
starts and `decay_factor` represents the factor of exponential decay starts and `decay_factor` represents the factor of exponential decay
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
input_ids_seq_length (`int`): input_ids_seq_length (`int`):
The length of the input sequence. The length of the input sequence.
@ -1726,8 +1726,8 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
def __init__( def __init__(
self, self,
exponential_decay_length_penalty: Tuple[int, float], exponential_decay_length_penalty: tuple[int, float],
eos_token_id: Union[int, List[int], torch.Tensor], eos_token_id: Union[int, list[int], torch.Tensor],
input_ids_seq_length: int, input_ids_seq_length: int,
): ):
self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
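A hedged usage sketch of the `(start_index, decay_factor)` tuple from `generate` (placeholder checkpoint): after roughly 20 new tokens, the EOS score gets an exponentially growing boost, nudging generation to finish.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["Once upon a time"], return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=64,
    exponential_decay_length_penalty=(20, 1.5),  # (start_index, decay_factor) as described above
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```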
@ -2326,13 +2326,13 @@ class BarkEosPrioritizerLogitsProcessor(LogitsProcessor):
</Tip> </Tip>
Args: Args:
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
min_eos_p (`float`, *optional*): min_eos_p (`float`, *optional*):
Minimum end of speech threshold. Minimum end of speech threshold.
""" """
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor], min_eos_p: float, device: str = "cpu"): def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor], min_eos_p: float, device: str = "cpu"):
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
@ -2569,7 +2569,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
Args: Args:
ngram_len (`int`): ngram_len (`int`):
Ngram length. Ngram length.
keys (`List[int]`): keys (`list[int]`):
A sequence of watermarking keys, one for each depth. A sequence of watermarking keys, one for each depth.
sampling_table_size (`int`): sampling_table_size (`int`):
Size of the sampling table. Size of the sampling table.
@ -2610,7 +2610,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
def __init__( def __init__(
self, self,
ngram_len: int, ngram_len: int,
keys: List[int], keys: list[int],
sampling_table_size: int, sampling_table_size: int,
sampling_table_seed: int, sampling_table_seed: int,
context_history_size: int, context_history_size: int,
@ -2808,7 +2808,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
def _compute_keys( def _compute_keys(
self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor
) -> Tuple[torch.LongTensor, torch.LongTensor]: ) -> tuple[torch.LongTensor, torch.LongTensor]:
"""Computes random keys for each ngram and depth. """Computes random keys for each ngram and depth.
Args: Args:

View File

@ -3,7 +3,7 @@ import warnings
from abc import ABC from abc import ABC
from collections import OrderedDict from collections import OrderedDict
from copy import deepcopy from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union from typing import Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -33,7 +33,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input, or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional stopping criteria specific kwargs. Additional stopping criteria specific kwargs.
Return: Return:
@ -209,7 +209,7 @@ class StopStringCriteria(StoppingCriteria):
Args: Args:
tokenizer (`PreTrainedTokenizer`): tokenizer (`PreTrainedTokenizer`):
The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences) The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences)
stop_strings (`Union[str, List[str]]`): stop_strings (`Union[str, list[str]]`):
A list of strings that should end generation. If a string is passed, it will be treated like a A list of strings that should end generation. If a string is passed, it will be treated like a
list with a single element. list with a single element.
@ -239,10 +239,10 @@ class StopStringCriteria(StoppingCriteria):
``` ```
""" """
def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, List[str]]): def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, list[str]]):
if isinstance(stop_strings, str): if isinstance(stop_strings, str):
stop_strings = [stop_strings] stop_strings = [stop_strings]
self.stop_strings: Tuple[str, ...] = tuple(stop_strings) self.stop_strings: tuple[str, ...] = tuple(stop_strings)
vocab = tokenizer.get_vocab() vocab = tokenizer.get_vocab()
token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values()) token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache( self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
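A hedged sketch using the `stop_strings` shortcut on `generate`, which builds this criteria internally (placeholder checkpoint; the tokenizer must be passed so the stop strings can be matched against the vocabulary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["The three primary colors are"], return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=30,
    stop_strings=[" and"],   # generation halts once this string appears in the decoded text
    tokenizer=tokenizer,     # needed so the criteria can map strings onto token sequences
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```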
@ -298,7 +298,7 @@ class StopStringCriteria(StoppingCriteria):
@staticmethod @staticmethod
def _stop_string_get_matching_positions( def _stop_string_get_matching_positions(
token_list, token_indices, stop_strings token_list, token_indices, stop_strings
) -> Tuple[Dict[str, Dict[str, List[int]]], Dict[str, Dict[str, List[int]]]]: ) -> tuple[dict[str, dict[str, list[int]]], dict[str, dict[str, list[int]]]]:
"""This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can """This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can
validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the
token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters
@ -337,7 +337,7 @@ class StopStringCriteria(StoppingCriteria):
return token_valid_positions, token_end_overlaps return token_valid_positions, token_end_overlaps
@staticmethod @staticmethod
def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> Dict[str, torch.tensor]: def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> dict[str, torch.tensor]:
"""This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs """This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs
them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values
that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!""" that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!"""
@ -455,11 +455,11 @@ class EosTokenCriteria(StoppingCriteria):
By default, it uses the `model.generation_config.eos_token_id`. By default, it uses the `model.generation_config.eos_token_id`.
Args: Args:
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
""" """
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor]): def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor]):
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]

View File

@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import inspect import inspect
from typing import List, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@ -42,7 +41,7 @@ TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
cur_len (`int`): cur_len (`int`):
The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
is the maximum length generate can produce, and we need to know which of its tokens are valid. is the maximum length generate can produce, and we need to know which of its tokens are valid.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs. Additional logits processor specific kwargs.
Return: Return:
@ -290,7 +289,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
[`TFLogitsProcessor`] that enforces that specified sequences will never be sampled. [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
Args: Args:
bad_words_ids (`List[List[int]]`): bad_words_ids (`list[list[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space` the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
@ -300,8 +299,8 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
The id of the *end-of-sequence* token. The id of the *end-of-sequence* token.
""" """
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): def __init__(self, bad_words_ids: list[list[int]], eos_token_id: int):
if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
@ -370,7 +369,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
# To remain simple and XLA-compatible, we work on a per-row fashion. # To remain simple and XLA-compatible, we work on a per-row fashion.
# TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes # TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
# a frequent choke point. (make `cur_len` a tensor?) # a frequent choke point. (make `cur_len` a tensor?)
def _get_row_updated_score(row_inputs: Tuple[tf.Tensor]) -> tf.Tensor: def _get_row_updated_score(row_inputs: tuple[tf.Tensor]) -> tf.Tensor:
row_input_ids, row_score = row_inputs row_input_ids, row_score = row_inputs
banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len]) banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
banned_tokens_mask = tf.scatter_nd( banned_tokens_mask = tf.scatter_nd(
@ -565,7 +564,7 @@ class TFForceTokensLogitsProcessor(TFLogitsProcessor):
indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
`-inf` so that they are sampled at their corresponding index.""" `-inf` so that they are sampled at their corresponding index."""
def __init__(self, force_token_map: List[List[int]]): def __init__(self, force_token_map: list[list[int]]):
force_token_map = dict(force_token_map) force_token_map = dict(force_token_map)
# Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
# index of the array corresponds to the index of the token to be forced, for XLA compatibility. # index of the array corresponds to the index of the token to be forced, for XLA compatibility.

View File

@ -18,7 +18,7 @@ import copy
import inspect import inspect
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union from typing import Any, Optional, Union
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@ -77,9 +77,9 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -116,12 +116,12 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -147,9 +147,9 @@ class TFSampleDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -186,12 +186,12 @@ class TFSampleEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -223,10 +223,10 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -270,13 +270,13 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -308,10 +308,10 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -354,13 +354,13 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -385,9 +385,9 @@ class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -423,12 +423,12 @@ class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput] TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
@ -477,7 +477,7 @@ class TFGenerationMixin:
def compute_transition_scores( def compute_transition_scores(
self, self,
sequences: tf.Tensor, sequences: tf.Tensor,
scores: Tuple[tf.Tensor], scores: tuple[tf.Tensor],
beam_indices: Optional[tf.Tensor] = None, beam_indices: Optional[tf.Tensor] = None,
normalize_logits: bool = False, normalize_logits: bool = False,
) -> tf.Tensor: ) -> tf.Tensor:
@ -619,7 +619,7 @@ class TFGenerationMixin:
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}" exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
raise TypeError(exception_message) raise TypeError(exception_message)
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
# Excludes arguments that are handled before calling any model function # Excludes arguments that are handled before calling any model function
if self.config.is_encoder_decoder: if self.config.is_encoder_decoder:
@ -681,10 +681,10 @@ class TFGenerationMixin:
Custom logits processors that complement the default logits processors built from arguments and Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users. generation config an error is thrown. This feature is intended for advanced users.
seed (`List[int]`, *optional*): seed (`list[int]`, *optional*):
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
`seed` argument from stateless functions in `tf.random`. `seed` argument from stateless functions in `tf.random`.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
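A minimal sketch of the two-integer stateless `seed` documented above for the TF `generate` API (checkpoint, prompt, and seed values are illustrative assumptions):

```python
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Once upon a time", return_tensors="tf")
out_a = model.generate(**inputs, do_sample=True, seed=[42, 0], max_new_tokens=20)
out_b = model.generate(**inputs, do_sample=True, seed=[42, 0], max_new_tokens=20)
# Reusing the same two-integer seed should reproduce the same sampled continuation.
```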
@ -1044,7 +1044,7 @@ class TFGenerationMixin:
def _prepare_encoder_decoder_kwargs_for_generation( def _prepare_encoder_decoder_kwargs_for_generation(
self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
) -> Dict[str, Any]: ) -> dict[str, Any]:
# 1. get encoder and store encoder outputs # 1. get encoder and store encoder outputs
encoder = self.get_encoder() encoder = self.get_encoder()
@ -1076,10 +1076,10 @@ class TFGenerationMixin:
self, self,
batch_size: int, batch_size: int,
model_input_name: str, model_input_name: str,
model_kwargs: Dict[str, tf.Tensor], model_kwargs: dict[str, tf.Tensor],
decoder_start_token_id: Optional[int] = None, decoder_start_token_id: Optional[int] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]: ) -> tuple[tf.Tensor, dict[str, tf.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models""" """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
@ -1138,7 +1138,7 @@ class TFGenerationMixin:
input_ids: Optional[tf.Tensor] = None, input_ids: Optional[tf.Tensor] = None,
expand_in_new_axis: bool = False, expand_in_new_axis: bool = False,
**model_kwargs, **model_kwargs,
) -> Tuple[tf.Tensor, Dict[str, Any]]: ) -> tuple[tf.Tensor, dict[str, Any]]:
""" """
Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...], Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
@ -1174,8 +1174,8 @@ class TFGenerationMixin:
self, self,
inputs: Optional[tf.Tensor] = None, inputs: Optional[tf.Tensor] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, tf.Tensor]] = None, model_kwargs: Optional[dict[str, tf.Tensor]] = None,
) -> Tuple[tf.Tensor, Optional[str], Dict[str, tf.Tensor]]: ) -> tuple[tf.Tensor, Optional[str], dict[str, tf.Tensor]]:
""" """
This function extracts the model-specific `inputs` for generation. This function extracts the model-specific `inputs` for generation.
""" """
@ -1240,7 +1240,7 @@ class TFGenerationMixin:
self, self,
inputs: Optional[tf.Tensor] = None, inputs: Optional[tf.Tensor] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, tf.Tensor]] = None, model_kwargs: Optional[dict[str, tf.Tensor]] = None,
) -> tf.Tensor: ) -> tf.Tensor:
"""Initializes input ids for generation, if necessary.""" """Initializes input ids for generation, if necessary."""
if inputs is not None: if inputs is not None:
@ -1276,8 +1276,8 @@ class TFGenerationMixin:
return past_key_values return past_key_values
def _update_model_kwargs_for_generation( def _update_model_kwargs_for_generation(
self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False self, outputs: ModelOutput, model_kwargs: dict[str, Any], is_encoder_decoder: bool = False
) -> Dict[str, Any]: ) -> dict[str, Any]:
# update past_key_values # update past_key_values
model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs) model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs)
@ -1294,7 +1294,7 @@ class TFGenerationMixin:
def _update_model_kwargs_for_xla_generation( def _update_model_kwargs_for_xla_generation(
self, self,
model_outputs: ModelOutput, model_outputs: ModelOutput,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
cur_len: int, cur_len: int,
max_length: int, max_length: int,
batch_size: int, batch_size: int,
@ -1550,7 +1550,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@ -1794,7 +1794,7 @@ class TFGenerationMixin:
max_length: Optional[int] = None, max_length: Optional[int] = None,
pad_token_id: Optional[int] = None, pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None,
seed: Optional[Tuple[int, int]] = None, seed: Optional[tuple[int, int]] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None, output_scores: Optional[bool] = None,
@ -1818,9 +1818,9 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
seed (`List[int]`, *optional*): seed (`list[int]`, *optional*):
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
`seed` argument from stateless functions in `tf.random`. `seed` argument from stateless functions in `tf.random`.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
@ -2128,7 +2128,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
length_penalty (`float`, *optional*, defaults to 1.0): length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
@ -2719,7 +2719,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under

View File

@ -18,7 +18,7 @@ import inspect
import os import os
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Callable, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -169,11 +169,11 @@ class GenerateDecoderOnlyOutput(ModelOutput):
""" """
sequences: torch.LongTensor sequences: torch.LongTensor
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -214,14 +214,14 @@ class GenerateEncoderDecoderOutput(ModelOutput):
""" """
sequences: torch.LongTensor sequences: torch.LongTensor
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -260,12 +260,12 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
sequences: torch.LongTensor sequences: torch.LongTensor
sequences_scores: Optional[torch.FloatTensor] = None sequences_scores: Optional[torch.FloatTensor] = None
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
beam_indices: Optional[torch.LongTensor] = None beam_indices: Optional[torch.LongTensor] = None
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -314,15 +314,15 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
sequences: torch.LongTensor sequences: torch.LongTensor
sequences_scores: Optional[torch.FloatTensor] = None sequences_scores: Optional[torch.FloatTensor] = None
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
beam_indices: Optional[torch.LongTensor] = None beam_indices: Optional[torch.LongTensor] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
# TODO (joao): remove the equivalent classes and typing shortcuts below in v5 # TODO (joao): remove the equivalent classes and typing shortcuts below in v5
@ -457,7 +457,7 @@ class GenerationMixin(ContinuousMixin):
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
inputs_embeds: Optional[torch.FloatTensor], inputs_embeds: Optional[torch.FloatTensor],
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.FloatTensor, torch.LongTensor]: ) -> tuple[torch.FloatTensor, torch.LongTensor]:
""" """
Generic cache-dependent input preparation Generic cache-dependent input preparation
The code is put in a separate function to allow granular unit testing The code is put in a separate function to allow granular unit testing
@ -491,7 +491,7 @@ class GenerationMixin(ContinuousMixin):
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
inputs_embeds: Optional[torch.FloatTensor], inputs_embeds: Optional[torch.FloatTensor],
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.FloatTensor, torch.LongTensor]: ) -> tuple[torch.FloatTensor, torch.LongTensor]:
""" """
This method implements method ``_cache_dependant_input_preparation`` This method implements method ``_cache_dependant_input_preparation``
with :func:`torch.cond` to make it exportable with :func:`torch.export.export`. with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
@ -697,8 +697,8 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs: Optional[torch.Tensor] = None, inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[torch.Tensor] = None, bos_token_id: Optional[torch.Tensor] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None, model_kwargs: Optional[dict[str, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
""" """
This function extracts the model-specific `inputs` for generation. This function extracts the model-specific `inputs` for generation.
""" """
@ -761,7 +761,7 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs: Optional[torch.Tensor] = None, inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[torch.Tensor] = None, bos_token_id: Optional[torch.Tensor] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None, model_kwargs: Optional[dict[str, torch.Tensor]] = None,
) -> torch.LongTensor: ) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary.""" """Initializes input ids for generation, if necessary."""
if inputs is not None: if inputs is not None:
@ -793,7 +793,7 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs_tensor: torch.Tensor, inputs_tensor: torch.Tensor,
generation_config: GenerationConfig, generation_config: GenerationConfig,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
) -> torch.LongTensor: ) -> torch.LongTensor:
pad_token_id = generation_config._pad_token_tensor pad_token_id = generation_config._pad_token_tensor
eos_token_id = generation_config._eos_token_tensor eos_token_id = generation_config._eos_token_tensor
@ -831,7 +831,7 @@ class GenerationMixin(ContinuousMixin):
model_kwargs, model_kwargs,
model_input_name: Optional[str], model_input_name: Optional[str],
generation_config: GenerationConfig, generation_config: GenerationConfig,
) -> Dict[str, Any]: ) -> dict[str, Any]:
# 1. get encoder # 1. get encoder
encoder = self.get_encoder() encoder = self.get_encoder()
# Compatibility with Accelerate big model inference: we need the encoder to output stuff on the same device # Compatibility with Accelerate big model inference: we need the encoder to output stuff on the same device
@ -870,10 +870,10 @@ class GenerationMixin(ContinuousMixin):
self, self,
batch_size: int, batch_size: int,
model_input_name: str, model_input_name: str,
model_kwargs: Dict[str, torch.Tensor], model_kwargs: dict[str, torch.Tensor],
decoder_start_token_id: torch.Tensor, decoder_start_token_id: torch.Tensor,
device: Optional[torch.device] = None, device: Optional[torch.device] = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models""" """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
@ -931,7 +931,7 @@ class GenerationMixin(ContinuousMixin):
is_encoder_decoder: bool = False, is_encoder_decoder: bool = False,
input_ids: Optional[torch.LongTensor] = None, input_ids: Optional[torch.LongTensor] = None,
**model_kwargs, **model_kwargs,
) -> Tuple[torch.LongTensor, Dict[str, Any]]: ) -> tuple[torch.LongTensor, dict[str, Any]]:
"""Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]""" """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
# Do not call torch.repeat_interleave if expand_size is 1 because it clones # Do not call torch.repeat_interleave if expand_size is 1 because it clones
# the input tensor and thus requires more memory although no change is applied # the input tensor and thus requires more memory although no change is applied
@ -963,10 +963,10 @@ class GenerationMixin(ContinuousMixin):
def _update_model_kwargs_for_generation( def _update_model_kwargs_for_generation(
self, self,
outputs: ModelOutput, outputs: ModelOutput,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
is_encoder_decoder: bool = False, is_encoder_decoder: bool = False,
num_new_tokens: int = 1, num_new_tokens: int = 1,
) -> Dict[str, Any]: ) -> dict[str, Any]:
# update past_key_values keeping its naming used in model code # update past_key_values keeping its naming used in model code
for possible_cache_name in ALL_CACHE_NAMES: for possible_cache_name in ALL_CACHE_NAMES:
if possible_cache_name in outputs: if possible_cache_name in outputs:
@ -1024,7 +1024,7 @@ class GenerationMixin(ContinuousMixin):
logits_processor: LogitsProcessorList, logits_processor: LogitsProcessorList,
target_tokenizer: "PreTrainedTokenizerBase", target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase",
model_kwargs: Dict, model_kwargs: dict,
) -> CandidateGenerator: ) -> CandidateGenerator:
""" """
Returns the candidate generator to be used in `assisted_generation` Returns the candidate generator to be used in `assisted_generation`
@ -1100,10 +1100,10 @@ class GenerationMixin(ContinuousMixin):
generation_config: GenerationConfig, generation_config: GenerationConfig,
input_ids_seq_length: Optional[int] = None, input_ids_seq_length: Optional[int] = None,
encoder_input_ids: torch.LongTensor = None, encoder_input_ids: torch.LongTensor = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
device: Optional[str] = None, device: Optional[str] = None,
model_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[dict[str, Any]] = None,
negative_prompt_ids: Optional[torch.Tensor] = None, negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None, negative_prompt_attention_mask: Optional[torch.Tensor] = None,
) -> LogitsProcessorList: ) -> LogitsProcessorList:
@ -1403,7 +1403,7 @@ class GenerationMixin(ContinuousMixin):
def compute_transition_scores( def compute_transition_scores(
self, self,
sequences: torch.Tensor, sequences: torch.Tensor,
scores: Tuple[torch.Tensor], scores: tuple[torch.Tensor],
beam_indices: Optional[torch.Tensor] = None, beam_indices: Optional[torch.Tensor] = None,
normalize_logits: bool = False, normalize_logits: bool = False,
) -> torch.Tensor: ) -> torch.Tensor:
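`compute_transition_scores` is the public API whose annotation changes here; for reference, a short usage sketch (model name and prompt are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Today is", return_tensors="pt")
outputs = model.generate(
    **inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True
)
# One log-probability per generated token, shape (batch_size, num_generated_tokens).
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)
print(transition_scores.shape)
```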
@ -1552,7 +1552,7 @@ class GenerationMixin(ContinuousMixin):
f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}." f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}."
) )
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
# If a `Cache` instance is passed, checks whether the model is compatible with it # If a `Cache` instance is passed, checks whether the model is compatible with it
if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class: if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
@ -1709,8 +1709,8 @@ class GenerationMixin(ContinuousMixin):
return generation_config return generation_config
def _prepare_generation_config( def _prepare_generation_config(
self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Dict self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict
) -> Tuple[GenerationConfig, Dict]: ) -> tuple[GenerationConfig, dict]:
""" """
Prepares the base generation config, then applies any generation configuration options from kwargs. This Prepares the base generation config, then applies any generation configuration options from kwargs. This
function handles retrocompatibility with respect to configuration files. function handles retrocompatibility with respect to configuration files.
@ -1821,7 +1821,7 @@ class GenerationMixin(ContinuousMixin):
model_kwargs["cache_position"] = cache_position model_kwargs["cache_position"] = cache_position
return model_kwargs return model_kwargs
def _get_layer_device_map_for_cache_init(self) -> Optional[Dict[int, Union[str, int]]]: def _get_layer_device_map_for_cache_init(self) -> Optional[dict[int, Union[str, int]]]:
""" """
Returns the device map for each decoder layer, to allocate the cache on the right device. Returns the device map for each decoder layer, to allocate the cache on the right device.
Inspired from `dispatch_model` in accelerate. Inspired from `dispatch_model` in accelerate.
@ -1982,7 +1982,7 @@ class GenerationMixin(ContinuousMixin):
def _prepare_cache_for_generation( def _prepare_cache_for_generation(
self, self,
generation_config: GenerationConfig, generation_config: GenerationConfig,
model_kwargs: Dict, model_kwargs: dict,
assistant_model: "PreTrainedModel", assistant_model: "PreTrainedModel",
batch_size: int, batch_size: int,
max_cache_length: int, max_cache_length: int,
@ -2191,7 +2191,7 @@ class GenerationMixin(ContinuousMixin):
generation_config._pad_token_tensor = pad_token_tensor generation_config._pad_token_tensor = pad_token_tensor
generation_config._decoder_start_token_tensor = decoder_start_token_tensor generation_config._decoder_start_token_tensor = decoder_start_token_tensor
def _valid_auto_compile_criteria(self, model_kwargs: Dict, generation_config: GenerationConfig) -> bool: def _valid_auto_compile_criteria(self, model_kwargs: dict, generation_config: GenerationConfig) -> bool:
""" """
Determines whether to trigger auto-compilation of the model's forward pass at generation time. Determines whether to trigger auto-compilation of the model's forward pass at generation time.
""" """
@ -2239,7 +2239,7 @@ class GenerationMixin(ContinuousMixin):
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
synced_gpus: Optional[bool] = None, synced_gpus: Optional[bool] = None,
assistant_model: Optional["PreTrainedModel"] = None, assistant_model: Optional["PreTrainedModel"] = None,
streamer: Optional["BaseStreamer"] = None, streamer: Optional["BaseStreamer"] = None,
@ -2287,7 +2287,7 @@ class GenerationMixin(ContinuousMixin):
generation config an error is thrown. If your stopping criteria depends on the `scores` input, make generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
intended for advanced users. intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
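A short sketch of a callable satisfying the `prefix_allowed_tokens_fn` contract described above (the checkpoint and the particular restriction are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Paris is the capital of", return_tensors="pt")


def prefix_allowed_tokens_fn(batch_id, input_ids):
    # Constrain every step to an arbitrary slice of the vocabulary (illustration only).
    return list(range(1000))


outputs = model.generate(
    **inputs, num_beams=4, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_new_tokens=10
)
```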
@ -2321,7 +2321,7 @@ class GenerationMixin(ContinuousMixin):
function defined in that repository's `custom_generate/generate.py` file will be executed instead of the function defined in that repository's `custom_generate/generate.py` file will be executed instead of the
standard `generate` method. Note that the logic for generation is entirely defined in that standard `generate` method. Note that the logic for generation is entirely defined in that
repository, and the return type may be different from the standard `generate` method. repository, and the return type may be different from the standard `generate` method.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
@ -2695,7 +2695,7 @@ class GenerationMixin(ContinuousMixin):
def typeerror(): def typeerror():
raise ValueError( raise ValueError(
"`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` " "`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]` "
f"of positive integers, but is {generation_config.force_words_ids}." f"of positive integers, but is {generation_config.force_words_ids}."
) )
@ -2871,7 +2871,7 @@ class GenerationMixin(ContinuousMixin):
def _dola_decoding( def _dola_decoding(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
dola_layers: Union[str, List[int]], dola_layers: Union[str, list[int]],
logits_processor: LogitsProcessorList, logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList, stopping_criteria: StoppingCriteriaList,
generation_config: GenerationConfig, generation_config: GenerationConfig,
@ -2888,7 +2888,7 @@ class GenerationMixin(ContinuousMixin):
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
dola_layers (`Union[str, List[int]]`): dola_layers (`Union[str, list[int]]`):
The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
to be used for candidate layers. The 0-th layer is the word embedding layer of the model. to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
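A hedged sketch of DoLa decoding as described by the `dola_layers` docstring above, assuming `generate` exposes the argument as documented (checkpoint and prompt are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The tallest mountain on Earth is", return_tensors="pt")

# "high" contrasts the final layer with the higher part of the model's layers;
# a list such as dola_layers=[4, 6, 8] would name candidate layers explicitly.
outputs = model.generate(**inputs, dola_layers="high", do_sample=False, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```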
@ -3806,7 +3806,7 @@ class GenerationMixin(ContinuousMixin):
num_beams: int, num_beams: int,
vocab_size: int, vocab_size: int,
batch_size: int, batch_size: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Get top-K continuations given the accumulated log probs on the next token. Get top-K continuations given the accumulated log probs on the next token.
@ -3855,7 +3855,7 @@ class GenerationMixin(ContinuousMixin):
topk_running_beam_indices: torch.Tensor, topk_running_beam_indices: torch.Tensor,
next_token_hits_stopping_criteria: torch.Tensor, next_token_hits_stopping_criteria: torch.Tensor,
num_beams: int, num_beams: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
best non-finished beams to continue beam search in the next iteration. best non-finished beams to continue beam search in the next iteration.
@ -3886,7 +3886,7 @@ class GenerationMixin(ContinuousMixin):
decoder_prompt_len: int, decoder_prompt_len: int,
length_penalty: float, length_penalty: float,
early_stopping: Union[bool, str], early_stopping: Union[bool, str],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Updates the finished beams if (and only if) there are new completed sequences that have a higher score than Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
the current finished sequences. the current finished sequences.
@ -5236,8 +5236,8 @@ def _split(data, full_batch_size: int, split_size: int):
def _split_model_inputs( def _split_model_inputs(
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig model_input: Union[ModelOutput, dict], split_size: int, full_batch_size: int, config: PretrainedConfig
) -> List[Union[ModelOutput, Dict]]: ) -> list[Union[ModelOutput, dict]]:
""" """
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
@ -5292,14 +5292,14 @@ def _split_model_inputs(
] ]
# Convert each dictionary in the list to an object of the inferred class # Convert each dictionary in the list to an object of the inferred class
split_model_inputs: List[Union[ModelOutput, Dict]] = [ split_model_inputs: list[Union[ModelOutput, dict]] = [
model_output_cls(**data_split, **bool_data) for data_split in data_split_list model_output_cls(**data_split, **bool_data) for data_split in data_split_list
] ]
return split_model_inputs return split_model_inputs
def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput: def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConfig) -> ModelOutput:
""" """
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
specific ModelOutput subclass from the list provided. specific ModelOutput subclass from the list provided.
@ -5379,8 +5379,8 @@ def _relative_top_filter(
def _dola_select_contrast( def _dola_select_contrast(
candidate_premature_layers: List[int], candidate_premature_layers: list[int],
candidate_premature_logits: Dict[int, torch.FloatTensor], candidate_premature_logits: dict[int, torch.FloatTensor],
final_logits: torch.FloatTensor, final_logits: torch.FloatTensor,
) -> torch.FloatTensor: ) -> torch.FloatTensor:
if len(candidate_premature_layers) == 1: if len(candidate_premature_layers) == 1:

View File

@ -16,7 +16,7 @@
import collections import collections
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from typing import Any, Dict, Optional, Tuple, Union from typing import Any, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -126,7 +126,7 @@ class WatermarkDetector:
self, self,
model_config: PretrainedConfig, model_config: PretrainedConfig,
device: str, device: str,
watermarking_config: Union[WatermarkingConfig, Dict], watermarking_config: Union[WatermarkingConfig, dict],
ignore_repeated_ngrams: bool = False, ignore_repeated_ngrams: bool = False,
max_cache_size: int = 128, max_cache_size: int = 128,
): ):
@ -300,7 +300,7 @@ class BayesianDetectorWatermarkedLikelihood(nn.Module):
self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth)) self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth))
self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth)) self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth))
def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: def _compute_latents(self, g_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Computes the unique token probability distribution given g-values. """Computes the unique token probability distribution given g-values.
Args: Args:

View File

@ -81,7 +81,7 @@ def HfArg(
``` ```
Args: Args:
aliases (Union[str, List[str]], optional): aliases (Union[str, list[str]], optional):
Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`. Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`.
Defaults to None. Defaults to None.
help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None. help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None.
@ -119,7 +119,7 @@ class HfArgumentParser(ArgumentParser):
Args: Args:
dataclass_types (`DataClassType` or `Iterable[DataClassType]`, *optional*): dataclass_types (`DataClassType` or `Iterable[DataClassType]`, *optional*):
Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Passed to `argparse.ArgumentParser()` in the regular way. Passed to `argparse.ArgumentParser()` in the regular way.
""" """

View File

@ -127,7 +127,7 @@ class ImageProcessingMixin(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -153,7 +153,7 @@ class ImageProcessingMixin(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are image processor attributes will be used to override the The values in kwargs of any keys which are image processor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
controlled by the `return_unused_kwargs` keyword parameter. controlled by the `return_unused_kwargs` keyword parameter.
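A hedged sketch of `from_pretrained` with the kwargs documented above; attributes passed as keyword arguments override the values loaded from the checkpoint (checkpoint and override value are illustrative):

```python
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    do_resize=False,  # image-processor attribute supplied via kwargs overrides the loaded value
)
print(image_processor.do_resize)  # False
```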
@ -219,7 +219,7 @@ class ImageProcessingMixin(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
use_auth_token = kwargs.pop("use_auth_token", None) use_auth_token = kwargs.pop("use_auth_token", None)
@ -286,7 +286,7 @@ class ImageProcessingMixin(PushToHubMixin):
The name of the file in the model directory to use for the image processor config. The name of the file in the model directory to use for the image processor config.
Returns: Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
""" """
cache_dir = kwargs.pop("cache_dir", None) cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False) force_download = kwargs.pop("force_download", False)
@ -387,11 +387,11 @@ class ImageProcessingMixin(PushToHubMixin):
Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters. Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
Args: Args:
image_processor_dict (`Dict[str, Any]`): image_processor_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the image processor object. Such a dictionary can be Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the retrieved from a pretrained checkpoint by leveraging the
[`~image_processing_utils.ImageProcessingMixin.to_dict`] method. [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the image processor object. Additional parameters from which to initialize the image processor object.
Returns: Returns:
@ -431,7 +431,7 @@ class ImageProcessingMixin(PushToHubMixin):
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. `dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
output["image_processor_type"] = self.__class__.__name__ output["image_processor_type"] = self.__class__.__name__

View File

@ -130,7 +130,7 @@ class BaseImageProcessor(ImageProcessingMixin):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to center crop. Image to center crop.
size (`Dict[str, int]`): size (`dict[str, int]`):
Size of the output image. Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*): data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input The channel dimension format for the output image. If unset, the channel dimension format of the input
@ -227,7 +227,7 @@ def get_size_dict(
is set, it is added to the dict as `{"longest_edge": max_size}`. is set, it is added to the dict as `{"longest_edge": max_size}`.
Args: Args:
size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): size (`Union[int, Iterable[int], dict[str, int]]`, *optional*):
The `size` parameter to be cast into a size dictionary. The `size` parameter to be cast into a size dictionary.
max_size (`Optional[int]`, *optional*): max_size (`Optional[int]`, *optional*):
The `max_size` parameter to be cast into a size dictionary. The `max_size` parameter to be cast into a size dictionary.
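An illustrative sketch of the normalization `get_size_dict` performs, assuming the helper behaves as its docstring above describes (the concrete values are made up):

```python
from transformers.image_processing_utils import get_size_dict

print(get_size_dict(224))                           # {"height": 224, "width": 224}
print(get_size_dict(224, default_to_square=False))  # {"shortest_edge": 224}
print(get_size_dict(224, max_size=448, default_to_square=False))
# {"shortest_edge": 224, "longest_edge": 448}
```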

View File

@ -382,7 +382,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
Args: Args:
image (`"torch.Tensor"`): image (`"torch.Tensor"`):
Image to center crop. Image to center crop.
size (`Dict[str, int]`): size (`dict[str, int]`):
Size of the output image. Size of the output image.
Returns: Returns:
@ -666,12 +666,12 @@ class SemanticSegmentationMixin:
Args: Args:
outputs ([`MobileNetV2ForSemanticSegmentation`]): outputs ([`MobileNetV2ForSemanticSegmentation`]):
Raw outputs of the model. Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*): target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
predictions will not be resized. predictions will not be resized.
Returns: Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id. specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
""" """

View File

@@ -217,7 +217,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in
Computes the output image size given the input image size and the desired output size.
Args:
- image_size (`Tuple[int, int]`):
+ image_size (`tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
@@ -266,7 +266,7 @@ def get_resize_output_image_size(
Args:
input_image (`np.ndarray`):
The image to resize.
- size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
+ size (`int` or `tuple[int, int]` or list[int] or `tuple[int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
this.
@@ -334,7 +334,7 @@ def resize(
Args:
image (`np.ndarray`):
The image to resize.
- size (`Tuple[int, int]`):
+ size (`tuple[int, int]`):
The size to use for resizing the image.
resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
The filter to user for resampling.
@@ -464,7 +464,7 @@ def center_crop(
Args:
image (`np.ndarray`):
The image to crop.
- size (`Tuple[int, int]`):
+ size (`tuple[int, int]`):
The target size for the cropped image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. Can be one of:
@@ -704,7 +704,7 @@ def pad(
Args:
image (`np.ndarray`):
The image to pad.
- padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
+ padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
Padding to apply to the edges of the height, width axes. Can be one of three formats:
- `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
- `((before, after),)` yields same before and after pad for height and width.
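The `padding` formats listed for `pad` mirror what `np.pad` accepts on the spatial axes. A small plain-NumPy sketch of the first two forms (not the library function itself):

```python
import numpy as np

image = np.zeros((4, 4, 3))  # (height, width, channels), illustrative

# ((before_height, after_height), (before_width, after_width)): distinct pads per spatial axis.
padded = np.pad(image, ((1, 2), (3, 4), (0, 0)), mode="constant", constant_values=0)
print(padded.shape)  # (7, 11, 3)

# ((before, after),) means the same pad for height and width -> equivalent to ((1, 2), (1, 2)).
padded_same = np.pad(image, ((1, 2), (1, 2), (0, 0)), mode="constant")
print(padded_same.shape)  # (7, 7, 3)
```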

@@ -218,7 +218,7 @@ def make_flat_list_of_images(
Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
If the input is a nested list of images, it is converted to a flat list of images.
Args:
- images (`Union[List[ImageInput], ImageInput]`):
+ images (`Union[list[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images or a 4d array of images.
@@ -252,7 +252,7 @@ def make_nested_list_of_images(
"""
Ensure that the output is a nested list of images.
Args:
- images (`Union[List[ImageInput], ImageInput]`):
+ images (`Union[list[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of list of images or a list of 4d array of images.
@@ -300,7 +300,7 @@ def infer_channel_dimension_format(
Args:
image (`np.ndarray`):
The image to infer the channel dimension of.
- num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
+ num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
The number of channels of the image.
Returns:
@@ -393,7 +393,7 @@ def get_image_size_for_max_height_width(
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
- image_size (`Tuple[int, int]`):
+ image_size (`tuple[int, int]`):
The image to resize.
max_height (`int`):
The maximum allowed height.
@@ -678,9 +678,9 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize.
- mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
+ mean (`list[float]` or `np.ndarray` or `torch.Tensor`):
The mean (per channel) to use for normalization.
- std (`List[float]` or `np.ndarray` or `torch.Tensor`):
+ std (`list[float]` or `np.ndarray` or `torch.Tensor`):
The standard deviation (per channel) to use for normalization.
rescale (`bool`, *optional*, defaults to `False`):
Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
@@ -729,7 +729,7 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (`int` or `Tuple[int, int]`):
+ size (`int` or `tuple[int, int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
matched to this.
@@ -797,7 +797,7 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
The image to resize.
- size (`int` or `Tuple[int, int]`):
+ size (`int` or `tuple[int, int]`):
The size to which crop the image.
Returns:
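`infer_channel_dimension_format` decides whether an array is channels-first or channels-last by checking which axis length appears in `num_channels` (default `(1, 3)`). A naive sketch of that check, independent of the library code and ignoring ambiguous cases:

```python
import numpy as np


def guess_channel_axis(image: np.ndarray, num_channels: tuple[int, ...] = (1, 3)) -> str:
    """Very rough guess: 'channels_first' if axis 0 looks like channels, else 'channels_last'."""
    if image.ndim != 3:
        raise ValueError(f"Expected a 3D array, got shape {image.shape}")
    if image.shape[0] in num_channels:
        return "channels_first"
    if image.shape[-1] in num_channels:
        return "channels_last"
    raise ValueError(f"Could not infer channel axis for shape {image.shape}")


print(guess_channel_axis(np.zeros((3, 224, 224))))  # channels_first
print(guess_channel_axis(np.zeros((224, 224, 3))))  # channels_last
```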

@@ -156,7 +156,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
model (`torch.nn.Module`): The model to inspect.
Returns:
- List[List[str]]: A list of lists of parameter names being all tied together.
+ list[list[str]]: A list of lists of parameter names being all tied together.
Example:
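`find_tied_parameters` returns a `list[list[str]]`, each inner list naming parameters that share storage. A naive way to surface such groups (illustrative only, not the accelerate/transformers implementation):

```python
from collections import defaultdict

from torch import nn

model = nn.Linear(4, 4)
model.tied_copy = nn.Linear(4, 4, bias=False)
model.tied_copy.weight = model.weight  # tie the two weight tensors

groups: dict[int, list[str]] = defaultdict(list)
for name, param in model.named_parameters(remove_duplicate=False):
    groups[param.data_ptr()].append(name)  # parameters sharing a data pointer are tied

tied: list[list[str]] = [names for names in groups.values() if len(names) > 1]
print(tied)  # [['weight', 'tied_copy.weight']]
```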

@@ -306,7 +306,7 @@ def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
Fuse the LayerNorm layers into a target class using autoawq
Args:
- fuse_module_names (`List[str]`):
+ fuse_module_names (`list[str]`):
The list of module names to fuse
module (`nn.Module`):
The pytorch parent module that has layernorm modules to fuse
@@ -333,7 +333,7 @@ def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_
The input pretrained model
current_module_name (`str`):
The current submodule name
- fuse_module_names (`List[str]`):
+ fuse_module_names (`list[str]`):
The list of module names to fuse. For the MLP layers it has to be an array
of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
module (`nn.Module`):
@@ -374,7 +374,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
The input pretrained model
module (`nn.Module`):
The pytorch parent module that has layernorm modules to fuse
- modules_to_fuse (`List[str]`):
+ modules_to_fuse (`list[str]`):
The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
in the correct order: q, k, v, o layer
current_module_name (`str`):
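The `modules_to_fuse` mapping described above is a plain dictionary of module-name lists. An illustrative shape for such a mapping, with hypothetical names for a LLaMA-like block layout (the exact keys expected by the AWQ fusing code live in the library, not here):

```python
# Hypothetical fusing map, for illustration only.
modules_to_fuse: dict[str, list[str]] = {
    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],        # q, k, v, o order as documented
    "mlp": ["gate_proj", "up_proj", "down_proj"],                 # gate / up / down, length 3
    "layernorm": ["input_layernorm", "post_attention_layernorm"],
}
```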

@@ -398,10 +398,10 @@ def replace_with_bitnet_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `BitLinear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -243,10 +243,10 @@ def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -93,10 +93,10 @@ def replace_with_eetq_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `EetqLinear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -251,10 +251,10 @@ def replace_with_fbgemm_fp8_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `FP8Linear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).
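The four `replace_with_*_linear` helpers touched above share one shape: walk the module tree recursively, track `current_key_name`, and skip anything listed in `modules_to_not_convert`. A generic sketch of that traversal pattern (simplified, not any of the actual quantizer implementations, which also convert the weights):

```python
from typing import Optional

from torch import nn


def replace_linear(
    model: nn.Module,
    new_cls: type,
    modules_to_not_convert: list[str] = ["lm_head"],
    current_key_name: Optional[list[str]] = None,
) -> nn.Module:
    """Recursively swap nn.Linear children for `new_cls`, skipping names in `modules_to_not_convert`."""
    current_key_name = current_key_name if current_key_name is not None else []
    for name, child in model.named_children():
        current_key_name.append(name)
        full_name = ".".join(current_key_name)
        if isinstance(child, nn.Linear) and not any(skip in full_name for skip in modules_to_not_convert):
            # A real quantizer would also copy/convert the existing weights here.
            setattr(model, name, new_cls(child.in_features, child.out_features, bias=child.bias is not None))
        else:
            replace_linear(child, new_cls, modules_to_not_convert, current_key_name)
        current_key_name.pop()
    return model


model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
print(replace_linear(model, nn.Linear))  # both Linear layers re-created via `new_cls`
```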

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import List, Optional, Tuple
+ from typing import Optional
from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging
@@ -45,7 +45,7 @@ def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
tl.store(s_ptr + pid, s)
- def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
+ def act_quant(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
assert x.is_contiguous()
assert x.shape[-1] % block_size == 0
y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
@@ -149,7 +149,7 @@ def w8a8_block_fp8_matmul_triton(
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
- block_size: List[int],
+ block_size: list[int],
output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""This function performs matrix multiplication with block-wise
@@ -231,7 +231,7 @@ def w8a8_block_fp8_matmul_compile(
weight_q: torch.Tensor, # [out_features, hidden_dim]
input_scale: torch.Tensor, # [batch * seq_len, num_input_groups]
weight_scale: torch.Tensor, # [num_weight_blocks_m, num_weight_blocks_n]
- block_size: Optional[Tuple[int, int]] = None, # (M=128, N=128) for weights for example
+ block_size: Optional[tuple[int, int]] = None, # (M=128, N=128) for weights for example
output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""
@@ -300,7 +300,7 @@ class FP8Linear(nn.Linear):
out_features: int,
bias: bool = False,
dtype=None,
- block_size: Optional[Tuple[int, int]] = None,
+ block_size: Optional[tuple[int, int]] = None,
device=None,
activation_scheme="dynamic",
):
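`act_quant` above quantizes activations in groups of `block_size` along the last dimension and returns the quantized tensor plus one scale per block, hence the `tuple[torch.Tensor, torch.Tensor]` return. A pure-PyTorch sketch of the same idea, without the Triton kernel (assumes a PyTorch build with `float8_e4m3fn`; the 448 constant is that format's max value):

```python
import torch


def act_quant_reference(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
    assert x.shape[-1] % block_size == 0
    blocks = x.reshape(*x.shape[:-1], -1, block_size)           # (..., num_blocks, block_size)
    scales = blocks.abs().amax(dim=-1, keepdim=True) / 448.0    # one scale per block
    y = (blocks / scales).reshape_as(x).to(torch.float8_e4m3fn)
    return y, scales.squeeze(-1)


x = torch.randn(2, 256)
y, s = act_quant_reference(x)
print(y.shape, s.shape)  # torch.Size([2, 256]) torch.Size([2, 2])
```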

@@ -1,4 +1,4 @@
- from typing import Optional, Tuple
+ from typing import Optional
import torch
@@ -22,7 +22,7 @@ def flash_attention_forward(
sliding_window: Optional[int] = None,
softcap: Optional[float] = None,
**kwargs,
- ) -> Tuple[torch.Tensor, None]:
+ ) -> tuple[torch.Tensor, None]:
if kwargs.get("output_attentions", False) or kwargs.get("head_mask", None) is not None:
logger.warning_once(
"`flash_attention_2` does not support `output_attentions=True` or `head_mask`."

@@ -26,7 +26,7 @@ Citation:
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import Optional, Tuple, Union
+ from typing import Optional, Union
import torch
from packaging import version
@@ -106,7 +106,7 @@ def make_flex_block_causal_mask(
attention_chunk_size: Optional[int] = None,
query_length=None,
key_length=None,
- offsets: Optional[Tuple[Offset, Offset]] = None,
+ offsets: Optional[tuple[Offset, Offset]] = None,
is_causal: Optional[bool] = True,
) -> "BlockMask":
"""
@@ -234,7 +234,7 @@ def flex_attention_forward(
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
**kwargs,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
if head_mask is not None:
logger.warning_once(
"`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."

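`make_flex_block_causal_mask` wraps the `BlockMask` construction from PyTorch's FlexAttention API. As a rough idea of what a causal block mask looks like with the upstream API (PyTorch >= 2.5 assumed; this is not the transformers helper itself, and on GPU you would normally leave the default device and pass the mask to `flex_attention`):

```python
from torch.nn.attention.flex_attention import create_block_mask


def causal(batch, head, q_idx, kv_idx):
    return q_idx >= kv_idx  # keep only keys at or before the query position


# Build a 128x128 causal BlockMask on CPU just to inspect it.
block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=128, KV_LEN=128, device="cpu")
print(block_mask)
```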
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import Dict, Union
+ from typing import Union
from ..utils import is_torchdynamo_compiling
@@ -29,7 +29,7 @@ try:
_hub_kernels_available = True
- _KERNEL_MAPPING: Dict[str, Dict[Union[Device, str], LayerRepository]] = {
+ _KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
"MultiScaleDeformableAttention": {
"cuda": LayerRepository(
repo_id="kernels-community/deformable-detr",

@@ -29,7 +29,7 @@ import tempfile
from dataclasses import asdict, fields
from enum import Enum
from pathlib import Path
- from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
import numpy as np
import packaging.version
@@ -1692,7 +1692,7 @@ class NeptuneCallback(TrainerCallback):
raise Exception("The trainer doesn't have a NeptuneCallback configured.")
- def on_log(self, args, state, control, logs: Optional[Dict[str, float]] = None, **kwargs):
+ def on_log(self, args, state, control, logs: Optional[dict[str, float]] = None, **kwargs):
if not state.is_world_process_zero:
return

@@ -16,7 +16,7 @@ import importlib
import inspect
import re
import warnings
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Optional, Union
from packaging import version
@@ -100,11 +100,11 @@ class PeftAdapterMixin:
max_memory: Optional[str] = None,
offload_folder: Optional[str] = None,
offload_index: Optional[int] = None,
- peft_config: Optional[Dict[str, Any]] = None,
+ peft_config: Optional[dict[str, Any]] = None,
- adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None,
+ adapter_state_dict: Optional[dict[str, "torch.Tensor"]] = None,
low_cpu_mem_usage: bool = False,
is_trainable: bool = False,
- adapter_kwargs: Optional[Dict[str, Any]] = None,
+ adapter_kwargs: Optional[dict[str, Any]] = None,
) -> None:
"""
Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we
@@ -133,7 +133,7 @@ class PeftAdapterMixin:
Whether to use authentication token to load the remote folder. Useful to load private repositories
that are on HuggingFace Hub. You might need to call `huggingface-cli login` and paste your tokens to
cache it.
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
+ device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
A map that specifies where each submodule should go. It doesn't need to be refined to each
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
@@ -150,10 +150,10 @@ class PeftAdapterMixin:
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
offload_index (`int`, `optional`):
`offload_index` argument to be passed to `accelerate.dispatch_model` method.
- peft_config (`Dict[str, Any]`, *optional*):
+ peft_config (`dict[str, Any]`, *optional*):
The configuration of the adapter to add, supported adapters are non-prefix tuning and adaption prompts
methods. This argument is used in case users directly pass PEFT state dicts
- adapter_state_dict (`Dict[str, torch.Tensor]`, *optional*):
+ adapter_state_dict (`dict[str, torch.Tensor]`, *optional*):
The state dict of the adapter to load. This argument is used in case users directly pass PEFT state
dicts
low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
@@ -162,7 +162,7 @@ class PeftAdapterMixin:
is_trainable (`bool`, *optional*, defaults to `False`):
Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
used for inference.
- adapter_kwargs (`Dict[str, Any]`, *optional*):
+ adapter_kwargs (`dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and
`find_adapter_config_file` method.
"""
@@ -348,7 +348,7 @@ class PeftAdapterMixin:
self.set_adapter(adapter_name)
- def set_adapter(self, adapter_name: Union[List[str], str]) -> None:
+ def set_adapter(self, adapter_name: Union[list[str], str]) -> None:
"""
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
official documentation: https://huggingface.co/docs/peft
@@ -356,7 +356,7 @@ class PeftAdapterMixin:
Sets a specific adapter by forcing the model to use a that adapter and disable the other adapters.
Args:
- adapter_name (`Union[List[str], str]`):
+ adapter_name (`Union[list[str], str]`):
The name of the adapter to set. Can be also a list of strings to set multiple adapters.
"""
check_peft_version(min_version=MIN_PEFT_VERSION)
@@ -438,7 +438,7 @@ class PeftAdapterMixin:
else:
module.disable_adapters = False
- def active_adapters(self) -> List[str]:
+ def active_adapters(self) -> list[str]:
"""
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
official documentation: https://huggingface.co/docs/peft
@@ -518,7 +518,7 @@ class PeftAdapterMixin:
accelerate (i.e. with `device_map=xxx`)
Args:
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
+ device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
A map that specifies where each submodule should go. It doesn't need to be refined to each
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
@@ -562,12 +562,12 @@ class PeftAdapterMixin:
**dispatch_model_kwargs,
)
- def delete_adapter(self, adapter_names: Union[List[str], str]) -> None:
+ def delete_adapter(self, adapter_names: Union[list[str], str]) -> None:
"""
Delete an adapter's LoRA layers from the underlying model.
Args:
- adapter_names (`Union[List[str], str]`):
+ adapter_names (`Union[list[str], str]`):
The name(s) of the adapter(s) to delete.
Example:
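The diff is cut off right after the `Example:` marker above, so purely as an illustrative sketch of the `PeftAdapterMixin` surface touched in this file (checkpoint and adapter ids are placeholders, and `peft` must be installed):

```python
from transformers import AutoModelForCausalLM

# Placeholder repo ids, for illustration only.
model = AutoModelForCausalLM.from_pretrained("base-model-id")
model.load_adapter("user/lora-adapter-id", adapter_name="my_adapter")

model.set_adapter("my_adapter")     # accepts a str or a list[str]
print(model.active_adapters())      # -> list[str], e.g. ['my_adapter']

model.delete_adapter("my_adapter")  # remove the LoRA layers again
```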

Some files were not shown because too many files have changed in this diff.