Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-02 04:10:06 +06:00)
No more Tuple, List, Dict (#38797)
* No more Tuple, List, Dict
* make fixup
* More style fixes
* Docstring fixes with regex replacement
* Trigger tests
* Redo fixes after rebase
* Fix copies
* [test all]
* update
* [test all]
* update
* [test all]
* make style after rebase
* Patch the hf_argparser test
* Patch the hf_argparser test
* style fixes
* style fixes
* style fixes
* Fix docstrings in Cohere test
* [test all]

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent a396f4324b
commit 508a704055
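The change itself is mechanical: type hints and docstrings that used `typing.Tuple`, `typing.List`, and `typing.Dict` now use the built-in generics `tuple`, `list`, and `dict`, which are subscriptable from Python 3.9 onward (PEP 585). A minimal sketch of the before/after style, using a hypothetical helper rather than code from this commit:

```python
# Hypothetical example (not from the repository) showing the annotation style
# this commit adopts: built-in generics instead of typing.List/Dict/Tuple.
#
# Before: def split_records(records: List[Dict[str, int]]) -> Tuple[List[int], List[int]]:
def split_records(records: list[dict[str, int]]) -> tuple[list[int], list[int]]:
    """Split {"label": ..., "value": ...} records into two parallel lists."""
    labels: list[int] = [r["label"] for r in records]
    values: list[int] = [r["value"] for r in records]
    return labels, values


print(split_records([{"label": 0, "value": 7}, {"label": 1, "value": 9}]))
```

Note that `Optional` and `Union` have no built-in replacement before the `X | Y` syntax of Python 3.10, which is why they remain imported from `typing` throughout the diff below.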
@@ -28,7 +28,7 @@ class MetricsRecorder:
 self.commit_id = commit_id
 self.commit_msg = commit_msg

-def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+def initialise_benchmark(self, metadata: dict[str, str]) -> int:
 """
 Creates a new benchmark, returns the benchmark id
 """
@@ -55,7 +55,7 @@ class MetricsRecorder:
 f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
 )

-def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
 with self.conn.cursor() as cur:
 cur.execute(
 """
@@ -85,7 +85,7 @@ handler.setFormatter(formatter)
 logger.addHandler(handler)


-def parse_arguments() -> Tuple[str, str, str, str]:
+def parse_arguments() -> tuple[str, str, str, str]:
 """
 Parse command line arguments for the benchmarking CLI.
 """
@@ -278,7 +278,7 @@ Here's an example of a single value return:

 ```python
 Returns:
-    `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+    `list[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
 ```

 Here's an example of a tuple return, comprising several objects:
@@ -30,7 +30,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -571,7 +571,7 @@ The processor should call the appropriate modality-specific processors within it
 def __call__(
 self,
 images: ImageInput = None,
-text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
 audio=None,
 videos=None,
 **kwargs: Unpack[YourModelProcessorKwargs],
@@ -92,7 +92,7 @@ def custom_attention(
 a_new_kwargs = None, # You can now add as many kwargs as you need
 another_new_kwargs = None, # You can now add as many kwargs as you need
 **kwargs, # You need to accept **kwargs as models will pass other args
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]
 ... # do your magic!
 return attn_output, attn_weights # attn_weights are optional here

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -152,7 +152,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 | `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
 | `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
 | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
-| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |


 ## Pitfalls
@@ -62,11 +62,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

 box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-# encode(tokenize) each word from words (List[str])
-input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+# encode(tokenize) each word from words (list[str])
+input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

 # get the length of each box
-tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+tokens_length_list: list[int] = [len(l) for l in input_ids_list]

 box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
 box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)
@@ -149,7 +149,7 @@ As a summary, consider the following table:
 | **Description** | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as "stuff" (i.e. background things like trees and roads) in an image |
 | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
 | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

@@ -83,7 +83,7 @@ def read_video_pyav(container, indices):
 Decode the video with PyAV decoder.
 Args:
 container (`av.container.input.InputContainer`): PyAV container.
-indices (`List[int]`): List of frame indices to decode.
+indices (`list[int]`): List of frame indices to decode.
 Returns:
 result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
 '''
@@ -216,12 +216,12 @@ class Olmo2Attention(OlmoAttention):
 def forward(
 self,
 hidden_states: torch.Tensor,
-position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+position_embeddings: tuple[torch.Tensor, torch.Tensor],
 attention_mask: Optional[torch.Tensor],
 past_key_value: Optional[Cache] = None,
 cache_position: Optional[torch.LongTensor] = None,
 **kwargs,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 input_shape = hidden_states.shape[:-1]
 hidden_shape = (*input_shape, -1, self.head_dim)

@@ -294,9 +294,9 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
 output_attentions: Optional[bool] = False,
 use_cache: Optional[bool] = False,
 cache_position: Optional[torch.LongTensor] = None,
-position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 **kwargs,
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
 residual = hidden_states

 # Self Attention
@@ -494,7 +494,7 @@ class LlamaForCausalLM(nn.Module):
 input_ids: torch.LongTensor = None,
 attention_mask: Optional[torch.Tensor] = None,
 position_ids: Optional[torch.LongTensor] = None,
-past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
 inputs_embeds: Optional[torch.FloatTensor] = None,
 labels: Optional[torch.LongTensor] = None,
 use_cache: Optional[bool] = None,
@@ -520,7 +520,7 @@ class NewModelForCausalLM(LlamaForCausalLM): | class LlamaForCausalLM(nn.M
 | input_ids: torch.LongTensor = None,
 | attention_mask: Optional[torch.Tensor] = None,
 | position_ids: Optional[torch.LongTensor] = None,
-| past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None,
+| past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = |None,
 | inputs_embeds: Optional[torch.FloatTensor] = None,
 | labels: Optional[torch.LongTensor] = None,
 | use_cache: Optional[bool] = None,
@@ -170,7 +170,7 @@ Unlike other data collators, this specific data collator needs to apply a differ
 ... processor: AutoProcessor
 ... padding: Union[bool, str] = "longest"

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... # split inputs and labels since they have to be of different lengths and need
 ... # different padding methods
 ... input_features = [{"input_values": feature["input_values"][0]} for feature in features]
@@ -243,7 +243,7 @@ and it uses the exact same dataset as an example. Apply some geometric and color
 ... )
 ```

-The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`,
+The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': list[Dict]}`,
 where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example:

 ```py
@@ -252,9 +252,9 @@ The `image_processor` expects the annotations to be in the following format: `{'

 ... Args:
 ... image_id (str): image id. e.g. "0001"
-... categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
-... areas (List[float]): list of corresponding areas to provided bounding boxes
-... bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
+... categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
+... areas (list[float]): list of corresponding areas to provided bounding boxes
+... bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
 ... ([center_x, center_y, width, height] in absolute coordinates)

 ... Returns:
@@ -397,7 +397,7 @@ Intermediate format of boxes used for training is `YOLO` (normalized) but we wil

 ... Args:
 ... boxes (torch.Tensor): Bounding boxes in YOLO format
-... image_size (Tuple[int, int]): Image size in format (height, width)
+... image_size (tuple[int, int]): Image size in format (height, width)

 ... Returns:
 ... torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
@@ -408,7 +408,7 @@ instructs the model to ignore that part of the spectrogram when calculating the
 ... class TTSDataCollatorWithPadding:
 ... processor: Any

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
 ... label_features = [{"input_values": feature["labels"]} for feature in features]
 ... speaker_features = [feature["speaker_embeddings"] for feature in features]
@@ -48,7 +48,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -166,7 +166,7 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
 ... processor: AutoProcessor
 ... padding: Union[bool, str] = "longest"

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y
 ... # requieren métodos de padding diferentes
 ... input_features = [{"input_values": feature["input_values"][0]} for feature in features]
@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -56,7 +56,7 @@ Optunaに関しては、[object_parameter](https://optuna.readthedocs.io/en/stab
 ... }
 ```

-Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` で `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`List[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。
+Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` で `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`list[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。


 ```py
@@ -57,11 +57,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

 box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-# encode(tokenize) each word from words (List[str])
-input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+# encode(tokenize) each word from words (list[str])
+input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

 # get the length of each box
-tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+tokens_length_list: list[int] = [len(l) for l in input_ids_list]

 box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
 box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)
@@ -149,7 +149,7 @@ DETR モデルをインスタンス化するには 3 つの方法があります
 | **Description** |画像内のオブジェクトの周囲の境界ボックスとクラス ラベルを予測する | 画像内のオブジェクト (つまりインスタンス) の周囲のマスクを予測する | 画像内のオブジェクト (インスタンス) と「もの」 (木や道路などの背景) の両方の周囲のマスクを予測します |
 | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
 | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

@@ -170,7 +170,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です (
 ... processor: AutoProcessor
 ... padding: Union[bool, str] = "longest"

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... # split inputs and labels since they have to be of different lengths and need
 ... # different padding methods
 ... input_features = [{"input_values": feature["input_values"][0]} for feature in features]
@@ -208,7 +208,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
 ... )
 ```

-`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': List[Dict]}`,
+`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': list[Dict]}`,
 ここで、各辞書は COCO オブジェクトの注釈です。 1 つの例として、注釈を再フォーマットする関数を追加してみましょう。

 ```py
@@ -408,7 +408,7 @@ Y 軸が反転され、スペクトログラムが上下逆に表示されます
 ... class TTSDataCollatorWithPadding:
 ... processor: Any

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
 ... label_features = [{"input_values": feature["labels"]} for feature in features]
 ... speaker_features = [feature["speaker_embeddings"] for feature in features]
@@ -46,7 +46,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -172,7 +172,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터
 ... processor: AutoProcessor
 ... padding: Union[bool, str] = "longest"

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... # 입력과 레이블을 분할합니다
 ... # 길이가 다르고, 각각 다른 패딩 방법을 사용해야 하기 때문입니다
 ... input_features = [{"input_values": feature["input_values"][0]} for feature in features]
@@ -201,7 +201,7 @@ DatasetDict({
 ... )
 ```

-이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': List[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:
+이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': list[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:

 ```py
 >>> def formatted_anns(image_id, category, area, bbox):
@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
 def __init__(
 self,
 block_type="bottleneck",
-layers: List[int] = [3, 4, 6, 3],
+layers: list[int] = [3, 4, 6, 3],
 num_classes: int = 1000,
 input_channels: int = 3,
 cardinality: int = 1,
@@ -56,7 +56,7 @@ pip install optuna/sigopt/wandb/ray[tune]
 ... }
 ```

-Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`List[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:
+Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`list[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:

 ```py
 >>> best_trials = trainer.hyperparameter_search(
@@ -181,7 +181,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分
 ... processor: AutoProcessor
 ... padding: Union[bool, str] = "longest"

-... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+... def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
 ... # split inputs and labels since they have to be of different lengths and need
 ... # different padding methods
 ... input_features = [{"input_values": feature["input_values"][0]} for feature in features]
@@ -47,7 +47,7 @@ def postprocess_qa_predictions(
 Args:
 examples: The non-preprocessed dataset (see the main script for more information).
 features: The processed dataset (see the main script for more information).
-predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
 The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
 first dimension must match the number of elements of :obj:`features`.
 version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
 Args:
 examples: The non-preprocessed dataset (see the main script for more information).
 features: The processed dataset (see the main script for more information).
-predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
 The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
 first dimension must match the number of elements of :obj:`features`.
 version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -184,7 +184,7 @@ class Seq2SeqTrainer(Trainer):
 Args:
 model (:obj:`nn.Module`):
 The model to evaluate.
-inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+inputs (:obj:`dict[str, Union[torch.Tensor, Any]]`):
 The inputs and targets of the model.

 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
@@ -193,7 +193,7 @@ class Seq2SeqTrainer(Trainer):
 Whether or not to return the loss only.

 Return:
-Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
 A tuple with the loss, logits and labels (each being optional).
 """
 inputs = self._prepare_inputs(inputs)
@@ -530,7 +530,7 @@ def calculate_rouge(
 on multi sentence summaries (CNN/DM dataset).

 Returns:
-Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
+dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys

 """
 scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
@@ -91,11 +91,11 @@ class MyNewModelConfig(PretrainedConfig):
 `beta_slow` (`float`, *optional*):
 Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
 ramp function. If unspecified, it defaults to 1.
-`short_factor` (`List[float]`, *optional*):
+`short_factor` (`list[float]`, *optional*):
 Only used with 'longrope'. The scaling factor to be applied to short contexts (<
 `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
 size divided by the number of attention heads divided by 2
-`long_factor` (`List[float]`, *optional*):
+`long_factor` (`list[float]`, *optional*):
 Only used with 'longrope'. The scaling factor to be applied to long contexts (<
 `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
 size divided by the number of attention heads divided by 2
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_new_imgproc_model.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union

 import numpy as np
 import torch
@@ -57,11 +57,11 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 do_normalize (`bool`, *optional*, defaults to `True`):
 Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
 method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
-image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
 Mean to use if normalizing the image. This is a float or list of floats the length of the number of
 channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
 overridden by the `image_mean` parameter in the `preprocess` method.
-image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
 Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
 number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
 Can be overridden by the `image_std` parameter in the `preprocess` method.
@@ -74,13 +74,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 def __init__(
 self,
 do_resize: bool = True,
-size: Optional[Dict[str, int]] = None,
+size: Optional[dict[str, int]] = None,
 resample: PILImageResampling = PILImageResampling.BICUBIC,
 do_rescale: bool = True,
 rescale_factor: Union[int, float] = 1 / 255,
 do_normalize: bool = True,
-image_mean: Optional[Union[float, List[float]]] = None,
-image_std: Optional[Union[float, List[float]]] = None,
+image_mean: Optional[Union[float, list[float]]] = None,
+image_std: Optional[Union[float, list[float]]] = None,
 do_convert_rgb: bool = True,
 **kwargs,
 ) -> None:
@@ -101,7 +101,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 def resize(
 self,
 image: np.ndarray,
-size: Dict[str, int],
+size: dict[str, int],
 resample: PILImageResampling = PILImageResampling.BICUBIC,
 data_format: Optional[Union[str, ChannelDimension]] = None,
 input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -113,7 +113,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 Args:
 image (`np.ndarray`):
 Image to resize.
-size (`Dict[str, int]`):
+size (`dict[str, int]`):
 Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
 resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
 `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
@@ -151,13 +151,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 self,
 images: ImageInput,
 do_resize: Optional[bool] = None,
-size: Optional[Dict[str, int]] = None,
+size: Optional[dict[str, int]] = None,
 resample: PILImageResampling = None,
 do_rescale: Optional[bool] = None,
 rescale_factor: Optional[float] = None,
 do_normalize: Optional[bool] = None,
-image_mean: Optional[Union[float, List[float]]] = None,
-image_std: Optional[Union[float, List[float]]] = None,
+image_mean: Optional[Union[float, list[float]]] = None,
+image_std: Optional[Union[float, list[float]]] = None,
 return_tensors: Optional[Union[str, TensorType]] = None,
 do_convert_rgb: Optional[bool] = None,
 data_format: ChannelDimension = ChannelDimension.FIRST,
@@ -172,7 +172,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
 do_resize (`bool`, *optional*, defaults to `self.do_resize`):
 Whether to resize the image.
-size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+size (`dict[str, int]`, *optional*, defaults to `self.size`):
 Controls the size of the image after `resize`. The shortest edge of the image is resized to
 `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
 is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
@@ -185,9 +185,9 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
 Rescale factor to rescale the image by if `do_rescale` is set to `True`.
 do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
 Whether to normalize the image.
-image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
 Image mean to normalize the image by if `do_normalize` is set to `True`.
-image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
 Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
 do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
 Whether to convert the image to RGB.
@@ -5,7 +5,7 @@
 # modular_add_function.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # Note that zamba does not have the `apply_rotary_pos_emb` function!
-from typing import Optional, Tuple
+from typing import Optional

 import torch
 from torch import nn
@@ -62,5 +62,5 @@ class TestAttention(nn.Module):
 def __init__(self):
 pass

-def forward(self) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+def forward(self) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 _ = apply_rotary_pos_emb(1, 1, 1, 1)
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_dummy.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -210,12 +210,12 @@ class DummyAttention(nn.Module):
 def forward(
 self,
 hidden_states: torch.Tensor,
-position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+position_embeddings: tuple[torch.Tensor, torch.Tensor],
 attention_mask: Optional[torch.Tensor],
 past_key_value: Optional[Cache] = None,
 cache_position: Optional[torch.LongTensor] = None,
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 input_shape = hidden_states.shape[:-1]
 hidden_shape = (*input_shape, -1, self.head_dim)

@@ -278,9 +278,9 @@ class DummyDecoderLayer(GradientCheckpointingLayer):
 output_attentions: Optional[bool] = False,
 use_cache: Optional[bool] = False,
 cache_position: Optional[torch.LongTensor] = None,
-position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
 residual = hidden_states
 hidden_states = self.input_layernorm(hidden_states)

@@ -6,7 +6,7 @@
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
 import os
-from typing import Optional, Tuple, Union
+from typing import Optional, Union

 import torch
 from packaging import version
@@ -136,9 +136,9 @@ class DummyBertSelfAttention(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 mixed_query_layer = self.query(hidden_states)

 # If this is instantiated as a cross-attention module, the keys
@@ -245,9 +245,9 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
 # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
 logger.warning_once(
@@ -386,9 +386,9 @@ class DummyBertAttention(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 self_outputs = self.self(
 hidden_states,
 attention_mask,
@@ -454,9 +454,9 @@ class DummyBertLayer(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
 self_attention_outputs = self.attention(
@@ -532,12 +532,12 @@ class DummyBertEncoder(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 use_cache: Optional[bool] = None,
 output_attentions: Optional[bool] = False,
 output_hidden_states: Optional[bool] = False,
 return_dict: Optional[bool] = True,
-) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
 all_hidden_states = () if output_hidden_states else None
 all_self_attentions = () if output_attentions else None
 all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_from_uppercase_model.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -71,7 +71,7 @@ class FromUppercaseModelAttention(nn.Module):
 attention_mask: Optional[torch.Tensor] = None,
 causal_attention_mask: Optional[torch.Tensor] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
 """Input shape: Batch x Time x Channel"""

 batch_size, seq_length, embed_dim = hidden_states.shape
@@ -153,7 +153,7 @@ class FromUppercaseModelEncoderLayer(nn.Module):
 attention_mask: torch.Tensor,
 causal_attention_mask: torch.Tensor,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.FloatTensor]:
+) -> tuple[torch.FloatTensor]:
 """
 Args:
 hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_multimodal1.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -210,12 +210,12 @@ class Multimodal1TextAttention(nn.Module):
 def forward(
 self,
 hidden_states: torch.Tensor,
-position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+position_embeddings: tuple[torch.Tensor, torch.Tensor],
 attention_mask: Optional[torch.Tensor],
 past_key_value: Optional[Cache] = None,
 cache_position: Optional[torch.LongTensor] = None,
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 input_shape = hidden_states.shape[:-1]
 hidden_shape = (*input_shape, -1, self.head_dim)

@@ -278,9 +278,9 @@ class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
 output_attentions: Optional[bool] = False,
 use_cache: Optional[bool] = False,
 cache_position: Optional[torch.LongTensor] = None,
-position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
 residual = hidden_states
 hidden_states = self.input_layernorm(hidden_states)

@@ -5,7 +5,7 @@
 # modular_multimodal2.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -81,7 +81,7 @@ class Multimodal2VisionAttention(nn.Module):
 attention_mask: Optional[torch.Tensor] = None,
 causal_attention_mask: Optional[torch.Tensor] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
 """Input shape: Batch x Time x Channel"""

 batch_size, seq_length, embed_dim = hidden_states.shape
@@ -177,7 +177,7 @@ class Multimodal2Attention(nn.Module):
 attention_mask: Optional[torch.Tensor] = None,
 causal_attention_mask: Optional[torch.Tensor] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
 """Input shape: Batch x Time x Channel"""

 batch_size, seq_length, embed_dim = hidden_states.shape
@@ -244,7 +244,7 @@ class Multimodal2VisionEncoderLayer(nn.Module):
 attention_mask: torch.Tensor,
 causal_attention_mask: torch.Tensor,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.FloatTensor]:
+) -> tuple[torch.FloatTensor]:
 """
 Args:
 hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_my_new_model2.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -208,12 +208,12 @@ class MyNewModel2Attention(nn.Module):
 def forward(
 self,
 hidden_states: torch.Tensor,
-position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+position_embeddings: tuple[torch.Tensor, torch.Tensor],
 attention_mask: Optional[torch.Tensor],
 past_key_value: Optional[Cache] = None,
 cache_position: Optional[torch.LongTensor] = None,
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 input_shape = hidden_states.shape[:-1]
 hidden_shape = (*input_shape, -1, self.head_dim)

@@ -276,9 +276,9 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
 output_attentions: Optional[bool] = False,
 use_cache: Optional[bool] = False,
 cache_position: Optional[torch.LongTensor] = None,
-position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
 residual = hidden_states
 hidden_states = self.input_layernorm(hidden_states)

@@ -469,7 +469,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
 input_ids: Optional[torch.LongTensor] = None,
 attention_mask: Optional[torch.Tensor] = None,
 position_ids: Optional[torch.LongTensor] = None,
-past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
 inputs_embeds: Optional[torch.FloatTensor] = None,
 use_cache: Optional[bool] = None,
 output_attentions: Optional[bool] = None,
@@ -5,7 +5,7 @@
 # modular_new_task_model.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 from dataclasses import dataclass
-from typing import ClassVar, List, Optional, Tuple, Union
+from typing import ClassVar, Optional, Union

 import torch
 from torch import nn
@@ -88,9 +88,9 @@ class NewTaskModelCausalLMOutputWithPast(ModelOutput):

 loss: Optional[torch.FloatTensor] = None
 logits: Optional[torch.FloatTensor] = None
-past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
-hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-attentions: Optional[Tuple[torch.FloatTensor]] = None
+past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
+hidden_states: Optional[tuple[torch.FloatTensor]] = None
+attentions: Optional[tuple[torch.FloatTensor]] = None
 image_hidden_states: Optional[torch.FloatTensor] = None


@@ -249,7 +249,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
 pixel_values: torch.FloatTensor = None,
 attention_mask: Optional[torch.Tensor] = None,
 position_ids: Optional[torch.LongTensor] = None,
-past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
 token_type_ids: Optional[torch.LongTensor] = None,
 cache_position: Optional[torch.LongTensor] = None,
 inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -259,7 +259,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
 output_hidden_states: Optional[bool] = None,
 return_dict: Optional[bool] = None,
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Union[Tuple, NewTaskModelModelOutputWithPast]:
+) -> Union[tuple, NewTaskModelModelOutputWithPast]:
 r"""
 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -442,7 +442,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
 output_hidden_states: Optional[bool] = None,
 return_dict: Optional[bool] = None,
 num_logits_to_keep: int = 0,
-) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]:
+) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
 r"""
 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -6,7 +6,7 @@
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 import math
 import os
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union

 import torch
 import torch.nn as nn
@@ -139,9 +139,9 @@ class RobertaSelfAttention(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 mixed_query_layer = self.query(hidden_states)

 # If this is instantiated as a cross-attention module, the keys
@@ -248,9 +248,9 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
 # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
 logger.warning_once(
@@ -389,9 +389,9 @@ class RobertaAttention(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 self_outputs = self.self(
 hidden_states,
 attention_mask,
@@ -457,9 +457,9 @@ class RobertaLayer(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 output_attentions: Optional[bool] = False,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
 self_attention_outputs = self.attention(
@@ -535,12 +535,12 @@ class RobertaEncoder(nn.Module):
 head_mask: Optional[torch.FloatTensor] = None,
 encoder_hidden_states: Optional[torch.FloatTensor] = None,
 encoder_attention_mask: Optional[torch.FloatTensor] = None,
-past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
 use_cache: Optional[bool] = None,
 output_attentions: Optional[bool] = False,
 output_hidden_states: Optional[bool] = False,
 return_dict: Optional[bool] = True,
-) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
 all_hidden_states = () if output_hidden_states else None
 all_self_attentions = () if output_attentions else None
 all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@@ -903,12 +903,12 @@ class RobertaModel(RobertaPreTrainedModel):
 inputs_embeds: Optional[torch.Tensor] = None,
 encoder_hidden_states: Optional[torch.Tensor] = None,
 encoder_attention_mask: Optional[torch.Tensor] = None,
-past_key_values: Optional[List[torch.FloatTensor]] = None,
+past_key_values: Optional[list[torch.FloatTensor]] = None,
 use_cache: Optional[bool] = None,
 output_attentions: Optional[bool] = None,
 output_hidden_states: Optional[bool] = None,
 return_dict: Optional[bool] = None,
-) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
 r"""
 encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
 Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
@@ -4,7 +4,7 @@
 # the file from the modular. If any change should be done, please apply the change to the
 # modular_super.py file directly. One of our CI enforces this.
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import torch
 from torch import nn
@@ -211,12 +211,12 @@ class SuperAttention(nn.Module):
 def forward(
 self,
 hidden_states: torch.Tensor,
-position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+position_embeddings: tuple[torch.Tensor, torch.Tensor],
 attention_mask: Optional[torch.Tensor],
 past_key_value: Optional[Cache] = None,
 cache_position: Optional[torch.LongTensor] = None,
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
 input_shape = hidden_states.shape[:-1]
 hidden_shape = (*input_shape, -1, self.head_dim)

@@ -279,9 +279,9 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
 output_attentions: Optional[bool] = False,
 use_cache: Optional[bool] = False,
 cache_position: Optional[torch.LongTensor] = None,
-position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
 **kwargs: Unpack[FlashAttentionKwargs],
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
 residual = hidden_states
 hidden_states = self.input_layernorm(hidden_states)

@ -5,7 +5,7 @@
# modular_switch_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that llama and cohere have different definitions for rotate_half
from typing import Callable, Optional, Tuple
from typing import Callable, Optional
import torch
from torch import nn
@ -123,12 +123,12 @@ class SwitchFunctionAttention(nn.Module):
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor],
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
@ -7,7 +7,7 @@
import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from typing import Optional, Union
import torch
import torch.nn.functional as F
@ -43,7 +43,7 @@ class MultiScaleDeformableAttention(nn.Module):
self,
value: Tensor,
value_spatial_shapes: Tensor,
value_spatial_shapes_list: List[Tuple],
value_spatial_shapes_list: list[tuple],
level_start_index: Tensor,
sampling_locations: Tensor,
attention_weights: Tensor,
@ -124,9 +124,9 @@ class TestDetrDecoderOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[tuple[torch.FloatTensor]] = None
@dataclass
@ -177,12 +177,12 @@ class TestDetrModelOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional[torch.FloatTensor] = None
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
@ -557,7 +557,7 @@ class TestDetrMultiheadAttention(nn.Module):
attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
batch_size, target_len, embed_dim = hidden_states.size()
@ -1431,7 +1431,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
Args:
enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps.
spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
Returns:
`tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@ -1499,7 +1499,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], TestDetrModelOutput]:
) -> Union[tuple[torch.FloatTensor], TestDetrModelOutput]:
r"""
Returns:
@ -33,7 +33,7 @@ import logging
import os
from collections.abc import Iterable
from contextlib import nullcontext
from typing import Dict, Optional
from typing import Optional
import torch
import torch.distributed as dist
@ -589,7 +589,7 @@ class ContextParallelCollator:
def __init__(self, cp_mesh: Optional[DeviceMesh] = None):
self.cp_mesh = cp_mesh
def __call__(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
def __call__(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
batch = default_collate(batch)
if self.cp_mesh is not None and self.cp_mesh.size() > 1:
# Get sequence length from the input batch
@ -66,9 +66,9 @@ def format_image_annotations_as_coco(
Args:
image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates)
Returns:
@ -101,7 +101,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args:
boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width)
image_size (tuple[int, int]): Image size in format (height, width)
Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
@ -67,9 +67,9 @@ def format_image_annotations_as_coco(
Args:
image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates)
Returns:
@ -103,7 +103,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args:
boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width)
image_size (tuple[int, int]): Image size in format (height, width)
Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
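Both object-detection example scripts above document `convert_bbox_yolo_to_pascal` without showing the arithmetic. A minimal sketch of the conversion, assuming normalized YOLO `(center_x, center_y, width, height)` boxes and an `(height, width)` image size; the helper name is illustrative, not the scripts' exact code:

```python
import torch

def yolo_to_pascal_voc(boxes: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
    """Normalized (cx, cy, w, h) -> absolute (x_min, y_min, x_max, y_max)."""
    height, width = image_size
    cx, cy, w, h = boxes.unbind(-1)
    return torch.stack(
        [
            (cx - 0.5 * w) * width,   # x_min
            (cy - 0.5 * h) * height,  # y_min
            (cx + 0.5 * w) * width,   # x_max
            (cy + 0.5 * h) * height,  # y_max
        ],
        dim=-1,
    )
```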
@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -22,7 +22,8 @@ line-length = 119
ignore = ["C901", "E501", "E741", "F402", "F823" ]
# RUF013: Checks for the use of implicit Optional
# in type annotations when the default parameter value is None.
select = ["C", "E", "F", "I", "W", "RUF013"]
select = ["C", "E", "F", "I", "W", "RUF013", "UP006"]
extend-safe-fixes = ["UP006"]

# Ignore import violations in all `__init__.py` files.
[tool.ruff.lint.per-file-ignores]
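For readers unfamiliar with the new rule: `UP006` is ruff's pyupgrade check for non-PEP 585 annotations, and listing it under `extend-safe-fixes` lets `ruff check --fix` rewrite offending annotations automatically. A small before/after sketch (the function is hypothetical, shown only to illustrate the rule):

```python
# Before: flagged by UP006 under the new config
from typing import Dict, List

def count_tokens(texts: List[str]) -> Dict[str, int]:
    return {text: len(text.split()) for text in texts}

# After `ruff check --fix`: builtin generics, no typing import needed
def count_tokens(texts: list[str]) -> dict[str, int]:
    return {text: len(text.split()) for text in texts}
```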
@ -19,7 +19,7 @@ and remove unnecessary dependencies.
import os
import warnings
from io import BytesIO
from typing import List, Optional, Tuple, Union
from typing import Optional, Union
import numpy as np
import requests
@ -70,7 +70,7 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
AudioInput = Union[
np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821
np.ndarray, "torch.Tensor", list[np.ndarray], tuple[np.ndarray], list["torch.Tensor"], tuple["torch.Tensor"] # noqa: F821
]
@ -88,7 +88,7 @@ def make_list_of_audio(
"""
Ensure that the output is a list of audio.
Args:
audio (`Union[List[AudioInput], AudioInput]`):
audio (`Union[list[AudioInput], AudioInput]`):
The input audio.
Returns:
list: A list of audio.
@ -246,7 +246,7 @@ def chroma_filter_bank(
Tuning deviation from A440 in fractions of a chroma bin.
power (`float`, *optional*, defaults to 2.0):
If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
the second element being the Gaussian half-width.
start_at_c_chroma (`float`, *optional*, defaults to `True`):
@ -733,7 +733,7 @@ def spectrogram_batch(
Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
Args:
waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`):
waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
The list of input waveforms, each a single-channel (mono) signal.
window (`np.ndarray` of shape `(frame_length,)`):
The windowing function to apply, including zero-padding if necessary.
@ -775,7 +775,7 @@ def spectrogram_batch(
Data type of the output spectrogram.
Returns:
List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
"""
window_length = len(window)
@ -4,7 +4,7 @@ import json
import os
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Optional, Union
import torch
from packaging import version
@ -28,7 +28,7 @@ def _static_cache_update(
key_states: torch.Tensor,
value_states: torch.Tensor,
cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the static cache tensors in place.
@ -41,7 +41,7 @@ def _static_cache_update(
If None, the entire cache is overwritten (prefill).
Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place).
tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place).
"""
if cache_position is None:
# Prefill phase where seq_len potentially equals max_cache_len. Directly copy.
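The diff only touches the signature of `_static_cache_update`, so here is a rough sketch of the in-place write its docstring describes, assuming `(batch, num_heads, seq_len, head_dim)` cache tensors; this is an illustration, not the module's literal body:

```python
import torch

def static_cache_update_sketch(k_cache, v_cache, key_states, value_states, cache_position):
    if cache_position is None:
        # Prefill: overwrite the cache from position 0 up to the prompt length.
        k_cache[:, :, : key_states.shape[-2], :] = key_states
        v_cache[:, :, : value_states.shape[-2], :] = value_states
    else:
        # Decoding: write the new states only at the given positions, in place.
        k_cache.index_copy_(2, cache_position, key_states)
        v_cache.index_copy_(2, cache_position, value_states)
    return k_cache, v_cache
```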
@ -67,7 +67,7 @@ def _sliding_cache_update(
value_states: torch.Tensor,
cache_position: torch.LongTensor,
max_cache_len: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the sliding window cache tensors, returning the potentially modified tensors.
@ -80,7 +80,7 @@ def _sliding_cache_update(
max_cache_len (`int`): The maximum length of the sliding window cache.
Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update.
tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update.
For prefill > window, these are the full input states.
Otherwise, they are the updated cache tensors.
"""
@ -134,8 +134,8 @@ class Cache:
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -146,7 +146,7 @@ class Cache:
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
cache to be created.
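A hedged, runnable sketch of the calling convention documented above, using `DynamicCache`; the shapes and layer count are illustrative only:

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
# New key/value states for layer 0: (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 8, 4, 64)
v = torch.randn(1, 8, 4, 64)

k_all, v_all = cache.update(k, v, layer_idx=0)  # first call simply stores the states
k_all, v_all = cache.update(
    torch.randn(1, 8, 1, 64), torch.randn(1, 8, 1, 64), layer_idx=0
)
print(k_all.shape)  # past + new tokens concatenated: torch.Size([1, 8, 5, 64])
```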
@ -222,7 +222,7 @@ class CacheConfig:
"""
Constructs a CacheConfig instance from a dictionary of parameters.
Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters.
config_dict (dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values.
Returns:
@ -257,10 +257,10 @@ class CacheConfig:
writer.write(json_string)
# Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict
def to_dict(self) -> Dict[str, Any]:
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
return copy.deepcopy(self.__dict__)
@ -289,11 +289,11 @@ class CacheConfig:
returning all the unused kwargs.
Args:
kwargs (`Dict[str, Any]`):
kwargs (`dict[str, Any]`):
Dictionary of attributes to tentatively update this class.
Returns:
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
`dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
"""
to_remove = []
for key, value in kwargs.items():
@ -473,8 +473,8 @@ class DynamicCache(Cache):
def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
# `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121
# and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the
@ -487,7 +487,7 @@ class DynamicCache(Cache):
self.key_cache.append(key_states)
self.value_cache.append(value_states)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
@ -517,8 +517,8 @@ class DynamicCache(Cache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -529,7 +529,7 @@ class DynamicCache(Cache):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
Return:
@ -574,7 +574,7 @@ class DynamicCache(Cache):
"""Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
return None
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
"""Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for
backward compatibility."""
legacy_cache = ()
@ -584,7 +584,7 @@ class DynamicCache(Cache):
@classmethod
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor, torch.FloatTensor]]] = None
cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor, torch.FloatTensor]]] = None
) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
backward compatibility."""
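A short round-trip sketch of the legacy-format helpers in the hunk above; shapes and layer count are illustrative:

```python
import torch
from transformers import DynamicCache

# Legacy format: a tuple over layers of (key, value) tensors,
# each of shape (batch, num_heads, seq_len, head_dim).
legacy = tuple(
    (torch.zeros(1, 8, 10, 64), torch.zeros(1, 8, 10, 64)) for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy)  # legacy tuples -> DynamicCache
assert cache[0][0].shape[2] == 10               # backwards-compatible indexing
round_trip = cache.to_legacy_cache()            # DynamicCache -> legacy tuples
assert len(round_trip) == 2
```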
@ -611,7 +611,7 @@ class DynamicCache(Cache):
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]:
def batch_split(self, full_batch_size: int, split_size: int) -> list["DynamicCache"]:
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
out = []
@ -624,7 +624,7 @@ class DynamicCache(Cache):
return out
@classmethod
def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache":
def from_batch_splits(cls, splits: list["DynamicCache"]) -> "DynamicCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
cache = cls()
@ -762,7 +762,7 @@ class OffloadedCache(DynamicCache):
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self):
# Evict the previous layer if necessary
@ -799,8 +799,8 @@ class OffloadedCache(DynamicCache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
@ -810,7 +810,7 @@ class OffloadedCache(DynamicCache):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`.
Return:
A tuple containing the updated key and value states.
@ -857,8 +857,8 @@ class QuantizedCache(DynamicCache):
def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__()
self._quantized_key_cache: List[torch.Tensor] = []
self._quantized_value_cache: List[torch.Tensor] = []
self._quantized_key_cache: list[torch.Tensor] = []
self._quantized_value_cache: list[torch.Tensor] = []
self.nbits = cache_config.nbits
self.residual_length = cache_config.residual_length
@ -875,8 +875,8 @@ class QuantizedCache(DynamicCache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]
@ -1094,7 +1094,7 @@ class StaticCache(Cache):
should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*):
layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
@ -1129,7 +1129,7 @@ class StaticCache(Cache):
max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
self.max_batch_size = max_batch_size
@ -1145,8 +1145,8 @@ class StaticCache(Cache):
else config.num_key_value_heads
)
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
# Note: There will be significant perf decrease if switching to use 5D tensors instead.
cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
device = torch.device(device) if device is not None else None
@ -1169,8 +1169,8 @@ class StaticCache(Cache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -1182,7 +1182,7 @@ class StaticCache(Cache):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
to know how where to write in the cache.
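For context on how the `StaticCache` documented above is normally reached from user code, a hedged sketch (the checkpoint is only an example); `generate` pre-allocates the fixed-shape cache when the static implementation is requested:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Static caches keep key/value shapes fixed", return_tensors="pt")
# Asking for the static implementation makes generate allocate a StaticCache with
# (max_batch_size, num_key_value_heads, max_cache_len, head_dim) tensors up front,
# which keeps shapes stable and plays well with torch.compile.
out = model.generate(**inputs, max_new_tokens=8, cache_implementation="static")
print(tokenizer.decode(out[0], skip_special_tokens=True))
```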
@ -1260,7 +1260,7 @@ class SlidingWindowCache(StaticCache):
should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*):
layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
@ -1294,7 +1294,7 @@ class SlidingWindowCache(StaticCache):
max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None:
if not hasattr(config, "sliding_window") or config.sliding_window is None:
raise ValueError(
@ -1318,8 +1318,8 @@ class SlidingWindowCache(StaticCache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None:
cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position")
@ -1400,7 +1400,7 @@ class EncoderDecoderCache(Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
@ -1422,7 +1422,7 @@ class EncoderDecoderCache(Cache):
"""
return len(self.self_attention_cache)
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor]]:
def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]:
"""Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format."""
legacy_cache = ()
if len(self.cross_attention_cache) > 0:
@ -1436,7 +1436,7 @@ class EncoderDecoderCache(Cache):
@classmethod
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
) -> "EncoderDecoderCache":
"""Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`."""
cache = cls(
@ -1495,7 +1495,7 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length)
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
def batch_split(self, full_batch_size: int, split_size: int) -> "list[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__)
@ -1508,7 +1508,7 @@ class EncoderDecoderCache(Cache):
return out
@classmethod
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
def from_batch_splits(cls, splits: list["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
self_attention_cache = DynamicCache()
@ -1569,7 +1569,7 @@ class HybridCache(Cache):
should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*):
layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
@ -1603,7 +1603,7 @@ class HybridCache(Cache):
max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1634,8 +1634,8 @@ class HybridCache(Cache):
else:
self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim)
self.sliding_window = min(config.sliding_window, max_cache_len)
@ -1660,8 +1660,8 @@ class HybridCache(Cache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None:
cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position")
@ -1757,7 +1757,7 @@ class HybridChunkedCache(Cache):
should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`):
The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*):
layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
@ -1791,7 +1791,7 @@ class HybridChunkedCache(Cache):
max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1811,8 +1811,8 @@ class HybridChunkedCache(Cache):
else:
self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
self.cumulative_length = [0 for _ in range(config.num_hidden_layers)]
def initialise_cache_layer(self, layer_idx, key_states):
@ -1880,8 +1880,8 @@ class HybridChunkedCache(Cache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None:
cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position")
@ -1968,7 +1968,7 @@ class OffloadedHybridCache(HybridChunkedCache):
device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16,
offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
):
super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map)
@ -2121,8 +2121,8 @@ class MambaCache:
self.ssm_state_size = config.state_size
self.conv_kernel_size = config.conv_kernel
self.conv_states: List[torch.Tensor] = []
self.ssm_states: List[torch.Tensor] = []
self.conv_states: list[torch.Tensor] = []
self.ssm_states: list[torch.Tensor] = []
device = torch.device(device) if device is not None else None
for _ in range(config.num_hidden_layers):
conv_state: torch.Tensor = torch.zeros(
@ -2193,7 +2193,7 @@ class OffloadedStaticCache(StaticCache):
The default `dtype` to use when initializing the cache.
offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`):
The device to offload to. Defaults to CPU.
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
layer_device_map (`dict[int, Union[str, torch.device, int]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
@ -2227,7 +2227,7 @@ class OffloadedStaticCache(StaticCache):
device: Union[str, torch.device],
dtype: Optional[torch.dtype] = None,
offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super(Cache, self).__init__()
@ -2255,8 +2255,8 @@ class OffloadedStaticCache(StaticCache):
cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim)
# Create offloaded CPU tensors.
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []
for i in range(config.num_hidden_layers):
# First layer is always on-device.
@ -2268,8 +2268,8 @@ class OffloadedStaticCache(StaticCache):
self.value_cache.append(value_cache)
# Create device tensors.
self._device_key_cache: List[torch.Tensor] = []
self._device_value_cache: List[torch.Tensor] = []
self._device_key_cache: list[torch.Tensor] = []
self._device_value_cache: list[torch.Tensor] = []
for i in range(2):
key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device)
@ -2289,8 +2289,8 @@ class OffloadedStaticCache(StaticCache):
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -2302,7 +2302,7 @@ class OffloadedStaticCache(StaticCache):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, *optional*):
cache_kwargs (`dict[str, Any]`, *optional*):
Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the
`cache_position` input to know how where to write in the cache.
@ -2401,13 +2401,13 @@ class OffloadedStaticCache(StaticCache):
return self._seen_tokens
def _create_key_value_cache_tensors(
self, shape: Tuple[int, ...], device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
self, shape: tuple[int, ...], device: torch.device
) -> tuple[torch.Tensor, torch.Tensor]:
"""Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static
addresses for non-CPU tensors.
Args:
shape (`Tuple[int, ...]`): Shape.
shape (`tuple[int, ...]`): Shape.
device (`torch.device`): Device.
Returns:
@ -23,7 +23,7 @@ from datetime import date
from itertools import chain
from pathlib import Path
from re import Pattern
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Optional, Union
import yaml
@ -148,7 +148,7 @@ def find_indent(line: str) -> int:
return len(search.groups()[0])
def parse_module_content(content: str) -> List[str]:
def parse_module_content(content: str) -> list[str]:
"""
Parse the content of a module in the list of objects it defines.
@ -156,7 +156,7 @@ def parse_module_content(content: str) -> List[str]:
content (`str`): The content to parse
Returns:
`List[str]`: The list of objects defined in the module.
`list[str]`: The list of objects defined in the module.
"""
objects = []
current_object = []
@ -336,7 +336,7 @@ def add_content_to_file(
def replace_model_patterns(
text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns
) -> Tuple[str, str]:
) -> tuple[str, str]:
"""
Replace all patterns present in a given text.
@ -414,10 +414,10 @@ def simplify_replacements(replacements):
"BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed.
Args:
replacements (`List[Tuple[str, str]]`): List of patterns (old, new)
replacements (`list[tuple[str, str]]`): List of patterns (old, new)
Returns:
`List[Tuple[str, str]]`: The list of patterns simplified.
`list[tuple[str, str]]`: The list of patterns simplified.
"""
if len(replacements) <= 1:
# Nothing to simplify
@ -519,7 +519,7 @@ def duplicate_module(
new_model_patterns: ModelPatterns,
dest_file: Optional[str] = None,
add_copied_from: bool = True,
attrs_to_remove: Optional[List[str]] = None,
attrs_to_remove: Optional[list[str]] = None,
):
"""
Create a new module from an existing one and adapting all function and classes names from old patterns to new ones.
@ -585,17 +585,17 @@ def duplicate_module(
def filter_framework_files(
files: List[Union[str, os.PathLike]], frameworks: Optional[List[str]] = None
) -> List[Union[str, os.PathLike]]:
files: list[Union[str, os.PathLike]], frameworks: Optional[list[str]] = None
) -> list[Union[str, os.PathLike]]:
"""
Filter a list of files to only keep the ones corresponding to a list of frameworks.
Args:
files (`List[Union[str, os.PathLike]]`): The list of files to filter.
frameworks (`List[str]`, *optional*): The list of allowed frameworks.
files (`list[Union[str, os.PathLike]]`): The list of files to filter.
frameworks (`list[str]`, *optional*): The list of allowed frameworks.
Returns:
`List[Union[str, os.PathLike]]`: The list of filtered files.
`list[Union[str, os.PathLike]]`: The list of filtered files.
"""
if frameworks is None:
frameworks = get_default_frameworks()
@ -617,17 +617,17 @@ def filter_framework_files(
return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others
def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, Union[Path, List[Path]]]:
def get_model_files(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, Union[Path, list[Path]]]:
"""
Retrieves all the files associated to a model.
Args:
model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If passed, will only keep the model files corresponding to the passed frameworks.
Returns:
`Dict[str, Union[Path, List[Path]]]`: A dictionary with the following keys:
`dict[str, Union[Path, list[Path]]]`: A dictionary with the following keys:
- **doc_file** -- The documentation file for the model.
- **model_files** -- All the files in the model module.
- **test_files** -- The test files for the model.
@ -663,14 +663,14 @@ _re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", fla
def find_base_model_checkpoint(
model_type: str, model_files: Optional[Dict[str, Union[Path, List[Path]]]] = None
model_type: str, model_files: Optional[dict[str, Union[Path, list[Path]]]] = None
) -> str:
"""
Finds the model checkpoint used in the docstrings for a given model.
Args:
model_type (`str`): A valid model type (like "bert" or "gpt2")
model_files (`Dict[str, Union[Path, List[Path]]`, *optional*):
model_files (`dict[str, Union[Path, list[Path]]`, *optional*):
The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed.
Returns:
@ -713,18 +713,18 @@ def get_default_frameworks():
_re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES")
def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, List[str]]:
def retrieve_model_classes(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, list[str]]:
"""
Retrieve the model classes associated to a given model.
Args:
model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict
the classes returned.
Returns:
`Dict[str, List[str]]`: A dictionary with one key per framework and the list of model classes associated to
`dict[str, list[str]]`: A dictionary with one key per framework and the list of model classes associated to
that framework as values.
"""
if frameworks is None:
@ -754,20 +754,20 @@ def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = No
return model_classes
def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None):
def retrieve_info_for_model(model_type, frameworks: Optional[list[str]] = None):
"""
Retrieves all the information from a given model_type.
Args:
model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If passed, will only keep the info corresponding to the passed frameworks.
Returns:
`Dict`: A dictionary with the following keys:
- **frameworks** (`List[str]`): The list of frameworks that back this model type.
- **model_classes** (`Dict[str, List[str]]`): The model classes implemented for that model type.
- **model_files** (`Dict[str, Union[Path, List[Path]]]`): The files associated with that model type.
- **frameworks** (`list[str]`): The list of frameworks that back this model type.
- **model_classes** (`dict[str, list[str]]`): The model classes implemented for that model type.
- **model_files** (`dict[str, Union[Path, list[Path]]]`): The files associated with that model type.
- **model_patterns** (`ModelPatterns`): The various patterns for the model.
"""
if model_type not in auto_module.MODEL_NAMES_MAPPING:
@ -833,7 +833,7 @@ def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None):
def clean_frameworks_in_init(
init_file: Union[str, os.PathLike], frameworks: Optional[List[str]] = None, keep_processing: bool = True
init_file: Union[str, os.PathLike], frameworks: Optional[list[str]] = None, keep_processing: bool = True
):
"""
Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature
@ -841,7 +841,7 @@ def clean_frameworks_in_init(
Args:
init_file (`str` or `os.PathLike`): The path to the init to treat.
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If passed, this will remove all imports that are subject to a framework not in frameworks
keep_processing (`bool`, *optional*, defaults to `True`):
Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports
@ -914,7 +914,7 @@ def clean_frameworks_in_init(
def add_model_to_main_init(
old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns,
frameworks: Optional[List[str]] = None,
frameworks: Optional[list[str]] = None,
with_processing: bool = True,
):
"""
@ -923,7 +923,7 @@ def add_model_to_main_init(
Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model.
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If specified, only the models implemented in those frameworks will be added.
with_processing (`bool`, *optional*, defaults to `True`):
Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
@ -1068,7 +1068,7 @@ AUTO_CLASSES_PATTERNS = {
def add_model_to_auto_classes(
old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: Dict[str, List[str]]
old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: dict[str, list[str]]
):
"""
Add a model to the relevant mappings in the auto module.
@ -1076,7 +1076,7 @@ def add_model_to_auto_classes(
Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model.
model_classes (`Dict[str, List[str]]`): A dictionary framework to list of model classes implemented.
model_classes (`dict[str, list[str]]`): A dictionary framework to list of model classes implemented.
"""
for filename in AUTO_CLASSES_PATTERNS:
# Extend patterns with all model classes if necessary
@ -1169,7 +1169,7 @@ def duplicate_doc_file(
old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns,
dest_file: Optional[Union[str, os.PathLike]] = None,
frameworks: Optional[List[str]] = None,
frameworks: Optional[list[str]] = None,
):
"""
Duplicate a documentation file and adapts it for a new model.
@ -1180,7 +1180,7 @@ def duplicate_doc_file(
new_model_patterns (`ModelPatterns`): The patterns for the new model.
dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file.
Will default to the a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`.
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file.
"""
with open(doc_file, "r", encoding="utf-8") as f:
@ -1320,7 +1320,7 @@ def create_new_model_like(
model_type: str,
new_model_patterns: ModelPatterns,
add_copied_from: bool = True,
frameworks: Optional[List[str]] = None,
frameworks: Optional[list[str]] = None,
old_checkpoint: Optional[str] = None,
create_fast_image_processor: bool = False,
):
@ -1332,7 +1332,7 @@ def create_new_model_like(
new_model_patterns (`ModelPatterns`): The patterns for the new model.
add_copied_from (`bool`, *optional*, defaults to `True`):
Whether or not to add "Copied from" statements to all classes in the new model modeling files.
frameworks (`List[str]`, *optional*):
frameworks (`list[str]`, *optional*):
If passed, will limit the duplicate to the frameworks specified.
old_checkpoint (`str`, *optional*):
The name of the base checkpoint for the old model. Should be passed along when it can't be automatically
@ -13,7 +13,7 @@
# limitations under the License.
from argparse import ArgumentParser, Namespace
from typing import Any, List, Optional
from typing import Any, Optional
from ..pipelines import Pipeline, get_supported_tasks, pipeline
from ..utils import logging
@ -69,8 +69,8 @@ class ServeTokenizeResult(BaseModel):
Tokenize result model
"""
tokens: List[str]
tokens_ids: Optional[List[int]]
tokens: list[str]
tokens_ids: Optional[list[int]]
class ServeDeTokenizeResult(BaseModel):
@ -196,7 +196,7 @@ class ServeCommand(BaseTransformersCLICommand):
def detokenize(
self,
tokens_ids: List[int] = Body(None, embed=True),
tokens_ids: list[int] = Body(None, embed=True),
skip_special_tokens: bool = Body(False, embed=True),
cleanup_tokenization_spaces: bool = Body(True, embed=True),
):
@ -63,13 +63,13 @@ class PretrainedConfig(PushToHubMixin):
|
||||
Some configurations requires inputs to be defined at init and have no default values, usually these are composite configs,
|
||||
(but not necessarily) such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from
|
||||
two or more configs of type [`~transformers.PretrainedConfig`].
|
||||
- **keys_to_ignore_at_inference** (`List[str]`) -- A list of keys to ignore by default when looking at dictionary
|
||||
- **keys_to_ignore_at_inference** (`list[str]`) -- A list of keys to ignore by default when looking at dictionary
|
||||
outputs of the model during inference.
|
||||
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
|
||||
- **attribute_map** (`dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
|
||||
naming of attributes.
|
||||
- **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor
|
||||
- **base_model_tp_plan** (`dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor
|
||||
parallel plan applied to the sub-module when `model.tensor_parallel` is called.
|
||||
- **base_model_pp_plan** (`Dict[str, Tuple[List[str]]]`) -- A dict that maps child-modules of a base model to a
|
||||
- **base_model_pp_plan** (`dict[str, tuple[list[str]]]`) -- A dict that maps child-modules of a base model to a
|
||||
pipeline parallel plan that enables users to place the child-module on the appropriate device.
|
||||
|
||||
Common attributes (present in all subclasses):
|
||||
@ -115,7 +115,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
|
||||
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
|
||||
and decoder model to have the exact same parameter names.
|
||||
prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`):
|
||||
prune_heads (`dict[int, list[int]]`, *optional*, defaults to `{}`):
|
||||
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
|
||||
heads to prune in said layer.
|
||||
|
||||
@ -128,17 +128,17 @@ class PretrainedConfig(PushToHubMixin):

> Parameters for fine-tuning tasks

architectures (`List[str]`, *optional*):
architectures (`list[str]`, *optional*):
Model architectures that can be used with the model pretrained weights.
finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
or PyTorch) checkpoint.
id2label (`Dict[int, str]`, *optional*):
id2label (`dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label.
label2id (`Dict[str, int]`, *optional*): A map from label to index for the model.
label2id (`dict[str, int]`, *optional*): A map from label to index for the model.
num_labels (`int`, *optional*):
Number of labels to use in the last layer added to the model, typically for a classification task.
task_specific_params (`Dict[str, Any]`, *optional*):
task_specific_params (`dict[str, Any]`, *optional*):
Additional keyword arguments to store for the current task.
problem_type (`str`, *optional*):
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
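The `id2label`/`label2id` pair above is what pipelines and classification heads consult when turning logits into names. A minimal sketch of setting them on a config (checkpoint and label names are illustrative, not taken from this diff):

```python
from transformers import AutoConfig

labels = ["negative", "neutral", "positive"]  # hypothetical label set
config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    id2label={i: label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)},
)
print(config.id2label[2])  # -> "positive"
```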
@ -394,7 +394,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
||||
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
||||
namespace).
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||
"""
|
||||
self._set_token_in_kwargs(kwargs)
|
||||
@ -505,7 +505,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or `bool`, *optional*):
|
||||
@ -531,7 +531,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
subfolder (`str`, *optional*, defaults to `""`):
|
||||
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
||||
specify the folder name here.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
|
||||
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the `return_unused_kwargs` keyword parameter.
|
||||
@ -599,7 +599,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
|
||||
|
||||
Returns:
|
||||
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
|
||||
`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
|
||||
|
||||
"""
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
@ -723,10 +723,10 @@ class PretrainedConfig(PushToHubMixin):
|
||||
Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
|
||||
|
||||
Args:
|
||||
config_dict (`Dict[str, Any]`):
|
||||
config_dict (`dict[str, Any]`):
|
||||
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
|
||||
retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
|
||||
kwargs (`Dict[str, Any]`):
|
||||
kwargs (`dict[str, Any]`):
|
||||
Additional parameters from which to initialize the configuration object.
|
||||
|
||||
Returns:
|
||||
@ -816,7 +816,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
Python dictionary.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
|
||||
dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
|
||||
@ -874,7 +874,7 @@ class PretrainedConfig(PushToHubMixin):
Serializes this instance to a Python dictionary.

Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
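Together with `from_dict` a few hunks above, `to_dict` gives a plain-dict round trip that is handy for editing a config programmatically; a short sketch (checkpoint name and the edited attribute are illustrative):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-uncased")
config_dict = config.to_dict()            # plain dict[str, Any], JSON-serializable
config_dict["hidden_dropout_prob"] = 0.2  # tweak an attribute out-of-band

restored = config.__class__.from_dict(config_dict)
print(restored.hidden_dropout_prob)       # 0.2
```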
@ -940,7 +940,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
Updates attributes of this class with attributes from `config_dict`.
|
||||
|
||||
Args:
|
||||
config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
|
||||
config_dict (`dict[str, Any]`): Dictionary of attributes that should be updated for this class.
|
||||
"""
|
||||
for key, value in config_dict.items():
|
||||
setattr(self, key, value)
|
||||
@ -1163,7 +1163,7 @@ def get_configuration_file(configuration_files: list[str]) -> str:
|
||||
Get the configuration file to use for this version of transformers.
|
||||
|
||||
Args:
|
||||
configuration_files (`List[str]`): The list of available configuration files.
|
||||
configuration_files (`list[str]`): The list of available configuration files.
|
||||
|
||||
Returns:
|
||||
`str`: The configuration file to use.
|
||||
|
@ -18,7 +18,7 @@ import warnings
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from random import randint
|
||||
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
|
||||
from typing import Any, Callable, NewType, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
@ -33,7 +33,7 @@ InputDataClass = NewType("InputDataClass", Any)
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])
DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])


class DataCollatorMixin:
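Any callable with that `list[InputDataClass] -> dict[str, Any]` shape can serve as a collator; a hand-rolled sketch for already fixed-length features (the key names are illustrative). Both `Trainer` and a plain PyTorch `DataLoader` accept such a callable as `data_collator` / `collate_fn`.

```python
import torch

def simple_collator(features: list[dict]) -> dict:
    # assumes every example has equal-length "input_ids" and a scalar "label"
    return {
        "input_ids": torch.tensor([f["input_ids"] for f in features], dtype=torch.long),
        "labels": torch.tensor([f["label"] for f in features], dtype=torch.long),
    }
```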
@ -72,7 +72,7 @@ def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
|
||||
return padded
|
||||
|
||||
|
||||
def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
|
||||
def default_data_collator(features: list[InputDataClass], return_tensors="pt") -> dict[str, Any]:
|
||||
"""
|
||||
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
|
||||
potential keys named:
|
||||
@ -119,13 +119,13 @@ class DefaultDataCollator(DataCollatorMixin):
|
||||
|
||||
return_tensors: str = "pt"
|
||||
|
||||
def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]:
|
||||
def __call__(self, features: list[dict[str, Any]], return_tensors=None) -> dict[str, Any]:
|
||||
if return_tensors is None:
|
||||
return_tensors = self.return_tensors
|
||||
return default_data_collator(features, return_tensors)
|
||||
|
||||
|
||||
def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
|
||||
def torch_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
|
||||
import torch
|
||||
|
||||
if not isinstance(features[0], Mapping):
|
||||
@ -161,7 +161,7 @@ def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any
|
||||
return batch
|
||||
|
||||
|
||||
def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
|
||||
def tf_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
|
||||
import tensorflow as tf
|
||||
|
||||
if not isinstance(features[0], Mapping):
|
||||
@ -202,7 +202,7 @@ def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
|
||||
return batch
|
||||
|
||||
|
||||
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
|
||||
def numpy_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
|
||||
if not isinstance(features[0], Mapping):
|
||||
features = [vars(f) for f in features]
|
||||
first = features[0]
|
||||
@ -268,7 +268,7 @@ class DataCollatorWithPadding:
pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt"

def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
batch = pad_without_fast_tokenizer_warning(
self.tokenizer,
features,
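Typical use pairs this collator with a tokenizer and a DataLoader; a short sketch (checkpoint name and texts are illustrative):

```python
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

features = [tokenizer("short"), tokenizer("a noticeably longer example sentence")]
batch = collator(features)  # pads input_ids / attention_mask to a common length
loader = DataLoader(features, batch_size=2, collate_fn=collator)
```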
@ -569,7 +569,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
|
||||
pad_to_multiple_of: Optional[int] = None
|
||||
return_tensors: str = "pt"
|
||||
|
||||
def torch_call(self, examples: List[Dict[str, Any]]): # Refactored implementation from the docs.
|
||||
def torch_call(self, examples: list[dict[str, Any]]): # Refactored implementation from the docs.
|
||||
import torch
|
||||
|
||||
# Take labels out of the examples beforehand, because they aren't nested.
|
||||
@ -911,7 +911,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
|
||||
def tf_mask_tokens(
|
||||
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
|
||||
) -> Tuple[Any, Any]:
|
||||
) -> tuple[Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
|
||||
"""
|
||||
@ -956,7 +956,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
|
||||
return inputs, labels
|
||||
|
||||
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
import tensorflow as tf
|
||||
|
||||
if self.seed and self.generator is None:
|
||||
@ -1002,7 +1002,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
batch["labels"] = labels
|
||||
return batch
|
||||
|
||||
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
# Handle dict or lists with proper padding and conversion to tensor.
|
||||
|
||||
if self.seed and self.generator is None:
|
||||
@ -1032,7 +1032,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels
return batch

def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
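The 80/10/10 rule in that docstring reduces to a few Bernoulli draws. A simplified sketch of the idea (not the library implementation; it ignores the special-tokens mask and the configurable mask/replace probabilities):

```python
import torch

def mask_tokens_sketch(inputs: torch.Tensor, mask_token_id: int, vocab_size: int, mlm_prob: float = 0.15):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_prob)).bool()
    labels[~masked] = -100  # loss is only computed on masked positions

    # 80% of masked positions become [MASK]
    replace = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replace] = mask_token_id

    # 10% of masked positions become a random token (half of the remaining 20%)
    random_tok = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replace
    inputs[random_tok] = torch.randint(vocab_size, labels.shape, dtype=torch.long)[random_tok]

    # the remaining 10% keep the original token
    return inputs, labels
```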
@ -1081,7 +1081,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
|
||||
return inputs, labels
|
||||
|
||||
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
# Handle dict or lists with proper padding and conversion to tensor.
|
||||
|
||||
if self.seed and self.generator is None:
|
||||
@ -1111,7 +1111,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
batch["labels"] = labels
|
||||
return batch
|
||||
|
||||
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
|
||||
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
|
||||
"""
|
||||
@ -1193,7 +1193,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
|
||||
</Tip>"""
|
||||
|
||||
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
if self.seed and self.generator is None:
|
||||
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
|
||||
# If no seed supplied, we will use the global RNG
|
||||
@ -1226,7 +1226,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
|
||||
return {"input_ids": inputs, "labels": labels}
|
||||
|
||||
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
import tensorflow as tf
|
||||
|
||||
if self.seed and self.generator is None:
|
||||
@ -1261,7 +1261,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
|
||||
return {"input_ids": inputs, "labels": labels}
|
||||
|
||||
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
if self.seed and self.generator is None:
|
||||
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
|
||||
# If no seed supplied, we will use the global RNG
|
||||
@ -1318,7 +1318,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
self.generator.shuffle(cand_indexes)
return cand_indexes

def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
def _whole_word_mask(self, input_tokens: list[str], max_predictions=512):
"""
Get 0/1 labels for masked tokens with whole word mask proxy
"""
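The "whole word" grouping leans on WordPiece continuation markers: a sub-token starting with `##` belongs to the same word as its predecessor, so the word is masked or kept as a unit (the real method also skips special tokens such as `[CLS]`/`[SEP]`). A rough sketch of that grouping step, for illustration only:

```python
def group_wordpieces(tokens: list[str]) -> list[list[int]]:
    # e.g. ["super", "##cali", "##fragil", "is"] -> [[0, 1, 2], [3]]
    cand_indexes: list[list[int]] = []
    for i, tok in enumerate(tokens):
        if cand_indexes and tok.startswith("##"):
            cand_indexes[-1].append(i)   # continuation piece joins the current word
        else:
            cand_indexes.append([i])     # a new word starts here
    return cand_indexes
```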
@ -1358,7 +1358,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
|
||||
return mask_labels
|
||||
|
||||
def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
|
||||
def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
|
||||
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
|
||||
@ -1414,7 +1414,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
# The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
|
||||
return inputs, labels
|
||||
|
||||
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
|
||||
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
|
||||
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
|
||||
@ -1474,7 +1474,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
|
||||
# The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged
|
||||
return inputs, labels
|
||||
|
||||
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
|
||||
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
|
||||
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
|
||||
@ -1564,7 +1564,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
FutureWarning,
|
||||
)
|
||||
|
||||
def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
def __call__(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
|
||||
@ -1587,7 +1587,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
"sentence_order_label": sentence_order_label,
|
||||
}
|
||||
|
||||
def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
|
||||
def mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any]:
|
||||
"""
|
||||
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
|
||||
original. N-gram not applied yet.
|
||||
@ -1645,28 +1645,28 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
|
||||
max_span_length: int = 5 # maximum length of a span of masked tokens
|
||||
return_tensors: str = "pt"
|
||||
|
||||
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
if isinstance(examples[0], Mapping):
|
||||
examples = [e["input_ids"] for e in examples]
|
||||
batch = _torch_collate_batch(examples, self.tokenizer)
|
||||
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
|
||||
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
|
||||
|
||||
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
if isinstance(examples[0], Mapping):
|
||||
examples = [e["input_ids"] for e in examples]
|
||||
batch = _tf_collate_batch(examples, self.tokenizer)
|
||||
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
|
||||
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
|
||||
|
||||
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
|
||||
def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
|
||||
if isinstance(examples[0], Mapping):
|
||||
examples = [e["input_ids"] for e in examples]
|
||||
batch = _numpy_collate_batch(examples, self.tokenizer)
|
||||
inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
|
||||
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
|
||||
|
||||
def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
|
||||
def torch_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
|
||||
"""
|
||||
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
|
||||
|
||||
@ -1765,7 +1765,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
|
||||
|
||||
return inputs.long(), perm_mask, target_mapping, labels.long()
|
||||
|
||||
def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
|
||||
def tf_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
|
||||
"""
|
||||
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
|
||||
|
||||
@ -1872,7 +1872,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
|
||||
|
||||
return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
|
||||
|
||||
def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
|
||||
def numpy_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
|
||||
"""
|
||||
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
|
||||
|
||||
|
@ -17,7 +17,7 @@ import time
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from filelock import FileLock
|
||||
@ -75,7 +75,7 @@ class GlueDataset(Dataset):
|
||||
|
||||
args: GlueDataTrainingArguments
|
||||
output_mode: str
|
||||
features: List[InputFeatures]
|
||||
features: list[InputFeatures]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -18,7 +18,7 @@ import pickle
|
||||
import random
|
||||
import time
|
||||
import warnings
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from filelock import FileLock
|
||||
@ -139,7 +139,7 @@ class LineByLineTextDataset(Dataset):
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, i) -> Dict[str, torch.tensor]:
|
||||
def __getitem__(self, i) -> dict[str, torch.tensor]:
|
||||
return self.examples[i]
|
||||
|
||||
|
||||
@ -187,7 +187,7 @@ class LineByLineWithRefDataset(Dataset):
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, i) -> Dict[str, torch.tensor]:
|
||||
def __getitem__(self, i) -> dict[str, torch.tensor]:
|
||||
return self.examples[i]
|
||||
|
||||
|
||||
@ -339,7 +339,7 @@ class LineByLineWithSOPTextDataset(Dataset):
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, i) -> Dict[str, torch.tensor]:
|
||||
def __getitem__(self, i) -> dict[str, torch.tensor]:
|
||||
return self.examples[i]
|
||||
|
||||
|
||||
@ -433,7 +433,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
|
||||
f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
|
||||
)
|
||||
|
||||
def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
|
||||
def create_examples_from_document(self, document: list[list[int]], doc_index: int, block_size: int):
|
||||
"""Creates examples for a single document."""
|
||||
|
||||
max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
|
||||
|
@ -16,7 +16,7 @@ import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from filelock import FileLock
|
||||
@ -112,7 +112,7 @@ class SquadDataset(Dataset):
|
||||
"""
|
||||
|
||||
args: SquadDataTrainingArguments
|
||||
features: List[SquadFeatures]
|
||||
features: list[SquadFeatures]
|
||||
mode: Split
|
||||
is_language_sensitive: bool
|
||||
|
||||
@ -195,7 +195,7 @@ class SquadDataset(Dataset):
|
||||
def __len__(self):
|
||||
return len(self.features)
|
||||
|
||||
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
|
||||
def __getitem__(self, i) -> dict[str, torch.Tensor]:
|
||||
# Convert to Tensors and build dataset
|
||||
feature = self.features[i]
|
||||
|
||||
|
@ -19,7 +19,7 @@ import os
|
||||
import warnings
|
||||
from dataclasses import asdict
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...utils import is_tf_available, logging
|
||||
@ -39,7 +39,7 @@ DEPRECATION_WARNING = (
|
||||
|
||||
|
||||
def glue_convert_examples_to_features(
|
||||
examples: Union[List[InputExample], "tf.data.Dataset"],
|
||||
examples: Union[list[InputExample], "tf.data.Dataset"],
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
max_length: Optional[int] = None,
|
||||
task=None,
|
||||
@ -107,7 +107,7 @@ if is_tf_available():
|
||||
|
||||
|
||||
def _glue_convert_examples_to_features(
|
||||
examples: List[InputExample],
|
||||
examples: list[InputExample],
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
max_length: Optional[int] = None,
|
||||
task=None,
|
||||
|
@ -18,7 +18,7 @@ import csv
|
||||
import dataclasses
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
from ...utils import is_tf_available, is_torch_available, logging
|
||||
|
||||
@ -67,9 +67,9 @@ class InputFeatures:
|
||||
float for regression problems.
|
||||
"""
|
||||
|
||||
input_ids: List[int]
|
||||
attention_mask: Optional[List[int]] = None
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
input_ids: list[int]
|
||||
attention_mask: Optional[list[int]] = None
|
||||
token_type_ids: Optional[list[int]] = None
|
||||
label: Optional[Union[int, float]] = None
|
||||
|
||||
def to_json_string(self):
|
||||
|
@ -136,7 +136,7 @@ class DebugUnderflowOverflow:
|
||||
The model to debug.
|
||||
max_frames_to_save (`int`, *optional*, defaults to 21):
|
||||
How many frames back to record
|
||||
trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
|
||||
trace_batch_nums(`list[int]`, *optional*, defaults to `[]`):
|
||||
Which batch numbers to trace (turns detection off)
|
||||
abort_after_batch_num (`int``, *optional*):
|
||||
Whether to abort after a certain batch number has finished
|
||||
|
@ -317,7 +317,7 @@ def get_cached_module_file(
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or *bool*, *optional*):
|
||||
@ -507,7 +507,7 @@ def get_class_from_dynamic_module(
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or `bool`, *optional*):
|
||||
@ -593,7 +593,7 @@ def custom_object_save(obj: Any, folder: Union[str, os.PathLike], config: Option
|
||||
A config in which to register the auto_map corresponding to this custom object.
|
||||
|
||||
Returns:
|
||||
`List[str]`: The list of files saved.
|
||||
`list[str]`: The list of files saved.
|
||||
"""
|
||||
if obj.__module__ == "__main__":
|
||||
logger.warning(
|
||||
@ -762,7 +762,7 @@ def check_python_requirements(path_or_repo_id, requirements_file="requirements.t
|
||||
This can be either:
|
||||
- a string, the *model id* of a model repo on huggingface.co.
|
||||
- a path to a *directory* potentially containing the file.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional arguments to pass to `cached_file`.
|
||||
"""
|
||||
failed = [] # error messages regarding requirements
|
||||
|
@ -81,13 +81,13 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
</Tip>

Args:
processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`):
Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of
input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str,
List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
collate function.

Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
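Concretely this is the `pad` method exposed by audio feature extractors; a small usage sketch, with the checkpoint name and array lengths as illustrative assumptions:

```python
import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# two raw inputs of different lengths, as produced by an earlier feature-extraction call
processed = {"input_values": [np.zeros(1600, dtype=np.float32), np.zeros(800, dtype=np.float32)]}
batch = feature_extractor.pad(processed, padding=True, return_tensors="pt")
print(batch["input_values"].shape)  # padded to the longest example
```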
@ -235,9 +235,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
|
||||
Pad inputs (on left/right and up to predefined length or max length in the batch)
|
||||
|
||||
Args:
|
||||
processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
|
||||
Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
|
||||
of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
|
||||
processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
|
||||
Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
|
||||
of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
|
||||
max_length (`int`, *optional*):
|
||||
Maximum length of the returned list and optionally padding length (see below)
|
||||
padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
|
||||
@ -306,9 +306,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
|
||||
Truncate inputs to predefined length or max length in the batch
|
||||
|
||||
Args:
|
||||
processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`):
|
||||
Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
|
||||
of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
|
||||
processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
|
||||
Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
|
||||
of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
|
||||
max_length (`int`, *optional*):
|
||||
maximum length of the returned list and optionally padding length (see below)
|
||||
pad_to_multiple_of (`int`, *optional*) :
|
||||
|
@ -303,7 +303,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or `bool`, *optional*):
|
||||
@ -326,7 +326,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
|
||||
consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
|
||||
`kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
The values in kwargs of any keys which are feature extractor attributes will be used to override the
|
||||
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
|
||||
controlled by the `return_unused_kwargs` keyword parameter.
|
||||
@ -392,7 +392,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
||||
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
||||
namespace).
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||
"""
|
||||
use_auth_token = kwargs.pop("use_auth_token", None)
|
||||
@ -454,7 +454,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
|
||||
|
||||
Returns:
|
||||
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
|
||||
`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
|
||||
"""
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
@ -555,11 +555,11 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
parameters.
|
||||
|
||||
Args:
|
||||
feature_extractor_dict (`Dict[str, Any]`):
|
||||
feature_extractor_dict (`dict[str, Any]`):
|
||||
Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
|
||||
retrieved from a pretrained checkpoint by leveraging the
|
||||
[`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
|
||||
kwargs (`Dict[str, Any]`):
|
||||
kwargs (`dict[str, Any]`):
|
||||
Additional parameters from which to initialize the feature extractor object.
|
||||
|
||||
Returns:
|
||||
@ -588,7 +588,7 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Serializes this instance to a Python dictionary. Returns:
|
||||
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
|
||||
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
output["feature_extractor_type"] = self.__class__.__name__
|
||||
|
@ -1,5 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Constraint(ABC):
|
||||
@ -51,7 +51,7 @@ class Constraint(ABC):
|
||||
When called, returns the token(s) that would take this constraint one step closer to being fulfilled.
|
||||
|
||||
Return:
|
||||
token_ids (Union[int, List[int], None]):
|
||||
token_ids (Union[int, list[int], None]):
|
||||
- A single token ID (int) that advances the constraint, or
|
||||
- A list of token IDs that could advance the constraint
|
||||
- None if the constraint is completed or cannot be advanced
|
||||
@ -134,11 +134,11 @@ class PhrasalConstraint(Constraint):
|
||||
[`Constraint`] enforcing that an ordered sequence of tokens is included in the output.
|
||||
|
||||
Args:
|
||||
token_ids (`List[int]`):
|
||||
token_ids (`list[int]`):
|
||||
The id of the token that must be generated by the output.
|
||||
"""
|
||||
|
||||
def __init__(self, token_ids: List[int]):
|
||||
def __init__(self, token_ids: list[int]):
|
||||
super(Constraint, self).__init__()
|
||||
|
||||
if not isinstance(token_ids, list) or len(token_ids) == 0:
|
||||
@ -205,7 +205,7 @@ class PhrasalConstraint(Constraint):
|
||||
|
||||
|
||||
class DisjunctiveTrie:
|
||||
def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
|
||||
def __init__(self, nested_token_ids: list[list[int]], no_subsets=True):
|
||||
r"""
|
||||
A helper class that builds a trie with the words represented in `nested_token_ids`.
|
||||
"""
|
||||
@ -266,12 +266,12 @@ class DisjunctiveConstraint(Constraint):
A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints.

Args:
nested_token_ids (`List[List[int]]`):
nested_token_ids (`list[list[int]]`):
A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from
the list of words.
"""

def __init__(self, nested_token_ids: List[List[int]]):
def __init__(self, nested_token_ids: list[list[int]]):
super(Constraint, self).__init__()

if not isinstance(nested_token_ids, list) or len(nested_token_ids) == 0:
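Most users reach this constraint indirectly: passing a nested `force_words_ids` list to `generate` builds a `DisjunctiveConstraint` under the hood, so any one of the listed surface forms satisfies it. A hedged sketch (the model and the forced word forms are illustrative):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# the extra nesting level means "any one of these word forms must appear"
force_words_ids = [tokenizer(["Hund", "Hunde"], add_special_tokens=False).input_ids]

inputs = tokenizer("translate English to German: The dog is friendly.", return_tensors="pt")
out = model.generate(**inputs, force_words_ids=force_words_ids, num_beams=5)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```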
@ -356,11 +356,11 @@ class ConstraintListState:
|
||||
A class for beam scorers to track its progress through a list of constraints.
|
||||
|
||||
Args:
|
||||
constraints (`List[Constraint]`):
|
||||
constraints (`list[Constraint]`):
|
||||
A list of [`Constraint`] objects that must be fulfilled by the beam scorer.
|
||||
"""
|
||||
|
||||
def __init__(self, constraints: List[Constraint]):
|
||||
def __init__(self, constraints: list[Constraint]):
|
||||
self.constraints = constraints
|
||||
|
||||
# max # of steps required to fulfill a given constraint
|
||||
@ -418,7 +418,7 @@ class ConstraintListState:
|
||||
else:
|
||||
return token_list
|
||||
|
||||
def reset(self, token_ids: Optional[List[int]]):
|
||||
def reset(self, token_ids: Optional[list[int]]):
|
||||
"""
|
||||
token_ids: the tokens generated thus far to reset the state of the progress through constraints.
|
||||
"""
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import UserDict
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -41,7 +41,7 @@ PROCESS_INPUTS_DOCSTRING = r"""
|
||||
Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
beam_indices (`torch.LongTensor`, *optional*):
|
||||
Beam indices indicating to which beam hypothesis each token correspond.
|
||||
@ -77,7 +77,7 @@ FINALIZE_INPUTS_DOCSTRING = r"""
|
||||
The beam indices indicating to which beam the `final_beam_tokens` shall be added.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
|
||||
Return:
|
||||
@ -103,7 +103,7 @@ class BeamScorer(ABC):
|
||||
next_tokens: torch.LongTensor,
|
||||
next_indices: torch.LongTensor,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor]:
|
||||
) -> tuple[torch.Tensor]:
|
||||
raise NotImplementedError("This is an abstract method.")
|
||||
|
||||
@abstractmethod
|
||||
@ -219,11 +219,11 @@ class BeamSearchScorer(BeamScorer):
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
group_index: Optional[int] = 0,
decoder_prompt_len: Optional[int] = 0,
) -> Dict[str, torch.Tensor]:
) -> dict[str, torch.Tensor]:
# add up to the length which the next_scores is calculated on (including decoder prompt)
cur_len = input_ids.shape[-1] + 1
batch_size = len(self._beam_hyps) // self.num_beam_groups
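The scorer is driven by `generate` whenever `num_beams > 1`; end users rarely call `process`/`finalize` themselves. A minimal beam-search call for reference, with the model and options as illustrative assumptions:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The capital of France is", return_tensors="pt")
out = model.generate(
    **inputs,
    num_beams=4,              # a BeamSearchScorer is created internally
    num_return_sequences=2,
    max_new_tokens=20,
    early_stopping=True,
)
for seq in out:
    print(tokenizer.decode(seq, skip_special_tokens=True))
```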
@ -325,10 +325,10 @@ class BeamSearchScorer(BeamScorer):
|
||||
final_beam_indices: torch.LongTensor,
|
||||
max_length: int,
|
||||
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
|
||||
beam_indices: Optional[torch.LongTensor] = None,
|
||||
decoder_prompt_len: Optional[int] = 0,
|
||||
) -> Tuple[torch.LongTensor]:
|
||||
) -> tuple[torch.LongTensor]:
|
||||
batch_size = len(self._beam_hyps) // self.num_beam_groups
|
||||
|
||||
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
|
||||
@ -426,7 +426,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
|
||||
Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
|
||||
num_beams (`int`):
|
||||
Number of beams for beam search.
|
||||
constraints (`List[Constraint]`):
|
||||
constraints (`list[Constraint]`):
|
||||
A list of positive constraints represented as `Constraint` objects that must be fulfilled in the generation
|
||||
output. For more information, the documentation of [`Constraint`] should be read.
|
||||
device (`torch.device`):
|
||||
@ -457,7 +457,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
|
||||
self,
|
||||
batch_size: int,
|
||||
num_beams: int,
|
||||
constraints: List[Constraint],
|
||||
constraints: list[Constraint],
|
||||
device: torch.device,
|
||||
length_penalty: Optional[float] = 1.0,
|
||||
do_early_stopping: Optional[Union[bool, str]] = False,
|
||||
@ -518,10 +518,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
|
||||
next_indices: torch.LongTensor,
|
||||
scores_for_all_vocab: torch.FloatTensor,
|
||||
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
|
||||
beam_indices: Optional[torch.LongTensor] = None,
|
||||
decoder_prompt_len: Optional[int] = 0,
|
||||
) -> Tuple[torch.Tensor]:
|
||||
) -> tuple[torch.Tensor]:
|
||||
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
|
||||
@ -541,7 +541,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
|
||||
The scores of all tokens in the vocabulary for each of the beam hypotheses.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
beam_indices (`torch.LongTensor`, *optional*):
|
||||
Beam indices indicating to which beam hypothesis each token correspond.
|
||||
@ -818,10 +818,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
|
||||
final_beam_indices: torch.LongTensor,
|
||||
max_length: int,
|
||||
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
|
||||
eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
|
||||
beam_indices: Optional[torch.LongTensor] = None,
|
||||
decoder_prompt_len: Optional[int] = 0,
|
||||
) -> Tuple[torch.LongTensor]:
|
||||
) -> tuple[torch.LongTensor]:
|
||||
batch_size = len(self._beam_hyps)
|
||||
|
||||
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
import copy
|
||||
import weakref
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -44,7 +44,7 @@ from ..utils.deprecation import deprecate_kwarg
class CandidateGenerator:
"""Abstract base class for all candidate generators that can be applied during assisted generation."""

def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
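These candidate generators back assisted (speculative) decoding, which users trigger by handing `generate` a smaller draft model; a short sketch (the model pairing is illustrative, the two models only need to share a tokenizer):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
assistant = AutoModelForCausalLM.from_pretrained("gpt2")  # small draft model, same tokenizer

inputs = tokenizer("Alice and Bob", return_tensors="pt")
out = model.generate(**inputs, assistant_model=assistant, max_new_tokens=30)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```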
@ -108,7 +108,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
|
||||
input_ids: torch.LongTensor,
|
||||
assistant_model: "PreTrainedModel",
|
||||
generation_config: "GenerationConfig",
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
inputs_tensor: Optional[torch.Tensor] = None,
|
||||
logits_processor: "LogitsProcessorList" = None,
|
||||
):
|
||||
@ -198,7 +198,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
|
||||
self.probs = []
|
||||
self.matches = []
|
||||
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
"""
|
||||
Fetches the candidates to be tried for the current input.
|
||||
|
||||
@ -281,7 +281,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
|
||||
|
||||
self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold
|
||||
|
||||
def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]:
|
||||
def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> tuple[int, int]:
|
||||
"""Calculate the minimum and maximum number of new tokens to generate."""
|
||||
new_cur_len = input_ids.shape[-1]
|
||||
max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
|
||||
@ -305,7 +305,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
|
||||
|
||||
return has_past_key_values
|
||||
|
||||
def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict:
|
||||
def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> dict:
|
||||
"""Prepare arguments for the generation call."""
|
||||
return {
|
||||
self.input_ids_key: input_ids,
|
||||
@ -315,7 +315,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
|
||||
"logits_processor": self.logits_processor,
|
||||
}
|
||||
|
||||
def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
def _generate_candidates(self, generation_args: dict) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
"""Generate candidate sequences using the assistant model."""
|
||||
assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
|
||||
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
|
||||
@ -374,7 +374,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
|
||||
target_tokenizer: "PreTrainedTokenizerBase",
|
||||
assistant_tokenizer: "PreTrainedTokenizerBase",
|
||||
generation_config: "GenerationConfig",
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
inputs_tensor: Optional[torch.Tensor] = None,
|
||||
logits_processor: "LogitsProcessorList" = None,
|
||||
):
|
||||
@ -495,7 +495,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
|
||||
dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
|
||||
return dest_ids.to(input_ids.device)
|
||||
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
"""
|
||||
Fetches the candidates to be tried for the current input.
|
||||
|
||||
@ -537,7 +537,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
|
||||
|
||||
return new_target_ids, None
|
||||
|
||||
def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]:
|
||||
def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, int]:
|
||||
"""Converts target input IDs to assistant input IDs, handling discrepancies."""
|
||||
convert_kwargs = {
|
||||
"source_tokenizer": self.target_tokenizer,
|
||||
@ -782,7 +782,7 @@ class AssistantToTargetTranslator:
|
||||
|
||||
max_assistant_index = max(assistant_vocab.values())
|
||||
assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.SUPPRESS_TOKEN_ID, dtype=int)
|
||||
target_to_assistant_input_ids: Dict[int, int] = {}
|
||||
target_to_assistant_input_ids: dict[int, int] = {}
|
||||
for tok, assistant_id in assistant_vocab.items():
|
||||
target_id = target_vocab.get(tok)
|
||||
if target_id is not None:
|
||||
@ -909,7 +909,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
|
||||
target_tokenizer: "PreTrainedTokenizerBase",
|
||||
assistant_tokenizer: "PreTrainedTokenizerBase",
|
||||
generation_config: "GenerationConfig",
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
atm_translator: AssistantToTargetTranslator,
|
||||
inputs_tensor: Optional[torch.Tensor] = None,
|
||||
logits_processor: "LogitsProcessorList" = None,
|
||||
@ -930,7 +930,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
|
||||
self._target_seq_len_with_candidates: int = 0
|
||||
self._prev_assistant_ids: Optional[torch.LongTensor] = None
|
||||
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
"""
|
||||
Simplified version of get_candidates that uses the translator cache for token conversion.
|
||||
"""
|
||||
@ -1043,7 +1043,7 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")

def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
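Prompt lookup decoding proposes candidates by finding the prompt's last n-gram earlier in the sequence and copying what followed it (in `generate` this path is enabled via `prompt_lookup_num_tokens`). A simplified sketch of that matching step, illustrative rather than the library implementation:

```python
import torch

def prompt_lookup_candidates(input_ids: torch.Tensor, ngram_size: int = 3, num_output_tokens: int = 10):
    seq = input_ids[0]
    query = seq[-ngram_size:]  # last n-gram of the prompt
    for start in range(len(seq) - ngram_size - 1, -1, -1):
        if torch.equal(seq[start:start + ngram_size], query):
            continuation = seq[start + ngram_size : start + ngram_size + num_output_tokens]
            if len(continuation) > 0:
                return torch.cat([seq, continuation]).unsqueeze(0)  # prompt + copied guess
    return input_ids  # no match found: nothing to propose
```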
@ -1153,7 +1153,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
|
||||
input_ids: torch.LongTensor,
|
||||
assistant_model: "PreTrainedModel",
|
||||
generation_config: "GenerationConfig",
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
inputs_tensor: Optional[torch.Tensor] = None,
|
||||
logits_processor: "LogitsProcessorList" = None,
|
||||
):
|
||||
@ -1170,7 +1170,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
|
||||
self.assistant_early_exit = self.generation_config.assistant_early_exit
|
||||
self.generation_config.assistant_early_exit = None
|
||||
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
|
||||
# Temporarily sets the number of hidden layers to the early exit value
|
||||
base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
|
||||
original_num_hidden_layers = base_model.config.num_hidden_layers
|
||||
@ -1221,7 +1221,7 @@ def _crop_past_key_values(model, past_key_values, max_length):
|
||||
return past_key_values
|
||||
|
||||
|
||||
def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_encoder_decoder: bool) -> Dict[str, Any]:
|
||||
def _prepare_attention_mask(model_kwargs: dict[str, Any], new_length: int, is_encoder_decoder: bool) -> dict[str, Any]:
|
||||
"""Expands or crops the model's mask for decoding purposes, to the defined length"""
|
||||
|
||||
mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
|
||||
@ -1257,7 +1257,7 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en
|
||||
return model_kwargs
|
||||
|
||||
|
||||
def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
|
||||
def _prepare_token_type_ids(model_kwargs: dict[str, Any], new_length: int) -> dict[str, Any]:
|
||||
"""Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
|
||||
if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
|
||||
return model_kwargs
|
||||
|
@ -20,7 +20,7 @@ import os
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, is_dataclass
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
|
||||
|
||||
from .. import __version__
|
||||
from ..configuration_utils import PretrainedConfig
|
||||
@ -149,7 +149,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
max_time (`float`, *optional*):
|
||||
The maximum amount of time you allow the computation to run for in seconds. generation will still finish
|
||||
the current pass after allocated time has been passed.
|
||||
stop_strings (`str or List[str]`, *optional*):
|
||||
stop_strings (`str or list[str]`, *optional*):
|
||||
A string or a list of strings that should terminate generation if the model outputs them.
|
||||
|
||||
> Parameters that control the generation strategy used
|
||||
@ -163,7 +163,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
[this paper](https://huggingface.co/papers/1610.02424) for more details.
|
||||
penalty_alpha (`float`, *optional*):
|
||||
The values balance the model confidence and the degeneration penalty in contrastive search decoding.
|
||||
dola_layers (`str` or `List[int]`, *optional*):
|
||||
dola_layers (`str` or `list[int]`, *optional*):
|
||||
The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must
|
||||
be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively.
|
||||
"low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the
|
||||
@ -245,26 +245,26 @@ class GenerationConfig(PushToHubMixin):
|
||||
`length_penalty` < 0.0 encourages shorter sequences.
|
||||
no_repeat_ngram_size (`int`, *optional*, defaults to 0):
|
||||
If set to int > 0, all ngrams of that size can only occur once.
|
||||
bad_words_ids (`List[List[int]]`, *optional*):
|
||||
bad_words_ids (`list[list[int]]`, *optional*):
|
||||
List of list of token ids that are not allowed to be generated. Check
|
||||
[`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
|
||||
force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*):
|
||||
List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of
|
||||
words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this
|
||||
force_words_ids (`list[list[int]]` or `list[list[list[int]]]`, *optional*):
|
||||
List of token ids that must be generated. If given a `list[list[int]]`, this is treated as a simple list of
|
||||
words that must be included, the opposite to `bad_words_ids`. If given `list[list[list[int]]]`, this
|
||||
triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
|
||||
can allow different forms of each word.
|
||||
renormalize_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether to renormalize the logits after applying all the logits processors (including the custom
|
||||
ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
|
||||
are normalized but some logit processors break the normalization.
|
||||
constraints (`List[Constraint]`, *optional*):
|
||||
constraints (`list[Constraint]`, *optional*):
|
||||
Custom constraints that can be added to the generation to ensure that the output will contain the use of
|
||||
certain tokens as defined by `Constraint` objects, in the most sensible way possible.
|
||||
forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
|
||||
The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
|
||||
multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
|
||||
language token.
|
||||
forced_eos_token_id (`int` or List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
|
||||
forced_eos_token_id (`int` or list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
|
||||
The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
|
||||
list to set multiple *end-of-sequence* tokens.
|
||||
remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
|
||||
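The `bad_words_ids`/`force_words_ids` entries earlier in this hunk are easiest to read next to a usage sketch. The checkpoint below is a placeholder and the exact tokenization of the banned/forced words depends on the tokenizer, so treat this as illustrative only; note that `force_words_ids` only takes effect with beam search.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

# list[list[int]]: each inner list is one banned token sequence
bad_words_ids = tokenizer(["boring", "terrible"], add_special_tokens=False).input_ids
# list[list[list[int]]]: a disjunctive constraint -- any one of these forms must appear
force_words_ids = [tokenizer(["amazing", "wonderful"], add_special_tokens=False).input_ids]

inputs = tokenizer("The movie was", return_tensors="pt")
out = model.generate(
    **inputs,
    num_beams=4,  # constrained decoding requires beam search
    bad_words_ids=bad_words_ids,
    force_words_ids=force_words_ids,
    max_new_tokens=10,
)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```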
@ -274,13 +274,13 @@ class GenerationConfig(PushToHubMixin):
|
||||
This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
|
||||
generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
|
||||
penalty starts and `decay_factor` represents the factor of exponential decay
|
||||
suppress_tokens (`List[int]`, *optional*):
|
||||
suppress_tokens (`list[int]`, *optional*):
|
||||
A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their
|
||||
log probs to `-inf` so that they are not sampled.
|
||||
begin_suppress_tokens (`List[int]`, *optional*):
|
||||
begin_suppress_tokens (`list[int]`, *optional*):
|
||||
A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit
|
||||
processor will set their log probs to `-inf` so that they are not sampled.
|
||||
sequence_bias (`Dict[Tuple[int], float]`, *optional*)):
|
||||
sequence_bias (`dict[tuple[int], float]`, *optional*)):
|
||||
Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
|
||||
sequence being selected, while negative biases do the opposite. Check
|
||||
[`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
|
||||
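A short sketch of the `sequence_bias` mapping documented above, keyed by tuples of token ids as the annotation suggests; the checkpoint and the example words are placeholders.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

def token_ids(word: str) -> tuple[int, ...]:
    return tuple(tokenizer(word, add_special_tokens=False).input_ids)

# negative bias pushes a sequence away from being generated, positive pulls it in
sequence_bias = {token_ids(" cat"): -10.0, token_ids(" dog"): 10.0}

inputs = tokenizer("My favourite animal is a", return_tensors="pt")
out = model.generate(**inputs, sequence_bias=sequence_bias, max_new_tokens=3)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```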
@ -325,7 +325,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
The id of the *padding* token.
|
||||
bos_token_id (`int`, *optional*):
|
||||
The id of the *beginning-of-sequence* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
|
||||
> Generation parameters exclusive to encoder-decoder models
|
||||
@ -333,7 +333,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
|
||||
If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
|
||||
`decoder_input_ids`.
|
||||
decoder_start_token_id (`int` or `List[int]`, *optional*):
|
||||
decoder_start_token_id (`int` or `list[int]`, *optional*):
|
||||
If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length
|
||||
`batch_size`. Indicating a list enables different start ids for each element in the batch
|
||||
(e.g. multilingual models with different target languages in one batch)
|
||||
@ -846,7 +846,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
||||
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
||||
namespace).
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||
"""
|
||||
|
||||
@ -933,7 +933,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or `bool`, *optional*):
|
||||
@ -959,7 +959,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
subfolder (`str`, *optional*, defaults to `""`):
|
||||
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
||||
specify the folder name here.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
|
||||
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the `return_unused_kwargs` keyword parameter.
|
||||
@ -1090,14 +1090,14 @@ class GenerationConfig(PushToHubMixin):
|
||||
return json.loads(text)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig":
|
||||
def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "GenerationConfig":
|
||||
"""
|
||||
Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.
|
||||
|
||||
Args:
|
||||
config_dict (`Dict[str, Any]`):
|
||||
config_dict (`dict[str, Any]`):
|
||||
Dictionary that will be used to instantiate the configuration object.
|
||||
kwargs (`Dict[str, Any]`):
|
||||
kwargs (`dict[str, Any]`):
|
||||
Additional parameters from which to initialize the configuration object.
|
||||
|
||||
Returns:
|
||||
@ -1123,7 +1123,7 @@ class GenerationConfig(PushToHubMixin):
|
||||
else:
|
||||
return config
|
||||
|
||||
def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
|
||||
def dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None:
|
||||
"""
|
||||
Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
|
||||
converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
|
||||
@ -1135,13 +1135,13 @@ class GenerationConfig(PushToHubMixin):
|
||||
if isinstance(value, dict):
|
||||
self.dict_torch_dtype_to_str(value)
|
||||
|
||||
def to_diff_dict(self) -> Dict[str, Any]:
|
||||
def to_diff_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Removes all attributes from config which correspond to the default config attributes for better readability and
|
||||
serializes to a Python dictionary.
|
||||
|
||||
Returns:
|
||||
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
|
||||
@ -1158,12 +1158,12 @@ class GenerationConfig(PushToHubMixin):
|
||||
self.dict_torch_dtype_to_str(serializable_config_dict)
|
||||
return serializable_config_dict
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Serializes this instance to a Python dictionary.
|
||||
|
||||
Returns:
|
||||
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
|
||||
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
|
||||
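The `to_dict`/`from_dict`/`to_diff_dict` trio above round-trips cleanly; a quick sketch (the assertions are indicative, and version-dependent keys such as `transformers_version` may behave slightly differently across releases):

```python
from transformers import GenerationConfig

config = GenerationConfig(max_new_tokens=64, do_sample=True, temperature=0.7)

as_dict = config.to_dict()  # deep-copied dict[str, Any]
assert as_dict["temperature"] == 0.7

restored = GenerationConfig.from_dict(as_dict)
# to_diff_dict keeps only the values that differ from the defaults
assert restored.to_diff_dict() == config.to_diff_dict()
```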
@ -1289,11 +1289,11 @@ class GenerationConfig(PushToHubMixin):
|
||||
returning all the unused kwargs.
|
||||
|
||||
Args:
|
||||
kwargs (`Dict[str, Any]`):
|
||||
kwargs (`dict[str, Any]`):
|
||||
Dictionary of attributes to tentatively update this class.
|
||||
|
||||
Returns:
|
||||
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
|
||||
`dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
|
||||
"""
|
||||
to_remove = []
|
||||
for key, value in kwargs.items():
|
||||
@ -1319,7 +1319,7 @@ class BaseWatermarkingConfig(ABC):
|
||||
Constructs a BaseWatermarkingConfig instance from a dictionary of parameters.
|
||||
|
||||
Args:
|
||||
config_dict (Dict[str, Any]): Dictionary containing configuration parameters.
|
||||
config_dict (dict[str, Any]): Dictionary containing configuration parameters.
|
||||
**kwargs: Additional keyword arguments to override dictionary values.
|
||||
|
||||
Returns:
|
||||
@ -1348,12 +1348,12 @@ class BaseWatermarkingConfig(ABC):
|
||||
|
||||
writer.write(json_string)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Serializes this instance to a Python dictionary.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
|
||||
dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
@ -1479,7 +1479,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
|
||||
Args:
|
||||
ngram_len (`int`):
|
||||
Ngram length.
|
||||
keys (`List[int]`):
|
||||
keys (`list[int]`):
|
||||
A sequence of watermarking keys, one for each depth.
|
||||
context_history_size (`int`, *optional*, defaults to 1024):
|
||||
Size of the tensor to keep track of seen contexts.
|
||||
@ -1518,7 +1518,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
|
||||
def __init__(
|
||||
self,
|
||||
ngram_len: int,
|
||||
keys: List[int],
|
||||
keys: list[int],
|
||||
context_history_size: int = 1024,
|
||||
sampling_table_seed: int = 0,
|
||||
sampling_table_size: int = 2**16,
|
||||
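For orientation, a hedged sketch of how a `SynthIDTextWatermarkingConfig` built from `ngram_len` and `keys` is typically wired into `generate` via `watermarking_config`; the key values and the checkpoint are placeholders.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

watermarking_config = SynthIDTextWatermarkingConfig(
    ngram_len=5,
    keys=[654, 400, 836, 123, 340, 443, 597, 160],  # one key per depth, values arbitrary here
)

inputs = tokenizer("Paris is", return_tensors="pt")
out = model.generate(**inputs, do_sample=True, max_new_tokens=10, watermarking_config=watermarking_config)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```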
@ -1605,6 +1605,6 @@ class CompileConfig:
|
||||
# Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
|
||||
_compile_all_devices = None
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})
|
||||
|
@ -23,7 +23,7 @@ from collections import deque
from dataclasses import dataclass, field
from enum import Enum
from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union
from typing import Optional, Union

import torch
import torch.nn as nn
@ -59,16 +59,16 @@ class GenerationOutput:

Attributes:
request_id (str): The ID of the generation request.
prompt_ids (List[int]): The IDs of the prompt tokens.
generated_tokens (List[int]): The generated tokens.
logprobs (List[float]): The log probabilities of the generated tokens.
prompt_ids (list[int]): The IDs of the prompt tokens.
generated_tokens (list[int]): The generated tokens.
logprobs (list[float]): The log probabilities of the generated tokens.
error (Optional[str]): Any error message associated with the request. When None, the request was successful.
"""

request_id: str
prompt_ids: List[int] = field(default_factory=list)
generated_tokens: List[int] = field(default_factory=list)
logprobs: List[float] = field(default_factory=list)
prompt_ids: list[int] = field(default_factory=list)
generated_tokens: list[int] = field(default_factory=list)
logprobs: list[float] = field(default_factory=list)
error: Optional[str] = None
status: RequestStatus = RequestStatus.PENDING
created_time: float = field(default_factory=time.time)
@ -85,11 +85,11 @@ class RequestState:
|
||||
|
||||
# Required fields
|
||||
request_id: str
|
||||
prompt_ids: Optional[List[int]] = None # the one being processed
|
||||
full_prompt_ids: Optional[List[int]] = None # the full prompt
|
||||
remaining_prompt_ids: List[int] = field(default_factory=list) # For split requests
|
||||
static_outputs: List[int] = field(default_factory=list)
|
||||
allocated_blocks: List[int] = field(default_factory=list)
|
||||
prompt_ids: Optional[list[int]] = None # the one being processed
|
||||
full_prompt_ids: Optional[list[int]] = None # the full prompt
|
||||
remaining_prompt_ids: list[int] = field(default_factory=list) # For split requests
|
||||
static_outputs: list[int] = field(default_factory=list)
|
||||
allocated_blocks: list[int] = field(default_factory=list)
|
||||
position_offset: int = 0 # Current position in the sequence for position_ids
|
||||
status: RequestStatus = RequestStatus.PENDING
|
||||
max_new_tokens: int = 20
|
||||
@ -150,8 +150,8 @@ class PagedAttentionCache(Cache):
|
||||
generation_config: GenerationConfig,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype = torch.float16,
|
||||
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
|
||||
initial_prompt_shapes: Optional[List[List[int]]] = None,
|
||||
layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
|
||||
initial_prompt_shapes: Optional[list[list[int]]] = None,
|
||||
) -> None:
|
||||
"""Initialize a paged attention cache for efficient memory usage.
|
||||
|
||||
@ -191,8 +191,8 @@ class PagedAttentionCache(Cache):
|
||||
self.dtype = dtype
|
||||
self.device = device
|
||||
|
||||
self.key_cache: List[torch.Tensor] = []
|
||||
self.value_cache: List[torch.Tensor] = []
|
||||
self.key_cache: list[torch.Tensor] = []
|
||||
self.value_cache: list[torch.Tensor] = []
|
||||
for idx in range(config.num_hidden_layers):
|
||||
layer_device = layer_device_map[idx] if layer_device_map is not None else device
|
||||
new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device)
|
||||
@ -206,10 +206,10 @@ class PagedAttentionCache(Cache):

# Block management data structures
self._free_blocks = deque(range(num_blocks))
self._block_tables: Dict[str, List[int]] = {}
self._block_tables: dict[str, list[int]] = {}

@traced
def allocate_blocks(self, n_blocks: int, request_id: str) -> List[int]:
def allocate_blocks(self, n_blocks: int, request_id: str) -> list[int]:
"""Allocates n_blocks for a given request_id."""
if len(self._free_blocks) < n_blocks:
return False
@ -236,12 +236,12 @@ class PagedAttentionCache(Cache):
"""Returns the number of free blocks available."""
return len(self._free_blocks)

def get_block_table(self, request_id: str) -> List[int]:
def get_block_table(self, request_id: str) -> list[int]:
"""Returns the block table for a request."""
return self._block_tables.get(request_id, [])

@traced
def _get_physical_indices(self, state: RequestState, logical_indices: List[int]) -> List[int]:
def _get_physical_indices(self, state: RequestState, logical_indices: list[int]) -> list[int]:
"""
Maps logical sequence indices to physical cache indices using the block table, using PyTorch.

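A toy model of the block bookkeeping in the two hunks above: a deque of free physical blocks, a per-request block table, and the logical-to-physical index mapping. `block_size` and the return-value conventions are simplified relative to the real class (which returns `False` on allocation failure).

```python
from collections import deque

block_size = 4
free_blocks = deque(range(8))            # physical block ids
block_tables: dict[str, list[int]] = {}  # request_id -> allocated block ids

def allocate_blocks(n_blocks: int, request_id: str) -> list[int]:
    if len(free_blocks) < n_blocks:
        return []  # simplified failure convention
    allocated = [free_blocks.popleft() for _ in range(n_blocks)]
    block_tables.setdefault(request_id, []).extend(allocated)
    return allocated

def physical_index(request_id: str, logical: int) -> int:
    # logical token position -> slot in the flat paged KV cache
    block = block_tables[request_id][logical // block_size]
    return block * block_size + logical % block_size

allocate_blocks(2, "req-0")
print([physical_index("req-0", i) for i in range(8)])  # [0, 1, 2, 3, 4, 5, 6, 7]
```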
@ -289,7 +289,7 @@ class PagedAttentionCache(Cache):
|
||||
read_index,
|
||||
write_index,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Reshape cache for easier indexing
|
||||
total_slots = self.num_blocks * self.block_size
|
||||
k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim)
|
||||
@ -306,9 +306,9 @@ class Scheduler(ABC):
|
||||
"""
|
||||
|
||||
def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False):
|
||||
self.active_requests: Dict[str, RequestState] = {}
|
||||
self.waiting_requests: Dict[str, RequestState] = {}
|
||||
self.waiting_requests_order: Deque[str] = deque()
|
||||
self.active_requests: dict[str, RequestState] = {}
|
||||
self.waiting_requests: dict[str, RequestState] = {}
|
||||
self.waiting_requests_order: deque[str] = deque()
|
||||
self.cache = cache
|
||||
self.retain_cache_on_finish = retain_cache_on_finish
|
||||
|
||||
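A deliberately tiny illustration of what `schedule_batch(token_budget)` has to decide: which active requests fit in the budget and which waiting requests can still be admitted. The types and per-request costs are simplified and do not mirror the real `RequestState`.

```python
from collections import deque

def schedule_batch(active: dict[str, int], waiting: deque[str], token_budget: int) -> list[str]:
    """Toy FIFO policy: serve active requests first, then admit waiting ones while budget remains."""
    scheduled: list[str] = []
    budget = token_budget
    for request_id, tokens_needed in active.items():
        if tokens_needed <= budget:
            scheduled.append(request_id)
            budget -= tokens_needed
    while waiting and budget > 0:
        scheduled.append(waiting.popleft())
        budget -= 1  # assume one decode token per newly admitted request
    return scheduled

print(schedule_batch({"a": 1, "b": 1}, deque(["c"]), token_budget=3))  # ['a', 'b', 'c']
```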
@ -318,7 +318,7 @@ class Scheduler(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def schedule_batch(self, token_budget: int) -> List[RequestState]:
|
||||
def schedule_batch(self, token_budget: int) -> list[RequestState]:
|
||||
pass
|
||||
|
||||
@traced
|
||||
@ -332,7 +332,7 @@ class Scheduler(ABC):
|
||||
pass
|
||||
|
||||
@traced
|
||||
def get_active_request_static_outputs(self, request_id: str) -> List[int]:
|
||||
def get_active_request_static_outputs(self, request_id: str) -> list[int]:
|
||||
if request_id in self.active_requests:
|
||||
return self.active_requests[request_id].static_outputs
|
||||
return []
|
||||
@ -356,7 +356,7 @@ class FIFOScheduler(Scheduler):
|
||||
|
||||
@traced(span_name="prepare_request")
|
||||
def _prepare_request_for_processing(
|
||||
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str]
|
||||
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
|
||||
):
|
||||
"""Prepare a request for processing in the current batch."""
|
||||
request_tokens = (
|
||||
@ -395,9 +395,9 @@ class FIFOScheduler(Scheduler):
|
||||
self.waiting_requests_order.append(state.request_id)
|
||||
|
||||
@traced
|
||||
def schedule_batch(self, token_budget: int) -> List[RequestState]:
|
||||
priority_states: List[RequestState] = []
|
||||
second_priority_states: List[RequestState] = []
|
||||
def schedule_batch(self, token_budget: int) -> list[RequestState]:
|
||||
priority_states: list[RequestState] = []
|
||||
second_priority_states: list[RequestState] = []
|
||||
scheduled_requests = []
|
||||
|
||||
for state in self.active_requests.values():
|
||||
@ -475,7 +475,7 @@ class PrefillFirstScheduler(Scheduler):
|
||||
|
||||
@traced(span_name="prepare_request")
|
||||
def _prepare_request_for_processing(
|
||||
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str]
|
||||
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
|
||||
):
|
||||
"""Prepare a request for processing in the current batch."""
|
||||
request_tokens = (
|
||||
@ -514,9 +514,9 @@ class PrefillFirstScheduler(Scheduler):
|
||||
self.waiting_requests_order.append(state.request_id)
|
||||
|
||||
@traced
|
||||
def schedule_batch(self, token_budget: int) -> List[RequestState]:
|
||||
priority_states: List[RequestState] = []
|
||||
second_priority_states: List[RequestState] = []
|
||||
def schedule_batch(self, token_budget: int) -> list[RequestState]:
|
||||
priority_states: list[RequestState] = []
|
||||
second_priority_states: list[RequestState] = []
|
||||
scheduled_requests = []
|
||||
|
||||
for state in self.active_requests.values():
|
||||
@ -581,7 +581,7 @@ def compute_optimal_blocks(
|
||||
device: torch.device,
|
||||
config: PretrainedConfig,
|
||||
generation_config: GenerationConfig,
|
||||
inputs: List[List[int]],
|
||||
inputs: list[list[int]],
|
||||
dtype: torch.dtype = torch.bfloat16,
|
||||
safety_margin: float = 0.9,
|
||||
median_prefill_length: Optional[int] = None,
|
||||
@ -678,7 +678,7 @@ class PagedAttentionArgs:
|
||||
write_index: torch.Tensor
|
||||
read_index: torch.Tensor
|
||||
logits_indices: torch.Tensor
|
||||
block_tables: Dict[str, List[int]]
|
||||
block_tables: dict[str, list[int]]
|
||||
cache: PagedAttentionCache
|
||||
use_cache: bool = False
|
||||
|
||||
@ -754,7 +754,7 @@ class ContinuousBatchProcessor:
|
||||
self.streaming = streaming
|
||||
self.manual_eviction = manual_eviction
|
||||
|
||||
self.requests_in_batch: List[RequestState] = []
|
||||
self.requests_in_batch: list[RequestState] = []
|
||||
|
||||
# Get batch size parameters from generation config
|
||||
self._configure_batch_parameters()
|
||||
@ -1152,7 +1152,7 @@ class ContinuousBatchingManager:
|
||||
self._generation_thread = None
|
||||
|
||||
def add_request(
|
||||
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
|
||||
self, input_ids: list[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
|
||||
) -> str:
|
||||
"""Add a new generation request to the queue.
|
||||
|
||||
@ -1184,7 +1184,7 @@ class ContinuousBatchingManager:
|
||||
logger.debug(f"Added request {request_id} to queue.")
|
||||
return request_id
|
||||
|
||||
def add_requests(self, inputs: List[List[int]], **kwargs):
|
||||
def add_requests(self, inputs: list[list[int]], **kwargs):
|
||||
for i, input_ids in enumerate(inputs):
|
||||
# Assign a predictable request ID for ordering results later
|
||||
req_id = f"batch_req_{i}"
|
||||
@ -1428,11 +1428,11 @@ class ContinuousMixin:
|
||||
@torch.inference_mode()
|
||||
def generate_batch(
|
||||
self,
|
||||
inputs: List[List[int]],
|
||||
inputs: list[list[int]],
|
||||
generation_config: Optional[GenerationConfig] = None,
|
||||
progress_bar: bool = True,
|
||||
**kwargs,
|
||||
) -> List[List[int]]:
|
||||
) -> list[list[int]]:
|
||||
"""Generate sequences for a batch of prompts using continuous batching.
|
||||
|
||||
Args:
|
||||
@ -1441,7 +1441,7 @@ class ContinuousMixin:
|
||||
**kwargs: Additional generation parameters
|
||||
|
||||
Returns:
|
||||
`List[List[int]]`: A list containing the generated sequences (including prompt tokens
|
||||
`list[list[int]]`: A list containing the generated sequences (including prompt tokens
|
||||
if not handled otherwise) for each input prompt, in the same order.
|
||||
Returns an empty list `[]` for requests that failed.
|
||||
"""
|
||||
|
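A possible calling pattern for `generate_batch` as documented above, assuming the model class mixes in `ContinuousMixin`; the checkpoint is a placeholder and passing `max_new_tokens` through `**kwargs` is an assumption on my part.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompts = ["The capital of France is", "The capital of Japan is"]
inputs: list[list[int]] = tokenizer(prompts).input_ids  # plain token id lists, not tensors

outputs = model.generate_batch(inputs, max_new_tokens=8)  # list[list[int]], same order as inputs
for generated in outputs:
    print(tokenizer.decode(generated, skip_special_tokens=True))
```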
@ -39,7 +39,7 @@ LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
|
||||
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
|
||||
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
|
||||
search or log softmax for each vocabulary token when using beam search
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional logits processor specific kwargs.
|
||||
|
||||
Return:
|
||||
@ -276,7 +276,7 @@ class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
|
||||
beginning of the generation.
|
||||
|
||||
Args:
|
||||
begin_suppress_tokens (`List[int]`):
|
||||
begin_suppress_tokens (`list[int]`):
|
||||
Tokens to not sample.
|
||||
begin_index (`int`):
|
||||
Index where the tokens are suppressed.
|
||||
|
@ -19,7 +19,7 @@ import copy
import inspect
import warnings
from functools import partial
from typing import Any, Dict, Optional, Union
from typing import Any, Optional, Union

import flax
import jax
@ -103,7 +103,7 @@ class GreedyState:
|
||||
sequences: jnp.ndarray
|
||||
running_token: jnp.ndarray
|
||||
is_sent_finished: jnp.ndarray
|
||||
model_kwargs: Dict[str, jnp.ndarray]
|
||||
model_kwargs: dict[str, jnp.ndarray]
|
||||
|
||||
|
||||
@flax.struct.dataclass
|
||||
@ -113,7 +113,7 @@ class SampleState:
|
||||
running_token: jnp.ndarray
|
||||
is_sent_finished: jnp.ndarray
|
||||
prng_key: jnp.ndarray
|
||||
model_kwargs: Dict[str, jnp.ndarray]
|
||||
model_kwargs: dict[str, jnp.ndarray]
|
||||
|
||||
|
||||
@flax.struct.dataclass
|
||||
@ -124,7 +124,7 @@ class BeamSearchState:
|
||||
sequences: jnp.ndarray
|
||||
scores: jnp.ndarray
|
||||
is_sent_finished: jnp.ndarray
|
||||
model_kwargs: Dict[str, jnp.ndarray]
|
||||
model_kwargs: dict[str, jnp.ndarray]
|
||||
|
||||
|
||||
class FlaxGenerationMixin:
|
||||
@ -173,7 +173,7 @@ class FlaxGenerationMixin:
|
||||
batch_size: int,
|
||||
decoder_start_token_id: Optional[int] = None,
|
||||
bos_token_id: Optional[int] = None,
|
||||
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
|
||||
) -> jnp.ndarray:
|
||||
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
|
||||
# Only use this arg if not None, otherwise just remove from model_kwargs
|
||||
@ -249,7 +249,7 @@ class FlaxGenerationMixin:
|
||||
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
|
||||
raise TypeError(exception_message)
|
||||
|
||||
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
|
||||
def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
|
||||
"""Validates model kwargs for generation. Generate argument typos will also be caught here."""
|
||||
unused_model_args = []
|
||||
model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
|
||||
@ -273,7 +273,7 @@ class FlaxGenerationMixin:
|
||||
generation_config: Optional[GenerationConfig] = None,
|
||||
prng_key: Optional[jnp.ndarray] = None,
|
||||
trace: bool = True,
|
||||
params: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
params: Optional[dict[str, jnp.ndarray]] = None,
|
||||
logits_processor: Optional[FlaxLogitsProcessorList] = None,
|
||||
**kwargs,
|
||||
):
|
||||
@ -293,13 +293,13 @@ class FlaxGenerationMixin:
|
||||
trace (`bool`, *optional*, defaults to `True`):
|
||||
Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
|
||||
considerably slower runtime.
|
||||
params (`Dict[str, jnp.ndarray]`, *optional*):
|
||||
params (`dict[str, jnp.ndarray]`, *optional*):
|
||||
Optionally the model parameters can be passed. Can be useful for parallelized generation.
|
||||
logits_processor (`FlaxLogitsProcessorList `, *optional*):
|
||||
Custom logits processors that complement the default logits processors built from arguments and
|
||||
generation config. If a logit processor is passed that is already created with the arguments or a
|
||||
generation config an error is thrown. This feature is intended for advanced users.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
|
||||
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
|
||||
@ -580,8 +580,8 @@ class FlaxGenerationMixin:
|
||||
eos_token_id: Optional[int] = None,
|
||||
logits_processor: Optional[FlaxLogitsProcessorList] = None,
|
||||
trace: bool = True,
|
||||
params: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
params: Optional[dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
|
||||
):
|
||||
# init values
|
||||
max_length = max_length if max_length is not None else self.generation_config.max_length
|
||||
@ -668,8 +668,8 @@ class FlaxGenerationMixin:
|
||||
logits_processor: Optional[FlaxLogitsProcessorList] = None,
|
||||
logits_warper: Optional[FlaxLogitsProcessorList] = None,
|
||||
trace: bool = True,
|
||||
params: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
params: Optional[dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
|
||||
):
|
||||
# init values
|
||||
max_length = max_length if max_length is not None else self.generation_config.max_length
|
||||
@ -765,9 +765,9 @@ class FlaxGenerationMixin:
|
||||
early_stopping: Optional[Union[bool, str]] = None,
|
||||
logits_processor: Optional[FlaxLogitsProcessorList] = None,
|
||||
trace: bool = True,
|
||||
params: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
params: Optional[dict[str, jnp.ndarray]] = None,
|
||||
num_return_sequences: Optional[int] = None,
|
||||
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
|
||||
model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
|
||||
):
|
||||
"""
|
||||
This beam search function is heavily inspired by Flax's official example:
|
||||
|
@ -16,7 +16,7 @@
import inspect
import math
from collections.abc import Iterable
from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Callable, Optional, Union

import numpy as np
import torch
@ -72,7 +72,7 @@ class LogitsProcessorList(list):
|
||||
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
|
||||
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
|
||||
beam search or log softmax for each vocabulary token when using beam search
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional kwargs that are specific to a logits processor.
|
||||
|
||||
Return:
|
||||
@ -103,7 +103,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
|
||||
Args:
|
||||
min_length (`int`):
|
||||
The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
device (`str`, *optional*, defaults to `"cpu"`):
|
||||
The device to allocate the tensors.
|
||||
@ -134,7 +134,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, min_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"):
|
||||
def __init__(self, min_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
|
||||
if not isinstance(min_length, int) or min_length < 0:
|
||||
raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")
|
||||
|
||||
@ -167,7 +167,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
|
||||
input length.
|
||||
min_new_tokens (`int`):
|
||||
The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`.
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
device (`str`, *optional*, defaults to `"cpu"`):
|
||||
The device to allocate the tensors.
|
||||
@ -197,7 +197,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
|
||||
self,
|
||||
prompt_length_to_skip: int,
|
||||
min_new_tokens: int,
|
||||
eos_token_id: Union[int, List[int], torch.Tensor],
|
||||
eos_token_id: Union[int, list[int], torch.Tensor],
|
||||
device: str = "cpu",
|
||||
):
|
||||
for arg_name, arg_value in [
|
||||
@ -917,7 +917,7 @@ def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
|
||||
|
||||
def _calc_banned_ngram_tokens(
|
||||
ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int
|
||||
) -> List[Iterable[int]]:
|
||||
) -> list[Iterable[int]]:
|
||||
"""Copied from fairseq for no_repeat_ngram in beam_search"""
|
||||
if cur_len + 1 < ngram_size:
|
||||
# return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
|
||||
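The fairseq-derived ngram banning above reduces to a small dictionary trick; a self-contained sketch (single hypothesis, no tensors) of the same idea:

```python
def banned_next_tokens(prev_tokens: list[int], ngram_size: int) -> list[int]:
    """Toy no_repeat_ngram check: ban any token that would complete an already-seen ngram."""
    if len(prev_tokens) + 1 < ngram_size:
        return []
    seen: dict[tuple[int, ...], list[int]] = {}
    for i in range(len(prev_tokens) - ngram_size + 1):
        prefix = tuple(prev_tokens[i : i + ngram_size - 1])
        seen.setdefault(prefix, []).append(prev_tokens[i + ngram_size - 1])
    current_prefix = tuple(prev_tokens[len(prev_tokens) - ngram_size + 1 :])
    return seen.get(current_prefix, [])

print(banned_next_tokens([5, 3, 8, 5, 3], ngram_size=3))  # [8] -- emitting 8 would repeat (5, 3, 8)
```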
@ -1074,7 +1074,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
sequence_bias (`List[List[Union[List[int], float]]]`):
|
||||
sequence_bias (`list[list[Union[list[int], float]]]`):
|
||||
List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0],
|
||||
[[64], -7.5]]`). Positive biases increase the odds of the
|
||||
sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias
|
||||
@ -1123,7 +1123,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, sequence_bias: List[List[Union[List[int], float]]]):
|
||||
def __init__(self, sequence_bias: list[list[Union[list[int], float]]]):
|
||||
self.sequence_bias = sequence_bias
|
||||
self._validate_arguments()
|
||||
self._convert_list_arguments_into_dict()
|
||||
@ -1250,9 +1250,9 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
bad_words_ids (`List[List[int]]`):
|
||||
bad_words_ids (`list[list[int]]`):
|
||||
List of list of token ids that are not allowed to be generated.
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`, *optional*):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
|
||||
Examples:
|
||||
@ -1291,7 +1291,7 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, bad_words_ids: List[List[int]], eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None
|
||||
self, bad_words_ids: list[list[int]], eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None
|
||||
):
|
||||
self.bad_word_ids = bad_words_ids
|
||||
self._validate_arguments()
|
||||
@ -1332,7 +1332,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
|
||||
generation. See [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904) for more information.
|
||||
|
||||
Args:
|
||||
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
|
||||
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`):
|
||||
This function constraints the beam search to allowed tokens only at each step. This function takes 2
|
||||
arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
|
||||
next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
|
||||
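A compact usage sketch for `prefix_allowed_tokens_fn`; the whitelist and checkpoint are placeholders, and the signature `(batch_id, input_ids) -> list[int]` is the one documented above.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

allowed_ids = tokenizer(" yes no maybe", add_special_tokens=False).input_ids  # toy whitelist

def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor) -> list[int]:
    # called at every step for every beam; here we ignore the history and always allow the same ids
    return allowed_ids

inputs = tokenizer("Answer:", return_tensors="pt")
out = model.generate(**inputs, num_beams=2, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_new_tokens=3)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```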
@ -1373,7 +1373,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
|
||||
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]], num_beams: int):
|
||||
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
|
||||
self._num_beams = num_beams
|
||||
|
||||
@ -1586,7 +1586,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
|
||||
Args:
|
||||
max_length (`int`):
|
||||
The maximum length of the sequence to be generated.
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
device (`str`, *optional*, defaults to `"cpu"`):
|
||||
The device to allocate the tensors.
|
||||
@ -1613,7 +1613,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, max_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"):
|
||||
def __init__(self, max_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
|
||||
self.max_length = max_length
|
||||
|
||||
if not isinstance(eos_token_id, torch.Tensor):
|
||||
@ -1666,7 +1666,7 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
|
||||
exponential_decay_length_penalty (`tuple(int, float)`):
|
||||
This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty
|
||||
starts and `decay_factor` represents the factor of exponential decay
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
input_ids_seq_length (`int`):
|
||||
The length of the input sequence.
|
||||
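The `(start_index, decay_factor)` pair above defines an exponential schedule; the snippet below only shows the shape of that schedule, not the exact scaling the processor applies to the eos logit.

```python
def eos_boost(cur_len: int, start_index: int, decay_factor: float) -> float:
    """Relative boost applied once generation runs past start_index; grows exponentially."""
    if cur_len <= start_index:
        return 0.0
    return decay_factor ** (cur_len - start_index) - 1.0

for step in (10, 15, 20, 25):
    print(step, round(eos_boost(step, start_index=15, decay_factor=1.1), 3))
# 10 0.0 / 15 0.0 / 20 0.611 / 25 1.594
```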
@ -1726,8 +1726,8 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
exponential_decay_length_penalty: Tuple[int, float],
|
||||
eos_token_id: Union[int, List[int], torch.Tensor],
|
||||
exponential_decay_length_penalty: tuple[int, float],
|
||||
eos_token_id: Union[int, list[int], torch.Tensor],
|
||||
input_ids_seq_length: int,
|
||||
):
|
||||
self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
|
||||
@ -2326,13 +2326,13 @@ class BarkEosPrioritizerLogitsProcessor(LogitsProcessor):
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
min_eos_p (`float`, *optional*):
|
||||
Minimum end of speech threshold.
|
||||
"""
|
||||
|
||||
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor], min_eos_p: float, device: str = "cpu"):
|
||||
def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor], min_eos_p: float, device: str = "cpu"):
|
||||
if not isinstance(eos_token_id, torch.Tensor):
|
||||
if isinstance(eos_token_id, int):
|
||||
eos_token_id = [eos_token_id]
|
||||
@ -2569,7 +2569,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
|
||||
Args:
|
||||
ngram_len (`int`):
|
||||
Ngram length.
|
||||
keys (`List[int]`):
|
||||
keys (`list[int]`):
|
||||
A sequence of watermarking keys, one for each depth.
|
||||
sampling_table_size (`int`):
|
||||
Size of the sampling table.
|
||||
@ -2610,7 +2610,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
ngram_len: int,
|
||||
keys: List[int],
|
||||
keys: list[int],
|
||||
sampling_table_size: int,
|
||||
sampling_table_seed: int,
|
||||
context_history_size: int,
|
||||
@ -2808,7 +2808,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
|
||||
|
||||
def _compute_keys(
|
||||
self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor
|
||||
) -> Tuple[torch.LongTensor, torch.LongTensor]:
|
||||
) -> tuple[torch.LongTensor, torch.LongTensor]:
|
||||
"""Computes random keys for each ngram and depth.
|
||||
|
||||
Args:
|
||||
|
@ -3,7 +3,7 @@ import warnings
from abc import ABC
from collections import OrderedDict
from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union
from typing import Optional, Union

import numpy as np
import torch
@ -33,7 +33,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
|
||||
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
|
||||
or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
|
||||
make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional stopping criteria specific kwargs.
|
||||
|
||||
Return:
|
||||
@ -209,7 +209,7 @@ class StopStringCriteria(StoppingCriteria):
|
||||
Args:
|
||||
tokenizer (`PreTrainedTokenizer`):
|
||||
The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences)
|
||||
stop_strings (`Union[str, List[str]]`):
|
||||
stop_strings (`Union[str, list[str]]`):
|
||||
A list of strings that should end generation. If a string is passed, it will be treated like a
|
||||
list with a single element.
|
||||
|
||||
@ -239,10 +239,10 @@ class StopStringCriteria(StoppingCriteria):
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, List[str]]):
|
||||
def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, list[str]]):
|
||||
if isinstance(stop_strings, str):
|
||||
stop_strings = [stop_strings]
|
||||
self.stop_strings: Tuple[str, ...] = tuple(stop_strings)
|
||||
self.stop_strings: tuple[str, ...] = tuple(stop_strings)
|
||||
vocab = tokenizer.get_vocab()
|
||||
token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
|
||||
self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
|
||||
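Since `StopStringCriteria` needs the tokenizer's vocabulary for its precomputed tables, `stop_strings` is normally passed to `generate` together with `tokenizer`; a short sketch with a placeholder checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("1, 2, 3, 4,", return_tensors="pt")
# the tokenizer kwarg lets generate build StopStringCriteria internally
out = model.generate(**inputs, stop_strings=["7"], tokenizer=tokenizer, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```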
@ -298,7 +298,7 @@ class StopStringCriteria(StoppingCriteria):
|
||||
@staticmethod
|
||||
def _stop_string_get_matching_positions(
|
||||
token_list, token_indices, stop_strings
|
||||
) -> Tuple[Dict[str, Dict[str, List[int]]], Dict[str, Dict[str, List[int]]]]:
|
||||
) -> tuple[dict[str, dict[str, list[int]]], dict[str, dict[str, list[int]]]]:
|
||||
"""This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can
|
||||
validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the
|
||||
token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters
|
||||
@ -337,7 +337,7 @@ class StopStringCriteria(StoppingCriteria):
|
||||
return token_valid_positions, token_end_overlaps
|
||||
|
||||
@staticmethod
|
||||
def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> Dict[str, torch.tensor]:
|
||||
def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> dict[str, torch.tensor]:
|
||||
"""This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs
|
||||
them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values
|
||||
that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!"""
|
||||
@ -455,11 +455,11 @@ class EosTokenCriteria(StoppingCriteria):
|
||||
By default, it uses the `model.generation_config.eos_token_id`.
|
||||
|
||||
Args:
|
||||
eos_token_id (`Union[int, List[int], torch.Tensor]`):
|
||||
eos_token_id (`Union[int, list[int], torch.Tensor]`):
|
||||
The id(s) of the *end-of-sequence* token.
|
||||
"""
|
||||
|
||||
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor]):
|
||||
def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor]):
|
||||
if not isinstance(eos_token_id, torch.Tensor):
|
||||
if isinstance(eos_token_id, int):
|
||||
eos_token_id = [eos_token_id]
|
||||
|
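The same int / list[int] / Tensor normalization shows up in several of the classes touched by this diff; factored out, it is just:

```python
from typing import Union

import torch

def as_eos_tensor(eos_token_id: Union[int, list[int], torch.Tensor]) -> torch.Tensor:
    # mirrors the pattern repeated in the processors and criteria above
    if not isinstance(eos_token_id, torch.Tensor):
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        eos_token_id = torch.tensor(eos_token_id)
    return eos_token_id

print(as_eos_tensor(2))           # tensor([2])
print(as_eos_tensor([2, 50256]))  # tensor([    2, 50256])
```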
@ -14,7 +14,6 @@
# limitations under the License.

import inspect
from typing import List, Tuple

import numpy as np
import tensorflow as tf
@ -42,7 +41,7 @@ TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
|
||||
cur_len (`int`):
|
||||
The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
|
||||
is the maximum length generate can produce, and we need to know which of its tokens are valid.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional logits processor specific kwargs.
|
||||
|
||||
Return:
|
||||
@ -290,7 +289,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
|
||||
[`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
|
||||
|
||||
Args:
|
||||
bad_words_ids (`List[List[int]]`):
|
||||
bad_words_ids (`list[list[int]]`):
|
||||
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
|
||||
that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
|
||||
the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
|
||||
@ -300,8 +299,8 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
|
||||
The id of the *end-of-sequence* token.
|
||||
"""
|
||||
|
||||
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
|
||||
if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0:
|
||||
def __init__(self, bad_words_ids: list[list[int]], eos_token_id: int):
|
||||
if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
|
||||
raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
|
||||
if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
|
||||
raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
|
||||
@ -370,7 +369,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
|
||||
# To remain simple and XLA-compatible, we work on a per-row fashion.
|
||||
# TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
|
||||
# a frequent choke point. (make `cur_len` a tensor?)
|
||||
def _get_row_updated_score(row_inputs: Tuple[tf.Tensor]) -> tf.Tensor:
|
||||
def _get_row_updated_score(row_inputs: tuple[tf.Tensor]) -> tf.Tensor:
|
||||
row_input_ids, row_score = row_inputs
|
||||
banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
|
||||
banned_tokens_mask = tf.scatter_nd(
|
||||
@ -565,7 +564,7 @@ class TFForceTokensLogitsProcessor(TFLogitsProcessor):
|
||||
indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
|
||||
`-inf` so that they are sampled at their corresponding index."""
|
||||
|
||||
def __init__(self, force_token_map: List[List[int]]):
|
||||
def __init__(self, force_token_map: list[list[int]]):
|
||||
force_token_map = dict(force_token_map)
|
||||
# Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
|
||||
# index of the array corresponds to the index of the token to be forced, for XLA compatibility.
|
||||
|
@ -18,7 +18,7 @@ import copy
import inspect
import warnings
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, Optional, Union

import numpy as np
import tensorflow as tf
@ -77,9 +77,9 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -116,12 +116,12 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -147,9 +147,9 @@ class TFSampleDecoderOnlyOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -186,12 +186,12 @@ class TFSampleEncoderDecoderOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -223,10 +223,10 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput):
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
sequences_scores: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
beam_indices: Optional[tf.Tensor] = None
|
||||
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -270,13 +270,13 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput):
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
sequences_scores: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
beam_indices: Optional[tf.Tensor] = None
|
||||
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
encoder_attentions: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -308,10 +308,10 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput):
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
sequences_scores: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
beam_indices: Optional[tf.Tensor] = None
|
||||
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -354,13 +354,13 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput):
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
sequences_scores: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
beam_indices: Optional[tf.Tensor] = None
|
||||
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
encoder_attentions: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -385,9 +385,9 @@ class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -423,12 +423,12 @@ class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: Optional[tf.Tensor] = None
|
||||
scores: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
|
||||
scores: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_attentions: Optional[tuple[tf.Tensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
|
||||
|
||||
|
||||
TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
|
||||
@ -477,7 +477,7 @@ class TFGenerationMixin:
|
||||
def compute_transition_scores(
|
||||
self,
|
||||
sequences: tf.Tensor,
|
||||
scores: Tuple[tf.Tensor],
|
||||
scores: tuple[tf.Tensor],
|
||||
beam_indices: Optional[tf.Tensor] = None,
|
||||
normalize_logits: bool = False,
|
||||
) -> tf.Tensor:
|
||||
@ -619,7 +619,7 @@ class TFGenerationMixin:
|
||||
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
|
||||
raise TypeError(exception_message)
|
||||
|
||||
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
|
||||
def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
|
||||
"""Validates model kwargs for generation. Generate argument typos will also be caught here."""
|
||||
# Excludes arguments that are handled before calling any model function
|
||||
if self.config.is_encoder_decoder:
|
||||
@ -681,10 +681,10 @@ class TFGenerationMixin:
|
||||
Custom logits processors that complement the default logits processors built from arguments and
|
||||
generation config. If a logit processor is passed that is already created with the arguments or a
|
||||
generation config an error is thrown. This feature is intended for advanced users.
|
||||
seed (`List[int]`, *optional*):
|
||||
seed (`list[int]`, *optional*):
|
||||
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
|
||||
`seed` argument from stateless functions in `tf.random`.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
|
||||
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
|
||||
@ -1044,7 +1044,7 @@ class TFGenerationMixin:
|
||||
|
||||
def _prepare_encoder_decoder_kwargs_for_generation(
|
||||
self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
# 1. get encoder and store encoder outputs
|
||||
encoder = self.get_encoder()
|
||||
|
||||
@ -1076,10 +1076,10 @@ class TFGenerationMixin:
|
||||
self,
|
||||
batch_size: int,
|
||||
model_input_name: str,
|
||||
model_kwargs: Dict[str, tf.Tensor],
|
||||
model_kwargs: dict[str, tf.Tensor],
|
||||
decoder_start_token_id: Optional[int] = None,
|
||||
bos_token_id: Optional[int] = None,
|
||||
) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
|
||||
) -> tuple[tf.Tensor, dict[str, tf.Tensor]]:
|
||||
"""Prepares `decoder_input_ids` for generation with encoder-decoder models"""
|
||||
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
|
||||
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
|
||||
@ -1138,7 +1138,7 @@ class TFGenerationMixin:
|
||||
input_ids: Optional[tf.Tensor] = None,
|
||||
expand_in_new_axis: bool = False,
|
||||
**model_kwargs,
|
||||
) -> Tuple[tf.Tensor, Dict[str, Any]]:
|
||||
) -> tuple[tf.Tensor, dict[str, Any]]:
|
||||
"""
|
||||
Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
|
||||
depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
|
||||
@ -1174,8 +1174,8 @@ class TFGenerationMixin:
|
||||
self,
|
||||
inputs: Optional[tf.Tensor] = None,
|
||||
bos_token_id: Optional[int] = None,
|
||||
model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
|
||||
) -> Tuple[tf.Tensor, Optional[str], Dict[str, tf.Tensor]]:
|
||||
model_kwargs: Optional[dict[str, tf.Tensor]] = None,
|
||||
) -> tuple[tf.Tensor, Optional[str], dict[str, tf.Tensor]]:
|
||||
"""
|
||||
This function extracts the model-specific `inputs` for generation.
|
||||
"""
|
||||
@ -1240,7 +1240,7 @@ class TFGenerationMixin:
|
||||
self,
|
||||
inputs: Optional[tf.Tensor] = None,
|
||||
bos_token_id: Optional[int] = None,
|
||||
model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
|
||||
model_kwargs: Optional[dict[str, tf.Tensor]] = None,
|
||||
) -> tf.Tensor:
|
||||
"""Initializes input ids for generation, if necessary."""
|
||||
if inputs is not None:
|
||||
@ -1276,8 +1276,8 @@ class TFGenerationMixin:
|
||||
return past_key_values
|
||||
|
||||
def _update_model_kwargs_for_generation(
|
||||
self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
self, outputs: ModelOutput, model_kwargs: dict[str, Any], is_encoder_decoder: bool = False
|
||||
) -> dict[str, Any]:
|
||||
# update past_key_values
|
||||
model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs)
|
||||
|
||||
@ -1294,7 +1294,7 @@ class TFGenerationMixin:
|
||||
def _update_model_kwargs_for_xla_generation(
|
||||
self,
|
||||
model_outputs: ModelOutput,
|
||||
model_kwargs: Dict[str, Any],
|
||||
model_kwargs: dict[str, Any],
|
||||
cur_len: int,
|
||||
max_length: int,
|
||||
batch_size: int,
|
||||
@ -1550,7 +1550,7 @@ class TFGenerationMixin:
|
||||
The maximum length of the sequence to be generated.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
output_attentions (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||
@ -1794,7 +1794,7 @@ class TFGenerationMixin:
|
||||
max_length: Optional[int] = None,
|
||||
pad_token_id: Optional[int] = None,
|
||||
eos_token_id: Optional[int] = None,
|
||||
seed: Optional[Tuple[int, int]] = None,
|
||||
seed: Optional[tuple[int, int]] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
output_scores: Optional[bool] = None,
|
||||
@ -1818,9 +1818,9 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*):
eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
seed (`List[int]`, *optional*):
seed (`list[int]`, *optional*):
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
`seed` argument from stateless functions in `tf.random`.
output_attentions (`bool`, *optional*, defaults to `False`):
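As a minimal sketch of the TF-side `seed` argument described above (checkpoint name and prompt are arbitrary):

```python
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("The quick brown", return_tensors="tf")
# `seed` is the two-integer stateless seed forwarded to `tf.random`; fixing it makes sampling reproducible
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=10, seed=[42, 0])
print(tokenizer.decode(outputs[0]))
```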
@ -2128,7 +2128,7 @@ class TFGenerationMixin:
|
||||
The maximum length of the sequence to be generated.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
length_penalty (`float`, *optional*, defaults to 1.0):
|
||||
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
|
||||
@ -2719,7 +2719,7 @@ class TFGenerationMixin:
|
||||
The maximum length of the sequence to be generated.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the *padding* token.
|
||||
eos_token_id (`Union[int, List[int]]`, *optional*):
|
||||
eos_token_id (`Union[int, list[int]]`, *optional*):
|
||||
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
|
||||
output_attentions (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||
|
@ -18,7 +18,7 @@ import inspect
|
||||
import os
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -169,11 +169,11 @@ class GenerateDecoderOnlyOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: torch.LongTensor
|
||||
scores: Optional[Tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
|
||||
scores: Optional[tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -214,14 +214,14 @@ class GenerateEncoderDecoderOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
sequences: torch.LongTensor
|
||||
scores: Optional[Tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[Tuple[torch.FloatTensor]] = None
|
||||
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
|
||||
scores: Optional[tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[tuple[torch.FloatTensor]] = None
|
||||
encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -260,12 +260,12 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
|
||||
|
||||
sequences: torch.LongTensor
|
||||
sequences_scores: Optional[torch.FloatTensor] = None
|
||||
scores: Optional[Tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[Tuple[torch.FloatTensor]] = None
|
||||
scores: Optional[tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[tuple[torch.FloatTensor]] = None
|
||||
beam_indices: Optional[torch.LongTensor] = None
|
||||
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
|
||||
attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -314,15 +314,15 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
|
||||
|
||||
sequences: torch.LongTensor
|
||||
sequences_scores: Optional[torch.FloatTensor] = None
|
||||
scores: Optional[Tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[Tuple[torch.FloatTensor]] = None
|
||||
scores: Optional[tuple[torch.FloatTensor]] = None
|
||||
logits: Optional[tuple[torch.FloatTensor]] = None
|
||||
beam_indices: Optional[torch.LongTensor] = None
|
||||
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
|
||||
encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
|
||||
encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
|
||||
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
|
||||
|
||||
|
||||
# TODO (joao): remove the equivalent classes and typing shortcuts below in v5
|
||||
@ -457,7 +457,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
input_ids: torch.LongTensor,
|
||||
inputs_embeds: Optional[torch.FloatTensor],
|
||||
cache_position: Optional[torch.LongTensor],
|
||||
) -> Tuple[torch.FloatTensor, torch.LongTensor]:
|
||||
) -> tuple[torch.FloatTensor, torch.LongTensor]:
|
||||
"""
|
||||
Generic cache-dependent input preparation
|
||||
The code is put in a separate function to allow granular unit testing
|
||||
@ -491,7 +491,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
input_ids: torch.LongTensor,
|
||||
inputs_embeds: Optional[torch.FloatTensor],
|
||||
cache_position: Optional[torch.LongTensor],
|
||||
) -> Tuple[torch.FloatTensor, torch.LongTensor]:
|
||||
) -> tuple[torch.FloatTensor, torch.LongTensor]:
|
||||
"""
|
||||
This method implements method ``_cache_dependant_input_preparation``
|
||||
with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
|
||||
@ -697,8 +697,8 @@ class GenerationMixin(ContinuousMixin):
|
||||
self,
|
||||
inputs: Optional[torch.Tensor] = None,
|
||||
bos_token_id: Optional[torch.Tensor] = None,
|
||||
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]:
|
||||
model_kwargs: Optional[dict[str, torch.Tensor]] = None,
|
||||
) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
|
||||
"""
|
||||
This function extracts the model-specific `inputs` for generation.
|
||||
"""
|
||||
@ -761,7 +761,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
self,
|
||||
inputs: Optional[torch.Tensor] = None,
|
||||
bos_token_id: Optional[torch.Tensor] = None,
|
||||
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
||||
model_kwargs: Optional[dict[str, torch.Tensor]] = None,
|
||||
) -> torch.LongTensor:
|
||||
"""Initializes input ids for generation, if necessary."""
|
||||
if inputs is not None:
|
||||
@ -793,7 +793,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
self,
|
||||
inputs_tensor: torch.Tensor,
|
||||
generation_config: GenerationConfig,
|
||||
model_kwargs: Dict[str, Any],
|
||||
model_kwargs: dict[str, Any],
|
||||
) -> torch.LongTensor:
|
||||
pad_token_id = generation_config._pad_token_tensor
|
||||
eos_token_id = generation_config._eos_token_tensor
|
||||
@ -831,7 +831,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
model_kwargs,
|
||||
model_input_name: Optional[str],
|
||||
generation_config: GenerationConfig,
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
# 1. get encoder
|
||||
encoder = self.get_encoder()
|
||||
# Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device
|
||||
@ -870,10 +870,10 @@ class GenerationMixin(ContinuousMixin):
|
||||
self,
|
||||
batch_size: int,
|
||||
model_input_name: str,
|
||||
model_kwargs: Dict[str, torch.Tensor],
|
||||
model_kwargs: dict[str, torch.Tensor],
|
||||
decoder_start_token_id: torch.Tensor,
|
||||
device: Optional[torch.device] = None,
|
||||
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
|
||||
) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
|
||||
"""Prepares `decoder_input_ids` for generation with encoder-decoder models"""
|
||||
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
|
||||
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
|
||||
@ -931,7 +931,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
is_encoder_decoder: bool = False,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
**model_kwargs,
|
||||
) -> Tuple[torch.LongTensor, Dict[str, Any]]:
|
||||
) -> tuple[torch.LongTensor, dict[str, Any]]:
|
||||
"""Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
|
||||
# Do not call torch.repeat_interleave if expand_size is 1 because it clones
|
||||
# the input tensor and thus requires more memory although no change is applied
|
||||
@ -963,10 +963,10 @@ class GenerationMixin(ContinuousMixin):
|
||||
def _update_model_kwargs_for_generation(
|
||||
self,
|
||||
outputs: ModelOutput,
|
||||
model_kwargs: Dict[str, Any],
|
||||
model_kwargs: dict[str, Any],
|
||||
is_encoder_decoder: bool = False,
|
||||
num_new_tokens: int = 1,
|
||||
) -> Dict[str, Any]:
|
||||
) -> dict[str, Any]:
|
||||
# update past_key_values keeping its naming used in model code
|
||||
for possible_cache_name in ALL_CACHE_NAMES:
|
||||
if possible_cache_name in outputs:
|
||||
@ -1024,7 +1024,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
logits_processor: LogitsProcessorList,
|
||||
target_tokenizer: "PreTrainedTokenizerBase",
|
||||
assistant_tokenizer: "PreTrainedTokenizerBase",
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
) -> CandidateGenerator:
|
||||
"""
|
||||
Returns the candidate generator to be used in `assisted_generation`
|
||||
@ -1100,10 +1100,10 @@ class GenerationMixin(ContinuousMixin):
|
||||
generation_config: GenerationConfig,
|
||||
input_ids_seq_length: Optional[int] = None,
|
||||
encoder_input_ids: torch.LongTensor = None,
|
||||
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
|
||||
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
device: Optional[str] = None,
|
||||
model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
negative_prompt_ids: Optional[torch.Tensor] = None,
|
||||
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
|
||||
) -> LogitsProcessorList:
|
||||
@ -1403,7 +1403,7 @@ class GenerationMixin(ContinuousMixin):
def compute_transition_scores(
self,
sequences: torch.Tensor,
scores: Tuple[torch.Tensor],
scores: tuple[torch.Tensor],
beam_indices: Optional[torch.Tensor] = None,
normalize_logits: bool = False,
) -> torch.Tensor:
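A minimal usage sketch for `compute_transition_scores` (a small causal LM checkpoint is chosen only for illustration): the `scores` returned by `generate` with `output_scores=True` are fed back in to recover per-token log-probabilities.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["Today is"], return_tensors="pt")
outputs = model.generate(
    **inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True
)

# One score tensor per generated token; normalize_logits converts raw logits into log-probabilities
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)
print(transition_scores.shape)  # (batch_size, generated_tokens)
```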
@ -1552,7 +1552,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}."
|
||||
)
|
||||
|
||||
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
|
||||
def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
|
||||
"""Validates model kwargs for generation. Generate argument typos will also be caught here."""
|
||||
# If a `Cache` instance is passed, checks whether the model is compatible with it
|
||||
if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
|
||||
@ -1709,8 +1709,8 @@ class GenerationMixin(ContinuousMixin):
|
||||
return generation_config
|
||||
|
||||
def _prepare_generation_config(
|
||||
self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Dict
|
||||
) -> Tuple[GenerationConfig, Dict]:
|
||||
self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict
|
||||
) -> tuple[GenerationConfig, dict]:
|
||||
"""
|
||||
Prepares the base generation config, then applies any generation configuration options from kwargs. This
|
||||
function handles retrocompatibility with respect to configuration files.
|
||||
@ -1821,7 +1821,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
model_kwargs["cache_position"] = cache_position
|
||||
return model_kwargs
|
||||
|
||||
def _get_layer_device_map_for_cache_init(self) -> Optional[Dict[int, Union[str, int]]]:
|
||||
def _get_layer_device_map_for_cache_init(self) -> Optional[dict[int, Union[str, int]]]:
|
||||
"""
|
||||
Returns the device map for each decoder layer, to allocate the cache on the right device.
|
||||
Inspired from `dispatch_model` in accelerate.
|
||||
@ -1982,7 +1982,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
def _prepare_cache_for_generation(
|
||||
self,
|
||||
generation_config: GenerationConfig,
|
||||
model_kwargs: Dict,
|
||||
model_kwargs: dict,
|
||||
assistant_model: "PreTrainedModel",
|
||||
batch_size: int,
|
||||
max_cache_length: int,
|
||||
@ -2191,7 +2191,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
generation_config._pad_token_tensor = pad_token_tensor
|
||||
generation_config._decoder_start_token_tensor = decoder_start_token_tensor
|
||||
|
||||
def _valid_auto_compile_criteria(self, model_kwargs: Dict, generation_config: GenerationConfig) -> bool:
|
||||
def _valid_auto_compile_criteria(self, model_kwargs: dict, generation_config: GenerationConfig) -> bool:
|
||||
"""
|
||||
Determines whether to trigger auto-compilation of the model's forward pass at generation time.
|
||||
"""
|
||||
@ -2239,7 +2239,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
generation_config: Optional[GenerationConfig] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
|
||||
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
|
||||
synced_gpus: Optional[bool] = None,
|
||||
assistant_model: Optional["PreTrainedModel"] = None,
|
||||
streamer: Optional["BaseStreamer"] = None,
|
||||
@ -2287,7 +2287,7 @@ class GenerationMixin(ContinuousMixin):
generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constraints the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
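A short sketch of such a callback, reusing `model` and `inputs` from the sketch above; the allowed id range is arbitrary, and a real constraint would usually depend on the tokens generated so far.

```python
def restrict_token_ids(batch_id: int, input_ids) -> list[int]:
    # Toy constraint: at every step only token ids in a fixed range may be generated
    return list(range(50, 500))

constrained = model.generate(
    **inputs,
    num_beams=4,
    max_new_tokens=10,
    prefix_allowed_tokens_fn=restrict_token_ids,
)
```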
@ -2321,7 +2321,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
function defined in that reposity's `custom_generate/generate.py` file will be executed instead of the
|
||||
standard `generate` method. Note that the logic is for generation is entirely defined in that
|
||||
repository, and the return type may be different from the standard `generate` method.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
|
||||
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
|
||||
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
|
||||
@ -2695,7 +2695,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
|
||||
def typeerror():
|
||||
raise ValueError(
|
||||
"`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` "
|
||||
"`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]` "
|
||||
f"of positive integers, but is {generation_config.force_words_ids}."
|
||||
)
|
||||
|
||||
@ -2871,7 +2871,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
def _dola_decoding(
|
||||
self,
|
||||
input_ids: torch.LongTensor,
|
||||
dola_layers: Union[str, List[int]],
|
||||
dola_layers: Union[str, list[int]],
|
||||
logits_processor: LogitsProcessorList,
|
||||
stopping_criteria: StoppingCriteriaList,
|
||||
generation_config: GenerationConfig,
|
||||
@ -2888,7 +2888,7 @@ class GenerationMixin(ContinuousMixin):
Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
dola_layers (`Union[str, List[int]]`):
dola_layers (`Union[str, list[int]]`):
The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
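A hedged sketch of how `dola_layers` is passed through `generate`, again reusing `model` and `inputs` from the earlier sketch; the explicit layer indices below are made up.

```python
# DoLa contrasts the final layer's logits with earlier ("premature") layers
out_high = model.generate(
    **inputs, do_sample=False, max_new_tokens=20, dola_layers="high", repetition_penalty=1.2
)
# A list of candidate layer indices can be given instead of "low"/"high"
out_custom = model.generate(
    **inputs, do_sample=False, max_new_tokens=20, dola_layers=[4, 6, 8, 10], repetition_penalty=1.2
)
```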
@ -3806,7 +3806,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
num_beams: int,
|
||||
vocab_size: int,
|
||||
batch_size: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Get top-K continuations given the accumulated log probs on the next token.
|
||||
|
||||
@ -3855,7 +3855,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
topk_running_beam_indices: torch.Tensor,
|
||||
next_token_hits_stopping_criteria: torch.Tensor,
|
||||
num_beams: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
|
||||
best non-finished beams to continue beam search in the next iteration.
|
||||
@ -3886,7 +3886,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
decoder_prompt_len: int,
|
||||
length_penalty: float,
|
||||
early_stopping: Union[bool, str],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
|
||||
the current finished sequences.
|
||||
@ -5236,8 +5236,8 @@ def _split(data, full_batch_size: int, split_size: int):
|
||||
|
||||
|
||||
def _split_model_inputs(
|
||||
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig
|
||||
) -> List[Union[ModelOutput, Dict]]:
|
||||
model_input: Union[ModelOutput, dict], split_size: int, full_batch_size: int, config: PretrainedConfig
|
||||
) -> list[Union[ModelOutput, dict]]:
|
||||
"""
|
||||
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
|
||||
size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
|
||||
@ -5292,14 +5292,14 @@ def _split_model_inputs(
|
||||
]
|
||||
|
||||
# Convert each dictionary in the list to an object of the inferred class
|
||||
split_model_inputs: List[Union[ModelOutput, Dict]] = [
|
||||
split_model_inputs: list[Union[ModelOutput, dict]] = [
|
||||
model_output_cls(**data_split, **bool_data) for data_split in data_split_list
|
||||
]
|
||||
|
||||
return split_model_inputs
|
||||
|
||||
|
||||
def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput:
|
||||
def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConfig) -> ModelOutput:
|
||||
"""
|
||||
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
|
||||
specific ModelOutput subclass from the list provided.
|
||||
@ -5379,8 +5379,8 @@ def _relative_top_filter(
|
||||
|
||||
|
||||
def _dola_select_contrast(
|
||||
candidate_premature_layers: List[int],
|
||||
candidate_premature_logits: Dict[int, torch.FloatTensor],
|
||||
candidate_premature_layers: list[int],
|
||||
candidate_premature_logits: dict[int, torch.FloatTensor],
|
||||
final_logits: torch.FloatTensor,
|
||||
) -> torch.FloatTensor:
|
||||
if len(candidate_premature_layers) == 1:
|
||||
|
@ -16,7 +16,7 @@
|
||||
import collections
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -126,7 +126,7 @@ class WatermarkDetector:
|
||||
self,
|
||||
model_config: PretrainedConfig,
|
||||
device: str,
|
||||
watermarking_config: Union[WatermarkingConfig, Dict],
|
||||
watermarking_config: Union[WatermarkingConfig, dict],
|
||||
ignore_repeated_ngrams: bool = False,
|
||||
max_cache_size: int = 128,
|
||||
):
|
||||
@ -300,7 +300,7 @@ class BayesianDetectorWatermarkedLikelihood(nn.Module):
|
||||
self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth))
|
||||
self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth))
|
||||
|
||||
def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
def _compute_latents(self, g_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Computes the unique token probability distribution given g-values.
|
||||
|
||||
Args:
|
||||
|
@ -81,7 +81,7 @@ def HfArg(
|
||||
```
|
||||
|
||||
Args:
|
||||
aliases (Union[str, List[str]], optional):
|
||||
aliases (Union[str, list[str]], optional):
|
||||
Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`.
|
||||
Defaults to None.
|
||||
help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None.
|
||||
@ -119,7 +119,7 @@ class HfArgumentParser(ArgumentParser):
|
||||
Args:
|
||||
dataclass_types (`DataClassType` or `Iterable[DataClassType]`, *optional*):
|
||||
Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Passed to `argparse.ArgumentParser()` in the regular way.
|
||||
"""
|
||||
|
||||
|
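A small self-contained sketch of `HfArgumentParser` in action (the dataclass and field names are invented for illustration):

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class TrainingOptions:
    learning_rate: float = field(default=3e-5, metadata={"help": "Peak learning rate."})
    batch_size: int = field(default=8, metadata={"help": "Per-device batch size."})


parser = HfArgumentParser(TrainingOptions)
(options,) = parser.parse_args_into_dataclasses(args=["--learning_rate", "1e-4"])
print(options.learning_rate, options.batch_size)  # 0.0001 8
```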
@ -127,7 +127,7 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
resume_download:
|
||||
Deprecated and ignored. All downloads are now resumed by default when possible.
|
||||
Will be removed in v5 of Transformers.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
proxies (`dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
|
||||
token (`str` or `bool`, *optional*):
|
||||
@ -153,7 +153,7 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
subfolder (`str`, *optional*, defaults to `""`):
|
||||
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
|
||||
specify the folder name here.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
The values in kwargs of any keys which are image processor attributes will be used to override the
|
||||
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
|
||||
controlled by the `return_unused_kwargs` keyword parameter.
|
||||
@ -219,7 +219,7 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
|
||||
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
|
||||
namespace).
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||
"""
|
||||
use_auth_token = kwargs.pop("use_auth_token", None)
|
||||
@ -286,7 +286,7 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
The name of the file in the model directory to use for the image processor config.
|
||||
|
||||
Returns:
|
||||
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
|
||||
`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
|
||||
"""
|
||||
cache_dir = kwargs.pop("cache_dir", None)
|
||||
force_download = kwargs.pop("force_download", False)
|
||||
@ -387,11 +387,11 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
|
||||
|
||||
Args:
|
||||
image_processor_dict (`Dict[str, Any]`):
|
||||
image_processor_dict (`dict[str, Any]`):
|
||||
Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
|
||||
retrieved from a pretrained checkpoint by leveraging the
|
||||
[`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
|
||||
kwargs (`Dict[str, Any]`):
|
||||
kwargs (`dict[str, Any]`):
|
||||
Additional parameters from which to initialize the image processor object.
|
||||
|
||||
Returns:
|
||||
@ -431,7 +431,7 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
Serializes this instance to a Python dictionary.
|
||||
|
||||
Returns:
|
||||
`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
|
||||
`dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
output["image_processor_type"] = self.__class__.__name__
|
||||
|
@ -130,7 +130,7 @@ class BaseImageProcessor(ImageProcessingMixin):
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
Image to center crop.
|
||||
size (`Dict[str, int]`):
|
||||
size (`dict[str, int]`):
|
||||
Size of the output image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format for the output image. If unset, the channel dimension format of the input
|
||||
@ -227,7 +227,7 @@ def get_size_dict(
|
||||
is set, it is added to the dict as `{"longest_edge": max_size}`.
|
||||
|
||||
Args:
|
||||
size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*):
|
||||
size (`Union[int, Iterable[int], dict[str, int]]`, *optional*):
|
||||
The `size` parameter to be cast into a size dictionary.
|
||||
max_size (`Optional[int]`, *optional*):
|
||||
The `max_size` parameter to be cast into a size dictionary.
|
||||
|
@ -382,7 +382,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
|
||||
Args:
|
||||
image (`"torch.Tensor"`):
|
||||
Image to center crop.
|
||||
size (`Dict[str, int]`):
|
||||
size (`dict[str, int]`):
|
||||
Size of the output image.
|
||||
|
||||
Returns:
|
||||
@ -666,12 +666,12 @@ class SemanticSegmentationMixin:
|
||||
Args:
|
||||
outputs ([`MobileNetV2ForSemanticSegmentation`]):
|
||||
Raw outputs of the model.
|
||||
target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
|
||||
target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
|
||||
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
|
||||
predictions will not be resized.
|
||||
|
||||
Returns:
|
||||
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
|
||||
semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
|
||||
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
|
||||
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
|
||||
"""
|
||||
|
@ -217,7 +217,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in
|
||||
Computes the output image size given the input image size and the desired output size.
|
||||
|
||||
Args:
|
||||
image_size (`Tuple[int, int]`):
|
||||
image_size (`tuple[int, int]`):
|
||||
The input image size.
|
||||
size (`int`):
|
||||
The desired output size.
|
||||
@ -266,7 +266,7 @@ def get_resize_output_image_size(
|
||||
Args:
|
||||
input_image (`np.ndarray`):
|
||||
The image to resize.
|
||||
size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
|
||||
size (`int` or `tuple[int, int]` or list[int] or `tuple[int]`):
|
||||
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
|
||||
this.
|
||||
|
||||
@ -334,7 +334,7 @@ def resize(
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
The image to resize.
|
||||
size (`Tuple[int, int]`):
|
||||
size (`tuple[int, int]`):
|
||||
The size to use for resizing the image.
|
||||
resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
The filter to user for resampling.
|
||||
@ -464,7 +464,7 @@ def center_crop(
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
The image to crop.
|
||||
size (`Tuple[int, int]`):
|
||||
size (`tuple[int, int]`):
|
||||
The target size for the cropped image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The channel dimension format for the output image. Can be one of:
|
||||
@ -704,7 +704,7 @@ def pad(
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
The image to pad.
|
||||
padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
|
||||
padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
|
||||
Padding to apply to the edges of the height, width axes. Can be one of three formats:
|
||||
- `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
|
||||
- `((before, after),)` yields same before and after pad for height and width.
|
||||
|
@ -218,7 +218,7 @@ def make_flat_list_of_images(
|
||||
Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
|
||||
If the input is a nested list of images, it is converted to a flat list of images.
|
||||
Args:
|
||||
images (`Union[List[ImageInput], ImageInput]`):
|
||||
images (`Union[list[ImageInput], ImageInput]`):
|
||||
The input image.
|
||||
Returns:
|
||||
list: A list of images or a 4d array of images.
|
||||
@ -252,7 +252,7 @@ def make_nested_list_of_images(
|
||||
"""
|
||||
Ensure that the output is a nested list of images.
|
||||
Args:
|
||||
images (`Union[List[ImageInput], ImageInput]`):
|
||||
images (`Union[list[ImageInput], ImageInput]`):
|
||||
The input image.
|
||||
Returns:
|
||||
list: A list of list of images or a list of 4d array of images.
|
||||
@ -300,7 +300,7 @@ def infer_channel_dimension_format(
|
||||
Args:
|
||||
image (`np.ndarray`):
|
||||
The image to infer the channel dimension of.
|
||||
num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
|
||||
num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
|
||||
The number of channels of the image.
|
||||
|
||||
Returns:
|
||||
@ -393,7 +393,7 @@ def get_image_size_for_max_height_width(
|
||||
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
|
||||
|
||||
Args:
|
||||
image_size (`Tuple[int, int]`):
|
||||
image_size (`tuple[int, int]`):
|
||||
The image to resize.
|
||||
max_height (`int`):
|
||||
The maximum allowed height.
|
||||
@ -678,9 +678,9 @@ class ImageFeatureExtractionMixin:
|
||||
Args:
|
||||
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
||||
The image to normalize.
|
||||
mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
|
||||
mean (`list[float]` or `np.ndarray` or `torch.Tensor`):
|
||||
The mean (per channel) to use for normalization.
|
||||
std (`List[float]` or `np.ndarray` or `torch.Tensor`):
|
||||
std (`list[float]` or `np.ndarray` or `torch.Tensor`):
|
||||
The standard deviation (per channel) to use for normalization.
|
||||
rescale (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
|
||||
@ -729,7 +729,7 @@ class ImageFeatureExtractionMixin:
|
||||
Args:
|
||||
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
|
||||
The image to resize.
|
||||
size (`int` or `Tuple[int, int]`):
|
||||
size (`int` or `tuple[int, int]`):
|
||||
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
|
||||
matched to this.
|
||||
|
||||
@ -797,7 +797,7 @@ class ImageFeatureExtractionMixin:
|
||||
Args:
|
||||
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
|
||||
The image to resize.
|
||||
size (`int` or `Tuple[int, int]`):
|
||||
size (`int` or `tuple[int, int]`):
|
||||
The size to which crop the image.
|
||||
|
||||
Returns:
|
||||
|
@ -156,7 +156,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
model (`torch.nn.Module`): The model to inspect.

Returns:
List[List[str]]: A list of lists of parameter names being all tied together.
list[list[str]]: A list of lists of parameter names being all tied together.

Example:
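A minimal sketch of the kind of example that belongs here, assuming `find_tied_parameters` is already in scope (the module layout is not shown in this hunk):

```python
from collections import OrderedDict

import torch.nn as nn

model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))]))
model.linear2.weight = model.linear1.weight  # tie the two weight tensors

print(find_tied_parameters(model))  # expected: [['linear1.weight', 'linear2.weight']]
```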
|
||||
|
@ -306,7 +306,7 @@ def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
|
||||
Fuse the LayerNorm layers into a target class using autoawq
|
||||
|
||||
Args:
|
||||
fuse_module_names (`List[str]`):
|
||||
fuse_module_names (`list[str]`):
|
||||
The list of module names to fuse
|
||||
module (`nn.Module`):
|
||||
The pytorch parent module that has layernorm modules to fuse
|
||||
@ -333,7 +333,7 @@ def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_
|
||||
The input pretrained model
|
||||
current_module_name (`str`):
|
||||
The current submodule name
|
||||
fuse_module_names (`List[str]`):
|
||||
fuse_module_names (`list[str]`):
|
||||
The list of module names to fuse. For the MLP layers it has to be an array
|
||||
of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
|
||||
module (`nn.Module`):
|
||||
@ -374,7 +374,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
|
||||
The input pretrained model
|
||||
module (`nn.Module`):
|
||||
The pytorch parent module that has layernorm modules to fuse
|
||||
modules_to_fuse (`List[str]`):
|
||||
modules_to_fuse (`list[str]`):
|
||||
The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
|
||||
in the correct order: q, k, v, o layer
|
||||
current_module_name (`str`):
|
||||
|
@ -398,10 +398,10 @@ def replace_with_bitnet_linear(
|
||||
Parameters:
|
||||
model (`torch.nn.Module`):
|
||||
Input model or `torch.nn.Module` as the function is run recursively.
|
||||
modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
Names of the modules to not convert in `BitLinear`. In practice we keep the `lm_head` in full precision
|
||||
for numerical stability reasons.
|
||||
current_key_name (`List[`str`]`, *optional*):
|
||||
current_key_name (`list[`str`]`, *optional*):
|
||||
An array to track the current key of the recursion. This is used to check whether the current key (part of
|
||||
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
|
||||
`disk`).
|
||||
|
@ -243,10 +243,10 @@ def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name
|
||||
Parameters:
|
||||
model (`torch.nn.Module`):
|
||||
Input model or `torch.nn.Module` as the function is run recursively.
|
||||
modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
|
||||
for numerical stability reasons.
|
||||
current_key_name (`List[`str`]`, *optional*):
|
||||
current_key_name (`list[`str`]`, *optional*):
|
||||
An array to track the current key of the recursion. This is used to check whether the current key (part of
|
||||
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
|
||||
`disk`).
|
||||
|
@ -93,10 +93,10 @@ def replace_with_eetq_linear(
|
||||
Parameters:
|
||||
model (`torch.nn.Module`):
|
||||
Input model or `torch.nn.Module` as the function is run recursively.
|
||||
modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
Names of the modules to not convert in `EetqLinear`. In practice we keep the `lm_head` in full precision
|
||||
for numerical stability reasons.
|
||||
current_key_name (`List[`str`]`, *optional*):
|
||||
current_key_name (`list[`str`]`, *optional*):
|
||||
An array to track the current key of the recursion. This is used to check whether the current key (part of
|
||||
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
|
||||
`disk`).
|
||||
|
@ -251,10 +251,10 @@ def replace_with_fbgemm_fp8_linear(
|
||||
Parameters:
|
||||
model (`torch.nn.Module`):
|
||||
Input model or `torch.nn.Module` as the function is run recursively.
|
||||
modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
|
||||
Names of the modules to not convert in `FP8Linear`. In practice we keep the `lm_head` in full precision
|
||||
for numerical stability reasons.
|
||||
current_key_name (`List[`str`]`, *optional*):
|
||||
current_key_name (`list[`str`]`, *optional*):
|
||||
An array to track the current key of the recursion. This is used to check whether the current key (part of
|
||||
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
|
||||
`disk`).
|
||||
|
@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging
|
||||
|
||||
@ -45,7 +45,7 @@ def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
|
||||
tl.store(s_ptr + pid, s)
|
||||
|
||||
|
||||
def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
def act_quant(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert x.is_contiguous()
|
||||
assert x.shape[-1] % block_size == 0
|
||||
y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
|
||||
@ -149,7 +149,7 @@ def w8a8_block_fp8_matmul_triton(
|
||||
B: torch.Tensor,
|
||||
As: torch.Tensor,
|
||||
Bs: torch.Tensor,
|
||||
block_size: List[int],
|
||||
block_size: list[int],
|
||||
output_dtype: torch.dtype = torch.float32,
|
||||
) -> torch.Tensor:
|
||||
"""This function performs matrix multiplication with block-wise
|
||||
@ -231,7 +231,7 @@ def w8a8_block_fp8_matmul_compile(
|
||||
weight_q: torch.Tensor, # [out_features, hidden_dim]
|
||||
input_scale: torch.Tensor, # [batch * seq_len, num_input_groups]
|
||||
weight_scale: torch.Tensor, # [num_weight_blocks_m, num_weight_blocks_n]
|
||||
block_size: Optional[Tuple[int, int]] = None, # (M=128, N=128) for weights for example
|
||||
block_size: Optional[tuple[int, int]] = None, # (M=128, N=128) for weights for example
|
||||
output_dtype: torch.dtype = torch.float32,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
@ -300,7 +300,7 @@ class FP8Linear(nn.Linear):
|
||||
out_features: int,
|
||||
bias: bool = False,
|
||||
dtype=None,
|
||||
block_size: Optional[Tuple[int, int]] = None,
|
||||
block_size: Optional[tuple[int, int]] = None,
|
||||
device=None,
|
||||
activation_scheme="dynamic",
|
||||
):
|
||||
|
@ -1,4 +1,4 @@
|
||||
from typing import Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@ -22,7 +22,7 @@ def flash_attention_forward(
|
||||
sliding_window: Optional[int] = None,
|
||||
softcap: Optional[float] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, None]:
|
||||
) -> tuple[torch.Tensor, None]:
|
||||
if kwargs.get("output_attentions", False) or kwargs.get("head_mask", None) is not None:
|
||||
logger.warning_once(
|
||||
"`flash_attention_2` does not support `output_attentions=True` or `head_mask`."
|
||||
|
@ -26,7 +26,7 @@ Citation:
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from packaging import version
|
||||
@ -106,7 +106,7 @@ def make_flex_block_causal_mask(
|
||||
attention_chunk_size: Optional[int] = None,
|
||||
query_length=None,
|
||||
key_length=None,
|
||||
offsets: Optional[Tuple[Offset, Offset]] = None,
|
||||
offsets: Optional[tuple[Offset, Offset]] = None,
|
||||
is_causal: Optional[bool] = True,
|
||||
) -> "BlockMask":
|
||||
"""
|
||||
@ -234,7 +234,7 @@ def flex_attention_forward(
|
||||
softcap: Optional[float] = None,
|
||||
head_mask: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
if head_mask is not None:
|
||||
logger.warning_once(
|
||||
"`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
|
||||
|
@ -11,7 +11,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Dict, Union
|
||||
from typing import Union
|
||||
|
||||
from ..utils import is_torchdynamo_compiling
|
||||
|
||||
@ -29,7 +29,7 @@ try:
|
||||
|
||||
_hub_kernels_available = True
|
||||
|
||||
_KERNEL_MAPPING: Dict[str, Dict[Union[Device, str], LayerRepository]] = {
|
||||
_KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
|
||||
"MultiScaleDeformableAttention": {
|
||||
"cuda": LayerRepository(
|
||||
repo_id="kernels-community/deformable-detr",
|
||||
|
@ -29,7 +29,7 @@ import tempfile
|
||||
from dataclasses import asdict, fields
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import packaging.version
|
||||
@ -1692,7 +1692,7 @@ class NeptuneCallback(TrainerCallback):
|
||||
|
||||
raise Exception("The trainer doesn't have a NeptuneCallback configured.")
|
||||
|
||||
def on_log(self, args, state, control, logs: Optional[Dict[str, float]] = None, **kwargs):
|
||||
def on_log(self, args, state, control, logs: Optional[dict[str, float]] = None, **kwargs):
|
||||
if not state.is_world_process_zero:
|
||||
return
|
||||
|
||||
|
@ -16,7 +16,7 @@ import importlib
|
||||
import inspect
|
||||
import re
|
||||
import warnings
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from packaging import version
|
||||
|
||||
@ -100,11 +100,11 @@ class PeftAdapterMixin:
|
||||
max_memory: Optional[str] = None,
|
||||
offload_folder: Optional[str] = None,
|
||||
offload_index: Optional[int] = None,
|
||||
peft_config: Optional[Dict[str, Any]] = None,
|
||||
adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None,
|
||||
peft_config: Optional[dict[str, Any]] = None,
|
||||
adapter_state_dict: Optional[dict[str, "torch.Tensor"]] = None,
|
||||
low_cpu_mem_usage: bool = False,
|
||||
is_trainable: bool = False,
|
||||
adapter_kwargs: Optional[Dict[str, Any]] = None,
|
||||
adapter_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we
|
||||
@ -133,7 +133,7 @@ class PeftAdapterMixin:
|
||||
Whether to use authentication token to load the remote folder. Useful to load private repositories
|
||||
that are on HuggingFace Hub. You might need to call `huggingface-cli login` and paste your tokens to
|
||||
cache it.
|
||||
device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
|
||||
device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
|
||||
A map that specifies where each submodule should go. It doesn't need to be refined to each
|
||||
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
|
||||
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
|
||||
@ -150,10 +150,10 @@ class PeftAdapterMixin:
|
||||
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
|
||||
offload_index (`int`, `optional`):
|
||||
`offload_index` argument to be passed to `accelerate.dispatch_model` method.
|
||||
peft_config (`Dict[str, Any]`, *optional*):
|
||||
peft_config (`dict[str, Any]`, *optional*):
|
||||
The configuration of the adapter to add, supported adapters are non-prefix tuning and adaption prompts
|
||||
methods. This argument is used in case users directly pass PEFT state dicts
|
||||
adapter_state_dict (`Dict[str, torch.Tensor]`, *optional*):
|
||||
adapter_state_dict (`dict[str, torch.Tensor]`, *optional*):
|
||||
The state dict of the adapter to load. This argument is used in case users directly pass PEFT state
|
||||
dicts
|
||||
low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
|
||||
@ -162,7 +162,7 @@ class PeftAdapterMixin:
|
||||
is_trainable (`bool`, *optional*, defaults to `False`):
|
||||
Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
|
||||
used for inference.
|
||||
adapter_kwargs (`Dict[str, Any]`, *optional*):
|
||||
adapter_kwargs (`dict[str, Any]`, *optional*):
|
||||
Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and
|
||||
`find_adapter_config_file` method.
|
||||
"""
|
||||
@ -348,7 +348,7 @@ class PeftAdapterMixin:
|
||||
|
||||
self.set_adapter(adapter_name)
|
||||
|
||||
def set_adapter(self, adapter_name: Union[List[str], str]) -> None:
|
||||
def set_adapter(self, adapter_name: Union[list[str], str]) -> None:
|
||||
"""
|
||||
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
|
||||
official documentation: https://huggingface.co/docs/peft
|
||||
@ -356,7 +356,7 @@ class PeftAdapterMixin:
Sets a specific adapter by forcing the model to use a that adapter and disable the other adapters.

Args:
adapter_name (`Union[List[str], str]`):
adapter_name (`Union[list[str], str]`):
The name of the adapter to set. Can be also a list of strings to set multiple adapters.
"""
check_peft_version(min_version=MIN_PEFT_VERSION)
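A hedged sketch of switching adapters with `set_adapter`; the checkpoint and adapter repository below are illustrative and assume `peft` is installed and the adapters load correctly.

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
model.load_adapter("ybelkada/opt-350m-lora", adapter_name="adapter_1")  # adapter repo chosen for illustration
model.load_adapter("ybelkada/opt-350m-lora", adapter_name="adapter_2")

model.set_adapter("adapter_1")                 # route forward passes through adapter_1 only
model.set_adapter(["adapter_1", "adapter_2"])  # or activate several adapters at once
```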
@ -438,7 +438,7 @@ class PeftAdapterMixin:
|
||||
else:
|
||||
module.disable_adapters = False
|
||||
|
||||
def active_adapters(self) -> List[str]:
|
||||
def active_adapters(self) -> list[str]:
|
||||
"""
|
||||
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
|
||||
official documentation: https://huggingface.co/docs/peft
|
||||
@ -518,7 +518,7 @@ class PeftAdapterMixin:
|
||||
accelerate (i.e. with `device_map=xxx`)
|
||||
|
||||
Args:
|
||||
device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
|
||||
device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
|
||||
A map that specifies where each submodule should go. It doesn't need to be refined to each
|
||||
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
|
||||
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
|
||||
@ -562,12 +562,12 @@ class PeftAdapterMixin:
|
||||
**dispatch_model_kwargs,
|
||||
)
|
||||
|
||||
def delete_adapter(self, adapter_names: Union[List[str], str]) -> None:
def delete_adapter(self, adapter_names: Union[list[str], str]) -> None:
"""
Delete an adapter's LoRA layers from the underlying model.

Args:
adapter_names (`Union[List[str], str]`):
adapter_names (`Union[list[str], str]`):
The name(s) of the adapter(s) to delete.

Example:
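An illustrative sketch of the call, reusing the adapter names from the `set_adapter` sketch above:

```python
# Remove one adapter's LoRA layers; the base model weights are left untouched
model.delete_adapter("adapter_1")
# A list removes several adapters in one call
model.delete_adapter(["adapter_2"])
```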