No more Tuple, List, Dict (#38797)

* No more Tuple, List, Dict

* make fixup

* More style fixes

* Docstring fixes with regex replacement

* Trigger tests

* Redo fixes after rebase

* Fix copies

* [test all]

* update

* [test all]

* update

* [test all]

* make style after rebase

* Patch the hf_argparser test

* Patch the hf_argparser test

* style fixes

* style fixes

* style fixes

* Fix docstrings in Cohere test

* [test all]

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Matt 2025-06-17 19:37:18 +01:00 committed by GitHub
parent a396f4324b
commit 508a704055
1291 changed files with 14906 additions and 14941 deletions
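For context, the change applied across these files is purely an annotation update: the deprecated `typing.Dict`, `typing.List` and `typing.Tuple` generics are replaced by the builtin `dict`, `list` and `tuple` (PEP 585, usable on Python 3.9+). A minimal sketch of the pattern, with a hypothetical helper name not taken from the diff:

```python
from typing import Optional  # Dict, List, Tuple no longer need to be imported

# Before: def lookup_ids(tokens: List[str], vocab: Dict[str, int]) -> Tuple[int, ...]:
def lookup_ids(tokens: list[str], vocab: dict[str, int], unk_id: Optional[int] = 0) -> tuple[int, ...]:
    # Builtin generics replace the typing aliases; runtime behaviour is unchanged.
    return tuple(vocab.get(t, unk_id) for t in tokens)


print(lookup_ids(["a", "b"], {"a": 1}))  # (1, 0)
```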

View File

@@ -28,7 +28,7 @@ class MetricsRecorder:
        self.commit_id = commit_id
        self.commit_msg = commit_msg

-    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+    def initialise_benchmark(self, metadata: dict[str, str]) -> int:
        """
        Creates a new benchmark, returns the benchmark id
        """
@@ -55,7 +55,7 @@ class MetricsRecorder:
            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
        )

-    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+    def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
        with self.conn.cursor() as cur:
            cur.execute(
                """
@@ -85,7 +85,7 @@ handler.setFormatter(formatter)
logger.addHandler(handler)

-def parse_arguments() -> Tuple[str, str, str, str]:
+def parse_arguments() -> tuple[str, str, str, str]:
    """
    Parse command line arguments for the benchmarking CLI.
    """

View File

@@ -278,7 +278,7 @@ Here's an example of a single value return:

```python
Returns:
-    `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+    `list[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```

Here's an example of a tuple return, comprising several objects:

View File

@@ -30,7 +30,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -571,7 +571,7 @@ The processor should call the appropriate modality-specific processors within it
    def __call__(
        self,
        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[YourModelProcessorKwargs],

View File

@@ -92,7 +92,7 @@ def custom_attention(
    a_new_kwargs = None,  # You can now add as many kwargs as you need
    another_new_kwargs = None,  # You can now add as many kwargs as you need
    **kwargs,  # You need to accept **kwargs as models will pass other args
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]
    ...  # do your magic!
    return attn_output, attn_weights  # attn_weights are optional here

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -152,7 +152,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
-| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |

## Pitfalls

View File

@@ -62,11 +62,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-    # encode(tokenize) each word from words (List[str])
-    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+    # encode(tokenize) each word from words (list[str])
+    input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

    # get the length of each box
-    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+    tokens_length_list: list[int] = [len(l) for l in input_ids_list]

    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)

View File

@@ -149,7 +149,7 @@ As a summary, consider the following table:
| **Description** | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as "stuff" (i.e. background things like trees and roads) in an image |
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

View File

@@ -83,7 +83,7 @@ def read_video_pyav(container, indices):
    Decode the video with PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
-        indices (`List[int]`): List of frame indices to decode.
+        indices (`list[int]`): List of frame indices to decode.

    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''

View File

@@ -216,12 +216,12 @@ class Olmo2Attention(OlmoAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -294,9 +294,9 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
@@ -494,7 +494,7 @@ class LlamaForCausalLM(nn.Module):
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
@@ -520,7 +520,7 @@ class NewModelForCausalLM(LlamaForCausalLM): | class LlamaForCausalLM(nn.M
| input_ids: torch.LongTensor = None,
| attention_mask: Optional[torch.Tensor] = None,
| position_ids: Optional[torch.LongTensor] = None,
-| past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None,
+| past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = |None,
| inputs_embeds: Optional[torch.FloatTensor] = None,
| labels: Optional[torch.LongTensor] = None,
| use_cache: Optional[bool] = None,

View File

@@ -170,7 +170,7 @@ Unlike other data collators, this specific data collator needs to apply a differ
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -243,7 +243,7 @@ and it uses the exact same dataset as an example. Apply some geometric and color
... )
```

-The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': List[Dict]}`,
+The `image_processor` expects the annotations to be in the following format: `{'image_id': int, 'annotations': list[Dict]}`,
where each dictionary is a COCO object annotation. Let's add a function to reformat annotations for a single example:

```py
@@ -252,9 +252,9 @@ The `image_processor` expects the annotations to be in the following format: `{'
...     Args:
...         image_id (str): image id. e.g. "0001"
-...         categories (List[int]): list of categories/class labels corresponding to provided bounding boxes
-...         areas (List[float]): list of corresponding areas to provided bounding boxes
-...         bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format
+...         categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
+...         areas (list[float]): list of corresponding areas to provided bounding boxes
+...         bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
...             ([center_x, center_y, width, height] in absolute coordinates)
...     Returns:
@@ -397,7 +397,7 @@ Intermediate format of boxes used for training is `YOLO` (normalized) but we wil
...     Args:
...         boxes (torch.Tensor): Bounding boxes in YOLO format
-...         image_size (Tuple[int, int]): Image size in format (height, width)
+...         image_size (tuple[int, int]): Image size in format (height, width)
...     Returns:
...         torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)

View File

@@ -408,7 +408,7 @@ instructs the model to ignore that part of the spectrogram when calculating the
... class TTSDataCollatorWithPadding:
...     processor: Any

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
...         label_features = [{"input_values": feature["labels"]} for feature in features]
...         speaker_features = [feature["speaker_embeddings"] for feature in features]

View File

@@ -48,7 +48,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -166,7 +166,7 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # particiona las entradas y las etiquetas ya que tienen que tener longitudes distintas y
...         # requieren métodos de padding diferentes
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -56,7 +56,7 @@ Optunaに関しては、[object_parameter](https://optuna.readthedocs.io/en/stab
... }
```

-Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` に `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`List[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。
+Optunaは、多目的のハイパーパラメータ最適化(HPO)を提供しています。 `hyperparameter_search` に `direction` を渡し、複数の目的関数値を返すための独自の `compute_objective` を定義することができます。 Pareto Front(`list[BestRun]`)は `hyperparameter_search` で返され、[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py) のテストケース `TrainerHyperParameterMultiObjectOptunaIntegrationTest` を参照する必要があります。これは以下のようになります。

```py

View File

@@ -57,11 +57,11 @@ def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):

    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)

-    # encode(tokenize) each word from words (List[str])
-    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+    # encode(tokenize) each word from words (list[str])
+    input_ids_list: list[list[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]

    # get the length of each box
-    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+    tokens_length_list: list[int] = [len(l) for l in input_ids_list]

    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)

View File

@@ -149,7 +149,7 @@ DETR モデルをインスタンス化するには 3 つの方法があります
| **Description** | 画像内のオブジェクトの周囲の境界ボックスとクラス ラベルを予測する | 画像内のオブジェクト (つまりインスタンス) の周囲のマスクを予測する | 画像内のオブジェクト (インスタンス) と「もの」 (木や道路などの背景) の両方の周囲のマスクを予測します |
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic | |
-| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
+| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `list[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |

View File

@@ -170,7 +170,7 @@ MInDS-14 データセットのサンプリング レートは 8000kHz です (
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -208,7 +208,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
... )
```

-`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': List[Dict]}`,
+`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': list[Dict]}`,
ここで、各辞書は COCO オブジェクトの注釈です。 1 つの例として、注釈を再フォーマットする関数を追加してみましょう。

```py

View File

@@ -408,7 +408,7 @@ Y 軸が反転され、スペクトログラムが上下逆に表示されます
... class TTSDataCollatorWithPadding:
...     processor: Any

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
...         label_features = [{"input_values": feature["labels"]} for feature in features]
...         speaker_features = [feature["speaker_embeddings"] for feature in features]

View File

@@ -46,7 +46,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -172,7 +172,7 @@ MInDS-14 데이터 세트의 샘플링 레이트는 8000kHz이므로([데이터
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # 입력과 레이블을 분할합니다
...         # 길이가 다르고, 각각 다른 패딩 방법을 사용해야 하기 때문입니다
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -201,7 +201,7 @@ DatasetDict({
... )
```

-이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': List[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:
+이미지 프로세서는 어노테이션이 다음과 같은 형식일 것으로 예상합니다: `{'image_id': int, 'annotations': list[Dict]}`, 여기서 각 딕셔너리는 COCO 객체 어노테이션입니다. 단일 예제에 대해 어노테이션의 형식을 다시 지정하는 함수를 추가해 보겠습니다:

```py
>>> def formatted_anns(image_id, category, area, bbox):

View File

@@ -47,7 +47,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -39,7 +39,7 @@ class ResnetConfig(PretrainedConfig):
    def __init__(
        self,
        block_type="bottleneck",
-        layers: List[int] = [3, 4, 6, 3],
+        layers: list[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,

View File

@@ -56,7 +56,7 @@ pip install optuna/sigopt/wandb/ray[tune]
... }
```

-Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`List[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:
+Optuna提供了多目标HPO。您可以在`hyperparameter_search`中传递`direction`参数,并定义自己的`compute_objective`以返回多个目标值。在`hyperparameter_search`中将返回Pareto Front(`list[BestRun]`),您应该参考[test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py)中的测试用例`TrainerHyperParameterMultiObjectOptunaIntegrationTest`。它类似于以下内容:

```py
>>> best_trials = trainer.hyperparameter_search(

View File

@@ -181,7 +181,7 @@ Wav2Vec2 分词器仅训练了大写字符,因此您需要确保文本与分
...     processor: AutoProcessor
...     padding: Union[bool, str] = "longest"

-...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...     def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
...         # split inputs and labels since they have to be of different lengths and need
...         # different padding methods
...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]

View File

@@ -47,7 +47,7 @@ def postprocess_qa_predictions(
    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
-        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+        predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The processed dataset (see the main script for more information).
-        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+        predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):

View File

@@ -184,7 +184,7 @@ class Seq2SeqTrainer(Trainer):
        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (:obj:`dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
@@ -193,7 +193,7 @@ class Seq2SeqTrainer(Trainer):
                Whether or not to return the loss only.

        Return:
-            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+            tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
            A tuple with the loss, logits and labels (each being optional).
        """
        inputs = self._prepare_inputs(inputs)

View File

@@ -530,7 +530,7 @@ def calculate_rouge(
        on multi sentence summaries (CNN/DM dataset).

    Returns:
-        Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
+        dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
    """
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)

View File

@@ -91,11 +91,11 @@ class MyNewModelConfig(PretrainedConfig):
        `beta_slow` (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
-        `short_factor` (`List[float]`, *optional*):
+        `short_factor` (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
-        `long_factor` (`List[float]`, *optional*):
+        `long_factor` (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_new_imgproc_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union

import numpy as np
import torch
@@ -57,11 +57,11 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
-        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
@@ -74,13 +74,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
    def __init__(
        self,
        do_resize: bool = True,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
@@ -101,7 +101,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
    def resize(
        self,
        image: np.ndarray,
-        size: Dict[str, int],
+        size: dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -113,7 +113,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        Args:
            image (`np.ndarray`):
                Image to resize.
-            size (`Dict[str, int]`):
+            size (`dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
@@ -151,13 +151,13 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
@@ -172,7 +172,7 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
@@ -185,9 +185,9 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.

View File

@@ -5,7 +5,7 @@
# modular_add_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that zamba does not have the `apply_rotary_pos_emb` function!
-from typing import Optional, Tuple
+from typing import Optional

import torch
from torch import nn
@@ -62,5 +62,5 @@ class TestAttention(nn.Module):
    def __init__(self):
        pass

-    def forward(self) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    def forward(self) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        _ = apply_rotary_pos_emb(1, 1, 1, 1)

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_dummy.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -210,12 +210,12 @@ class DummyAttention(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -278,9 +278,9 @@ class DummyDecoderLayer(GradientCheckpointingLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

View File

@@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math
import os
-from typing import Optional, Tuple, Union
+from typing import Optional, Union

import torch
from packaging import version
@@ -136,9 +136,9 @@ class DummyBertSelfAttention(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
@@ -245,9 +245,9 @@ class DummyBertSdpaSelfAttention(DummyBertSelfAttention):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
            logger.warning_once(
@@ -386,9 +386,9 @@ class DummyBertAttention(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
@@ -454,9 +454,9 @@ class DummyBertLayer(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor]:
+    ) -> tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
@@ -532,12 +532,12 @@ class DummyBertEncoder(nn.Module):
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
-    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_from_uppercase_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -71,7 +71,7 @@ class FromUppercaseModelAttention(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape
@@ -153,7 +153,7 @@ class FromUppercaseModelEncoderLayer(nn.Module):
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

View File

@@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_multimodal1.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

import torch
from torch import nn
@@ -210,12 +210,12 @@ class Multimodal1TextAttention(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
@@ -278,9 +278,9 @@ class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

View File

@ -5,7 +5,7 @@
# modular_multimodal2.py file directly. One of our CI enforces this. # modular_multimodal2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -81,7 +81,7 @@ class Multimodal2VisionAttention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape batch_size, seq_length, embed_dim = hidden_states.shape
@ -177,7 +177,7 @@ class Multimodal2Attention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, seq_length, embed_dim = hidden_states.shape batch_size, seq_length, embed_dim = hidden_states.shape
@ -244,7 +244,7 @@ class Multimodal2VisionEncoderLayer(nn.Module):
attention_mask: torch.Tensor, attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]: ) -> tuple[torch.FloatTensor]:
""" """
Args: Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the # the file from the modular. If any change should be done, please apply the change to the
# modular_my_new_model2.py file directly. One of our CI enforces this. # modular_my_new_model2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, List, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -208,12 +208,12 @@ class MyNewModel2Attention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)
@ -276,9 +276,9 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False, use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states residual = hidden_states
hidden_states = self.input_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states)
@ -469,7 +469,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
input_ids: Optional[torch.LongTensor] = None, input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,

View File

@ -5,7 +5,7 @@
# modular_new_task_model.py file directly. One of our CI enforces this. # modular_new_task_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from dataclasses import dataclass from dataclasses import dataclass
from typing import ClassVar, List, Optional, Tuple, Union from typing import ClassVar, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -88,9 +88,9 @@ class NewTaskModelCausalLMOutputWithPast(ModelOutput):
loss: Optional[torch.FloatTensor] = None loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[torch.FloatTensor] = None image_hidden_states: Optional[torch.FloatTensor] = None
@ -249,7 +249,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
pixel_values: torch.FloatTensor = None, pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
token_type_ids: Optional[torch.LongTensor] = None, token_type_ids: Optional[torch.LongTensor] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None,
@ -259,7 +259,7 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, NewTaskModelModelOutputWithPast]: ) -> Union[tuple, NewTaskModelModelOutputWithPast]:
r""" r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@ -442,7 +442,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
num_logits_to_keep: int = 0, num_logits_to_keep: int = 0,
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]: ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
r""" r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

View File

@ -6,7 +6,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import math import math
import os import os
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
@ -139,9 +139,9 @@ class RobertaSelfAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
mixed_query_layer = self.query(hidden_states) mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys # If this is instantiated as a cross-attention module, the keys
@ -248,9 +248,9 @@ class RobertaSdpaSelfAttention(RobertaSelfAttention):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
# TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
logger.warning_once( logger.warning_once(
@ -389,9 +389,9 @@ class RobertaAttention(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
self_outputs = self.self( self_outputs = self.self(
hidden_states, hidden_states,
attention_mask, attention_mask,
@ -457,9 +457,9 @@ class RobertaLayer(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: ) -> tuple[torch.Tensor]:
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention( self_attention_outputs = self.attention(
@ -535,12 +535,12 @@ class RobertaEncoder(nn.Module):
head_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False, output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True, return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@ -903,12 +903,12 @@ class RobertaModel(RobertaPreTrainedModel):
inputs_embeds: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[list[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r""" r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if

View File

@ -4,7 +4,7 @@
# the file from the modular. If any change should be done, please apply the change to the # the file from the modular. If any change should be done, please apply the change to the
# modular_super.py file directly. One of our CI enforces this. # modular_super.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Callable, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
from torch import nn from torch import nn
@ -211,12 +211,12 @@ class SuperAttention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)
@ -279,9 +279,9 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
output_attentions: Optional[bool] = False, output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False, use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states residual = hidden_states
hidden_states = self.input_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states)

View File

@ -5,7 +5,7 @@
# modular_switch_function.py file directly. One of our CI enforces this. # modular_switch_function.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Note that llama and cohere have different definitions for rotate_half # Note that llama and cohere have different definitions for rotate_half
from typing import Callable, Optional, Tuple from typing import Callable, Optional
import torch import torch
from torch import nn from torch import nn
@ -123,12 +123,12 @@ class SwitchFunctionAttention(nn.Module):
def forward( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
position_embeddings: Tuple[torch.Tensor, torch.Tensor], position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor],
past_key_value: Optional[Cache] = None, past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[FlashAttentionKwargs], **kwargs: Unpack[FlashAttentionKwargs],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
input_shape = hidden_states.shape[:-1] input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim) hidden_shape = (*input_shape, -1, self.head_dim)

View File

@ -7,7 +7,7 @@
import math import math
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@ -43,7 +43,7 @@ class MultiScaleDeformableAttention(nn.Module):
self, self,
value: Tensor, value: Tensor,
value_spatial_shapes: Tensor, value_spatial_shapes: Tensor,
value_spatial_shapes_list: List[Tuple], value_spatial_shapes_list: list[tuple],
level_start_index: Tensor, level_start_index: Tensor,
sampling_locations: Tensor, sampling_locations: Tensor,
attention_weights: Tensor, attention_weights: Tensor,
@ -124,9 +124,9 @@ class TestDetrDecoderOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[tuple[torch.FloatTensor]] = None
@dataclass @dataclass
@ -177,12 +177,12 @@ class TestDetrModelOutput(ModelOutput):
last_hidden_state: Optional[torch.FloatTensor] = None last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None intermediate_reference_points: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional[torch.FloatTensor] = None enc_outputs_class: Optional[torch.FloatTensor] = None
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
@ -557,7 +557,7 @@ class TestDetrMultiheadAttention(nn.Module):
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None, position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False, output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
batch_size, target_len, embed_dim = hidden_states.size() batch_size, target_len, embed_dim = hidden_states.size()
@ -1431,7 +1431,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
Args: Args:
enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
Returns: Returns:
`tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@ -1499,7 +1499,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None, return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], TestDetrModelOutput]: ) -> Union[tuple[torch.FloatTensor], TestDetrModelOutput]:
r""" r"""
Returns: Returns:

View File

@ -33,7 +33,7 @@ import logging
import os import os
from collections.abc import Iterable from collections.abc import Iterable
from contextlib import nullcontext from contextlib import nullcontext
from typing import Dict, Optional from typing import Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@ -589,7 +589,7 @@ class ContextParallelCollator:
def __init__(self, cp_mesh: Optional[DeviceMesh] = None): def __init__(self, cp_mesh: Optional[DeviceMesh] = None):
self.cp_mesh = cp_mesh self.cp_mesh = cp_mesh
def __call__(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: def __call__(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
batch = default_collate(batch) batch = default_collate(batch)
if self.cp_mesh is not None and self.cp_mesh.size() > 1: if self.cp_mesh is not None and self.cp_mesh.size() > 1:
# Get sequence length from the input batch # Get sequence length from the input batch
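For context, the collator fragment above only needs two pieces: PyTorch's `default_collate` and, when a device mesh is present, a sharding step along the sequence dimension. A reduced sketch of just the collate part (the class name is illustrative and the context-parallel sharding is deliberately elided):

```python
import torch
from torch.utils.data import default_collate


class PassthroughCollator:
    """Reduced sketch: collate only, no context-parallel sharding."""

    def __call__(self, batch: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
        # default_collate stacks per-sample dicts into a single dict of batched tensors;
        # the real collator would then shard the sequence dimension across the CP mesh.
        return default_collate(batch)
```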

View File

@ -66,9 +66,9 @@ def format_image_annotations_as_coco(
Args: Args:
image_id (str): image id. e.g. "0001" image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates) ([center_x, center_y, width, height] in absolute coordinates)
Returns: Returns:
@ -101,7 +101,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args: Args:
boxes (torch.Tensor): Bounding boxes in YOLO format boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width) image_size (tuple[int, int]): Image size in format (height, width)
Returns: Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
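The conversion described in that docstring is the standard center-to-corners transform followed by rescaling to absolute pixel coordinates. A self-contained sketch, assuming normalized YOLO boxes (the helper name is illustrative):

```python
import torch


def yolo_to_pascal_voc(boxes: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
    """Convert normalized (center_x, center_y, width, height) boxes to absolute corner coordinates."""
    height, width = image_size
    cx, cy, w, h = boxes.unbind(-1)
    corners = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)
    # Scale from normalized [0, 1] coordinates to pixels.
    return corners * torch.tensor([width, height, width, height], dtype=corners.dtype)
```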

View File

@ -67,9 +67,9 @@ def format_image_annotations_as_coco(
Args: Args:
image_id (str): image id. e.g. "0001" image_id (str): image id. e.g. "0001"
categories (List[int]): list of categories/class labels corresponding to provided bounding boxes categories (list[int]): list of categories/class labels corresponding to provided bounding boxes
areas (List[float]): list of corresponding areas to provided bounding boxes areas (list[float]): list of corresponding areas to provided bounding boxes
bboxes (List[Tuple[float]]): list of bounding boxes provided in COCO format bboxes (list[tuple[float]]): list of bounding boxes provided in COCO format
([center_x, center_y, width, height] in absolute coordinates) ([center_x, center_y, width, height] in absolute coordinates)
Returns: Returns:
@ -103,7 +103,7 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: tuple[int, int]
Args: Args:
boxes (torch.Tensor): Bounding boxes in YOLO format boxes (torch.Tensor): Bounding boxes in YOLO format
image_size (Tuple[int, int]): Image size in format (height, width) image_size (tuple[int, int]): Image size in format (height, width)
Returns: Returns:
torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)

View File

@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
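The `predictions` tuple documented above pairs start and end logits per feature, and the heart of the post-processing is scoring candidate (start, end) spans. A stripped-down sketch for a single feature, ignoring offset mapping, n-best lists and null answers:

```python
import numpy as np


def best_span(start_logits: np.ndarray, end_logits: np.ndarray, max_answer_length: int = 30) -> tuple[int, int]:
    """Return the (start, end) token indices with the highest combined logit score."""
    best, best_score = (0, 0), -np.inf
    for start in np.argsort(start_logits)[-20:]:      # top-20 start candidates
        for end in np.argsort(end_logits)[-20:]:      # top-20 end candidates
            if start <= end < start + max_answer_length:
                score = start_logits[start] + end_logits[end]
                if score > best_score:
                    best_score, best = score, (int(start), int(end))
    return best
```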

View File

@ -47,7 +47,7 @@ def postprocess_qa_predictions(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
@ -270,7 +270,7 @@ def postprocess_qa_predictions_with_beam_search(
Args: Args:
examples: The non-preprocessed dataset (see the main script for more information). examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information). features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): predictions (:obj:`tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`. first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):

View File

@ -22,7 +22,8 @@ line-length = 119
ignore = ["C901", "E501", "E741", "F402", "F823" ] ignore = ["C901", "E501", "E741", "F402", "F823" ]
# RUF013: Checks for the use of implicit Optional # RUF013: Checks for the use of implicit Optional
# in type annotations when the default parameter value is None. # in type annotations when the default parameter value is None.
select = ["C", "E", "F", "I", "W", "RUF013"] select = ["C", "E", "F", "I", "W", "RUF013", "UP006"]
extend-safe-fixes = ["UP006"]
# Ignore import violations in all `__init__.py` files. # Ignore import violations in all `__init__.py` files.
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
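UP006 is ruff's pyupgrade rule for non-PEP 585 annotations; listing it under `extend-safe-fixes` lets `ruff check --fix` rewrite the old `typing` aliases automatically. Roughly what the autofix does (illustrative snippet, not taken from the repo):

```python
# Before the fix (flagged by UP006):
#     from typing import Dict, List
#     def count_tokens(texts: List[str]) -> Dict[str, int]: ...
#
# After `ruff check --fix`:
def count_tokens(texts: list[str]) -> dict[str, int]:
    counts: dict[str, int] = {}
    for text in texts:
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
```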

View File

@ -19,7 +19,7 @@ and remove unnecessary dependencies.
import os import os
import warnings import warnings
from io import BytesIO from io import BytesIO
from typing import List, Optional, Tuple, Union from typing import Optional, Union
import numpy as np import numpy as np
import requests import requests
@ -70,7 +70,7 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
AudioInput = Union[ AudioInput = Union[
np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821 np.ndarray, "torch.Tensor", list[np.ndarray], tuple[np.ndarray], list["torch.Tensor"], tuple["torch.Tensor"] # noqa: F821
] ]
@ -88,7 +88,7 @@ def make_list_of_audio(
""" """
Ensure that the output is a list of audio. Ensure that the output is a list of audio.
Args: Args:
audio (`Union[List[AudioInput], AudioInput]`): audio (`Union[list[AudioInput], AudioInput]`):
The input audio. The input audio.
Returns: Returns:
list: A list of audio. list: A list of audio.
@ -246,7 +246,7 @@ def chroma_filter_bank(
Tuning deviation from A440 in fractions of a chroma bin. Tuning deviation from A440 in fractions of a chroma bin.
power (`float`, *optional*, defaults to 2.0): power (`float`, *optional*, defaults to 2.0):
If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm. If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`): weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
the second element being the Gaussian half-width. the second element being the Gaussian half-width.
start_at_c_chroma (`float`, *optional*, defaults to `True`): start_at_c_chroma (`float`, *optional*, defaults to `True`):
@ -733,7 +733,7 @@ def spectrogram_batch(
Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
Args: Args:
waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`): waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
The list of input waveforms, each a single-channel (mono) signal. The list of input waveforms, each a single-channel (mono) signal.
window (`np.ndarray` of shape `(frame_length,)`): window (`np.ndarray` of shape `(frame_length,)`):
The windowing function to apply, including zero-padding if necessary. The windowing function to apply, including zero-padding if necessary.
@ -775,7 +775,7 @@ def spectrogram_batch(
Data type of the output spectrogram. Data type of the output spectrogram.
Returns: Returns:
List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
""" """
window_length = len(window) window_length = len(window)

View File

@ -4,7 +4,7 @@ import json
import os import os
from collections.abc import Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Optional, Union
import torch import torch
from packaging import version from packaging import version
@ -28,7 +28,7 @@ def _static_cache_update(
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the static cache tensors in place. Updates the static cache tensors in place.
@ -41,7 +41,7 @@ def _static_cache_update(
If None, the entire cache is overwritten (prefill). If None, the entire cache is overwritten (prefill).
Returns: Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place). tuple[`torch.Tensor`, `torch.Tensor`]: The updated key and value cache tensors (modified in-place).
""" """
if cache_position is None: if cache_position is None:
# Prefill phase where seq_len potentially equals max_cache_len. Directly copy. # Prefill phase where seq_len potentially equals max_cache_len. Directly copy.
@ -67,7 +67,7 @@ def _sliding_cache_update(
value_states: torch.Tensor, value_states: torch.Tensor,
cache_position: torch.LongTensor, cache_position: torch.LongTensor,
max_cache_len: int, max_cache_len: int,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the sliding window cache tensors, returning the potentially modified tensors. Updates the sliding window cache tensors, returning the potentially modified tensors.
@ -80,7 +80,7 @@ def _sliding_cache_update(
max_cache_len (`int`): The maximum length of the sliding window cache. max_cache_len (`int`): The maximum length of the sliding window cache.
Returns: Returns:
Tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update. tuple[`torch.Tensor`, `torch.Tensor`]: The key and value tensors representing the cache state after the update.
For prefill > window, these are the full input states. For prefill > window, these are the full input states.
Otherwise, they are the updated cache tensors. Otherwise, they are the updated cache tensors.
""" """
@ -134,8 +134,8 @@ class Cache:
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -146,7 +146,7 @@ class Cache:
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. These are specific to each subclass and allow new types of Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
cache to be created. cache to be created.
@ -222,7 +222,7 @@ class CacheConfig:
""" """
Constructs a CacheConfig instance from a dictionary of parameters. Constructs a CacheConfig instance from a dictionary of parameters.
Args: Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters. config_dict (dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values. **kwargs: Additional keyword arguments to override dictionary values.
Returns: Returns:
@ -257,10 +257,10 @@ class CacheConfig:
writer.write(json_string) writer.write(json_string)
# Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Returns: Serializes this instance to a Python dictionary. Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
return copy.deepcopy(self.__dict__) return copy.deepcopy(self.__dict__)
@ -289,11 +289,11 @@ class CacheConfig:
returning all the unused kwargs. returning all the unused kwargs.
Args: Args:
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Dictionary of attributes to tentatively update this class. Dictionary of attributes to tentatively update this class.
Returns: Returns:
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
""" """
to_remove = [] to_remove = []
for key, value in kwargs.items(): for key, value in kwargs.items():
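A small usage sketch of the `to_dict`/`update` helpers documented above, using the quantized config as the concrete subclass; the constructor keywords are assumed from the attributes read elsewhere in this diff (`nbits`, `residual_length`):

```python
from transformers import QuantizedCacheConfig

config = QuantizedCacheConfig(nbits=4, residual_length=128)
as_dict = config.to_dict()                         # deep copy of the instance attributes
unused = config.update(residual_length=64, not_an_attribute=1)
print(config.residual_length, unused)              # 64 {'not_an_attribute': 1}
```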
@ -473,8 +473,8 @@ class DynamicCache(Cache):
def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None: def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__() super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
# `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121 # `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121
# and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the # and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the
@ -487,7 +487,7 @@ class DynamicCache(Cache):
self.key_cache.append(key_states) self.key_cache.append(key_states)
self.value_cache.append(value_states) self.value_cache.append(value_states)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length. sequence length.
@ -517,8 +517,8 @@ class DynamicCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
@ -529,7 +529,7 @@ class DynamicCache(Cache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
Return: Return:
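A minimal usage sketch of the `update` API documented above (the shapes are illustrative: batch, num_heads, seq_len, head_dim):

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
key_states = torch.randn(1, 8, 5, 64)
value_states = torch.randn(1, 8, 5, 64)

keys, values = cache.update(key_states, value_states, layer_idx=0)
print(keys.shape)               # torch.Size([1, 8, 5, 64])
print(cache.get_seq_length(0))  # 5
```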
@ -574,7 +574,7 @@ class DynamicCache(Cache):
"""Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length.""" """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
return None return None
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]:
"""Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for
backward compatibility.""" backward compatibility."""
legacy_cache = () legacy_cache = ()
@ -584,7 +584,7 @@ class DynamicCache(Cache):
@classmethod @classmethod
def from_legacy_cache( def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor, torch.FloatTensor]]] = None cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor, torch.FloatTensor]]] = None
) -> "DynamicCache": ) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
backward compatibility.""" backward compatibility."""
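The legacy format referenced here is the old tuple-of-tuples cache, one `(key, value)` pair per layer; `to_legacy_cache` and `from_legacy_cache` round-trip between the two representations. A short sketch:

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(1, 8, 5, 64), torch.randn(1, 8, 5, 64), layer_idx=0)

legacy = cache.to_legacy_cache()                # tuple of (key, value) pairs, one per layer
restored = DynamicCache.from_legacy_cache(legacy)
print(len(legacy), restored.get_seq_length(0))  # 1 5
```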
@ -611,7 +611,7 @@ class DynamicCache(Cache):
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]
def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]: def batch_split(self, full_batch_size: int, split_size: int) -> list["DynamicCache"]:
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`""" `_split_model_inputs()` in `generation.utils`"""
out = [] out = []
@ -624,7 +624,7 @@ class DynamicCache(Cache):
return out return out
@classmethod @classmethod
def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache": def from_batch_splits(cls, splits: list["DynamicCache"]) -> "DynamicCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`""" `generation.utils`"""
cache = cls() cache = cls()
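`batch_split` and `from_batch_splits` are inverses used by the generation utilities; splitting along the batch dimension and re-merging is expected to recover the original cache. A compact sketch continuing the example above:

```python
import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(4, 8, 5, 64), torch.randn(4, 8, 5, 64), layer_idx=0)

splits = cache.batch_split(full_batch_size=4, split_size=2)   # -> list[DynamicCache] of length 2
merged = DynamicCache.from_batch_splits(splits)
print(len(splits), merged.key_cache[0].shape)                 # 2 torch.Size([4, 8, 5, 64])
```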
@ -762,7 +762,7 @@ class OffloadedCache(DynamicCache):
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True) self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True) self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer." "Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self): if layer_idx < len(self):
# Evict the previous layer if necessary # Evict the previous layer if necessary
@ -799,8 +799,8 @@ class OffloadedCache(DynamicCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters: Parameters:
@ -810,7 +810,7 @@ class OffloadedCache(DynamicCache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`. Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`.
Return: Return:
A tuple containing the updated key and value states. A tuple containing the updated key and value states.
@ -857,8 +857,8 @@ class QuantizedCache(DynamicCache):
def __init__(self, cache_config: QuantizedCacheConfig) -> None: def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__() super().__init__()
self._quantized_key_cache: List[torch.Tensor] = [] self._quantized_key_cache: list[torch.Tensor] = []
self._quantized_value_cache: List[torch.Tensor] = [] self._quantized_value_cache: list[torch.Tensor] = []
self.nbits = cache_config.nbits self.nbits = cache_config.nbits
self.residual_length = cache_config.residual_length self.residual_length = cache_config.residual_length
@ -875,8 +875,8 @@ class QuantizedCache(DynamicCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# Update the number of seen tokens # Update the number of seen tokens
if layer_idx == 0: if layer_idx == 0:
self._seen_tokens += key_states.shape[-2] self._seen_tokens += key_states.shape[-2]
@ -1094,7 +1094,7 @@ class StaticCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1129,7 +1129,7 @@ class StaticCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.max_batch_size = max_batch_size self.max_batch_size = max_batch_size
@ -1145,8 +1145,8 @@ class StaticCache(Cache):
else config.num_key_value_heads else config.num_key_value_heads
) )
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
# Note: There will be significant perf decrease if switching to use 5D tensors instead. # Note: There will be significant perf decrease if switching to use 5D tensors instead.
cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
device = torch.device(device) if device is not None else None device = torch.device(device) if device is not None else None
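Based on the constructor keywords visible in this hunk, a `StaticCache` pre-allocates one key and one value tensor per layer with shape `(max_batch_size, num_key_value_heads, max_cache_len, head_dim)`. A small sketch using a toy config (the keyword names are assumed from the hunk above):

```python
import torch
from transformers import LlamaConfig, StaticCache

# Tiny illustrative config: head_dim = hidden_size / num_attention_heads = 32.
config = LlamaConfig(num_hidden_layers=2, hidden_size=256, num_attention_heads=8, num_key_value_heads=8)
cache = StaticCache(config=config, max_batch_size=1, max_cache_len=256, device="cpu", dtype=torch.float16)

print(len(cache.key_cache))      # one pre-allocated tensor per layer -> 2
print(cache.key_cache[0].shape)  # torch.Size([1, 8, 256, 32])
```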
@ -1169,8 +1169,8 @@ class StaticCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device. It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -1182,7 +1182,7 @@ class StaticCache(Cache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`): cache_kwargs (`dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input
to know how where to write in the cache. to know how where to write in the cache.
@ -1260,7 +1260,7 @@ class SlidingWindowCache(StaticCache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is split between different gpus. You can know which layers mapped to which device by and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1294,7 +1294,7 @@ class SlidingWindowCache(StaticCache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
raise ValueError( raise ValueError(
@ -1318,8 +1318,8 @@ class SlidingWindowCache(StaticCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
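For orientation, the sliding-window behaviour this class implements can be illustrated with a toy helper: once more tokens arrive than the window holds, only the most recent positions are kept. This is a deliberate simplification, not the in-place roll/scatter logic of `SlidingWindowCache` itself:

```python
# Toy illustration of sliding-window semantics (simplified; the real cache
# updates a fixed-size buffer in place instead of concatenating).
import torch

def sliding_update(cache: torch.Tensor, new_kv: torch.Tensor, window: int) -> torch.Tensor:
    # cache, new_kv: (batch, heads, seq, head_dim); keep only the last `window` positions
    full = torch.cat([cache, new_kv], dim=-2)
    return full[..., -window:, :]

cache = torch.zeros(1, 2, 4, 8)
cache = sliding_update(cache, torch.randn(1, 2, 3, 8), window=4)  # shape stays (1, 2, 4, 8)
```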
@ -1400,7 +1400,7 @@ class EncoderDecoderCache(Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)): for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0) self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)
def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length. sequence length.
@ -1422,7 +1422,7 @@ class EncoderDecoderCache(Cache):
""" """
return len(self.self_attention_cache) return len(self.self_attention_cache)
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor]]: def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]:
"""Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format."""
legacy_cache = () legacy_cache = ()
if len(self.cross_attention_cache) > 0: if len(self.cross_attention_cache) > 0:
@ -1436,7 +1436,7 @@ class EncoderDecoderCache(Cache):
@classmethod @classmethod
def from_legacy_cache( def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cls, past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
) -> "EncoderDecoderCache": ) -> "EncoderDecoderCache":
"""Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`."""
cache = cls( cache = cls(
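The legacy format converted here is a per-layer tuple of tensors; for encoder-decoder models each layer carries self-attention and cross-attention key/value pairs. A hedged sketch of the round trip (shapes are arbitrary):

```python
import torch
from transformers import EncoderDecoderCache

# One entry per layer: (self_attn_key, self_attn_value, cross_attn_key, cross_attn_value)
legacy = tuple(tuple(torch.zeros(1, 2, 3, 4) for _ in range(4)) for _ in range(2))

cache = EncoderDecoderCache.from_legacy_cache(legacy)
k, v, cross_k, cross_v = cache[0]    # backwards-compatible per-layer indexing
roundtrip = cache.to_legacy_cache()  # back to the tuple-of-tuples layout
```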
@ -1495,7 +1495,7 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__) self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length) self.self_attention_cache.crop(maximum_length)
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": def batch_split(self, full_batch_size: int, split_size: int) -> "list[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`""" `_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__) self.check_dynamic_cache(self.batch_split.__name__)
@ -1508,7 +1508,7 @@ class EncoderDecoderCache(Cache):
return out return out
@classmethod @classmethod
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": def from_batch_splits(cls, splits: list["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`""" `generation.utils`"""
self_attention_cache = DynamicCache() self_attention_cache = DynamicCache()
@ -1569,7 +1569,7 @@ class HybridCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.float32`): dtype (torch.dtype, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1603,7 +1603,7 @@ class HybridCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.float32, dtype: torch.dtype = torch.float32,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1634,8 +1634,8 @@ class HybridCache(Cache):
else: else:
self.is_sliding = [False] * config.num_hidden_layers self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim) sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim)
self.sliding_window = min(config.sliding_window, max_cache_len) self.sliding_window = min(config.sliding_window, max_cache_len)
@ -1660,8 +1660,8 @@ class HybridCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
@ -1757,7 +1757,7 @@ class HybridChunkedCache(Cache):
should pass the `layer_device_map` argument instead. should pass the `layer_device_map` argument instead.
dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`): dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): layer_device_map (`Optional[dict[int, Union[str, torch.device, int]]]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -1791,7 +1791,7 @@ class HybridChunkedCache(Cache):
max_cache_len: Optional[int] = None, max_cache_len: Optional[int] = None,
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super().__init__() super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None: if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1811,8 +1811,8 @@ class HybridChunkedCache(Cache):
else: else:
self.is_sliding = [False] * config.num_hidden_layers self.is_sliding = [False] * config.num_hidden_layers
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
self.cumulative_length = [0 for _ in range(config.num_hidden_layers)] self.cumulative_length = [0 for _ in range(config.num_hidden_layers)]
def initialise_cache_layer(self, layer_idx, key_states): def initialise_cache_layer(self, layer_idx, key_states):
@ -1880,8 +1880,8 @@ class HybridChunkedCache(Cache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if cache_kwargs is None: if cache_kwargs is None:
cache_kwargs = {} cache_kwargs = {}
cache_position = cache_kwargs.get("cache_position") cache_position = cache_kwargs.get("cache_position")
@ -1968,7 +1968,7 @@ class OffloadedHybridCache(HybridChunkedCache):
device: Union[torch.device, str, None] = None, device: Union[torch.device, str, None] = None,
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
offload_device: Union[str, torch.device] = torch.device("cpu"), offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
): ):
super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map) super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map)
@ -2121,8 +2121,8 @@ class MambaCache:
self.ssm_state_size = config.state_size self.ssm_state_size = config.state_size
self.conv_kernel_size = config.conv_kernel self.conv_kernel_size = config.conv_kernel
self.conv_states: List[torch.Tensor] = [] self.conv_states: list[torch.Tensor] = []
self.ssm_states: List[torch.Tensor] = [] self.ssm_states: list[torch.Tensor] = []
device = torch.device(device) if device is not None else None device = torch.device(device) if device is not None else None
for _ in range(config.num_hidden_layers): for _ in range(config.num_hidden_layers):
conv_state: torch.Tensor = torch.zeros( conv_state: torch.Tensor = torch.zeros(
@ -2193,7 +2193,7 @@ class OffloadedStaticCache(StaticCache):
The default `dtype` to use when initializing the cache. The default `dtype` to use when initializing the cache.
offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`): offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`):
The device to offload to. Defaults to CPU. The device to offload to. Defaults to CPU.
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*): layer_device_map (`dict[int, Union[str, torch.device, int]]`, *optional*):
Mapping between the layers and their devices. This is required when you are manually initializing the cache Mapping between the layers and their devices. This is required when you are manually initializing the cache
and the model is split between different GPUs. You can see which layer is mapped to which device by and the model is split between different GPUs. You can see which layer is mapped to which device by
checking the associated device_map: `model.hf_device_map`. checking the associated device_map: `model.hf_device_map`.
@ -2227,7 +2227,7 @@ class OffloadedStaticCache(StaticCache):
device: Union[str, torch.device], device: Union[str, torch.device],
dtype: Optional[torch.dtype] = None, dtype: Optional[torch.dtype] = None,
offload_device: Union[str, torch.device] = torch.device("cpu"), offload_device: Union[str, torch.device] = torch.device("cpu"),
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
) -> None: ) -> None:
super(Cache, self).__init__() super(Cache, self).__init__()
@ -2255,8 +2255,8 @@ class OffloadedStaticCache(StaticCache):
cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim) cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim)
# Create offloaded CPU tensors. # Create offloaded CPU tensors.
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
for i in range(config.num_hidden_layers): for i in range(config.num_hidden_layers):
# First layer is always on-device. # First layer is always on-device.
@ -2268,8 +2268,8 @@ class OffloadedStaticCache(StaticCache):
self.value_cache.append(value_cache) self.value_cache.append(value_cache)
# Create device tensors. # Create device tensors.
self._device_key_cache: List[torch.Tensor] = [] self._device_key_cache: list[torch.Tensor] = []
self._device_value_cache: List[torch.Tensor] = [] self._device_value_cache: list[torch.Tensor] = []
for i in range(2): for i in range(2):
key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device) key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device)
@ -2289,8 +2289,8 @@ class OffloadedStaticCache(StaticCache):
key_states: torch.Tensor, key_states: torch.Tensor,
value_states: torch.Tensor, value_states: torch.Tensor,
layer_idx: int, layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None, cache_kwargs: Optional[dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
It is VERY important to index using a tensor, otherwise you introduce a copy to the device. It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
@ -2302,7 +2302,7 @@ class OffloadedStaticCache(StaticCache):
The new value states to cache. The new value states to cache.
layer_idx (`int`): layer_idx (`int`):
The index of the layer to cache the states for. The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, *optional*): cache_kwargs (`dict[str, Any]`, *optional*):
Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the
`cache_position` input to know where to write in the cache. `cache_position` input to know where to write in the cache.
@ -2401,13 +2401,13 @@ class OffloadedStaticCache(StaticCache):
return self._seen_tokens return self._seen_tokens
def _create_key_value_cache_tensors( def _create_key_value_cache_tensors(
self, shape: Tuple[int, ...], device: torch.device self, shape: tuple[int, ...], device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
"""Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static """Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static
addresses for non-CPU tensors. addresses for non-CPU tensors.
Args: Args:
shape (`Tuple[int, ...]`): Shape. shape (`tuple[int, ...]`): Shape.
device (`torch.device`): Device. device (`torch.device`): Device.
Returns: Returns:
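The `_create_key_value_cache_tensors` docstring is cut off at this file boundary, but the behaviour it names (pinning CPU tensors so host-to-device copies can run asynchronously) can be sketched with plain `torch`; the shape is a placeholder:

```python
# Hedged sketch: a CPU-side cache tensor is pinned (when an accelerator is
# available) so later non_blocking copies can overlap with compute.
import torch

shape = (1, 8, 128, 64)
cpu_key_cache = torch.zeros(shape, pin_memory=torch.cuda.is_available())

if torch.cuda.is_available():
    device_key_cache = torch.zeros(shape, device="cuda")
    device_key_cache.copy_(cpu_key_cache, non_blocking=True)  # asynchronous H2D copy
```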

View File

@ -23,7 +23,7 @@ from datetime import date
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from re import Pattern from re import Pattern
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Optional, Union
import yaml import yaml
@ -148,7 +148,7 @@ def find_indent(line: str) -> int:
return len(search.groups()[0]) return len(search.groups()[0])
def parse_module_content(content: str) -> List[str]: def parse_module_content(content: str) -> list[str]:
""" """
Parse the content of a module into the list of objects it defines. Parse the content of a module into the list of objects it defines.
@ -156,7 +156,7 @@ def parse_module_content(content: str) -> List[str]:
content (`str`): The content to parse content (`str`): The content to parse
Returns: Returns:
`List[str]`: The list of objects defined in the module. `list[str]`: The list of objects defined in the module.
""" """
objects = [] objects = []
current_object = [] current_object = []
@ -336,7 +336,7 @@ def add_content_to_file(
def replace_model_patterns( def replace_model_patterns(
text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns
) -> Tuple[str, str]: ) -> tuple[str, str]:
""" """
Replace all patterns present in a given text. Replace all patterns present in a given text.
@ -414,10 +414,10 @@ def simplify_replacements(replacements):
"BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed. "BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed.
Args: Args:
replacements (`List[Tuple[str, str]]`): List of patterns (old, new) replacements (`list[tuple[str, str]]`): List of patterns (old, new)
Returns: Returns:
`List[Tuple[str, str]]`: The list of patterns simplified. `list[tuple[str, str]]`: The list of patterns simplified.
""" """
if len(replacements) <= 1: if len(replacements) <= 1:
# Nothing to simplify # Nothing to simplify
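Using the names from the docstring above, a quick illustration of what "simplified" means; the result shown is inferred from that docstring rather than from running the function:

```python
# Illustration only: the second pair is implied by the first, so it can be dropped.
replacements = [("Bert", "BertNew"), ("BertConfig", "BertNewConfig")]
simplified = [("Bert", "BertNew")]  # what simplify_replacements is described as returning
```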
@ -519,7 +519,7 @@ def duplicate_module(
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
dest_file: Optional[str] = None, dest_file: Optional[str] = None,
add_copied_from: bool = True, add_copied_from: bool = True,
attrs_to_remove: Optional[List[str]] = None, attrs_to_remove: Optional[list[str]] = None,
): ):
""" """
Create a new module from an existing one, adapting all function and class names from old patterns to new ones. Create a new module from an existing one, adapting all function and class names from old patterns to new ones.
@ -585,17 +585,17 @@ def duplicate_module(
def filter_framework_files( def filter_framework_files(
files: List[Union[str, os.PathLike]], frameworks: Optional[List[str]] = None files: list[Union[str, os.PathLike]], frameworks: Optional[list[str]] = None
) -> List[Union[str, os.PathLike]]: ) -> list[Union[str, os.PathLike]]:
""" """
Filter a list of files to only keep the ones corresponding to a list of frameworks. Filter a list of files to only keep the ones corresponding to a list of frameworks.
Args: Args:
files (`List[Union[str, os.PathLike]]`): The list of files to filter. files (`list[Union[str, os.PathLike]]`): The list of files to filter.
frameworks (`List[str]`, *optional*): The list of allowed frameworks. frameworks (`list[str]`, *optional*): The list of allowed frameworks.
Returns: Returns:
`List[Union[str, os.PathLike]]`: The list of filtered files. `list[Union[str, os.PathLike]]`: The list of filtered files.
""" """
if frameworks is None: if frameworks is None:
frameworks = get_default_frameworks() frameworks = get_default_frameworks()
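A small usage sketch for `filter_framework_files`; the file names are invented and the expected result is read off the docstring rather than executed:

```python
# Hypothetical file list: only the PyTorch-specific file plus framework-agnostic
# files should survive filtering with frameworks=["pt"].
files = [
    "modeling_bert.py",       # pt
    "modeling_tf_bert.py",    # tf
    "modeling_flax_bert.py",  # flax
    "configuration_bert.py",  # framework-agnostic
]
# filter_framework_files(files, frameworks=["pt"])
# -> ["modeling_bert.py", "configuration_bert.py"] (ordering may differ)
```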
@ -617,17 +617,17 @@ def filter_framework_files(
return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others
def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, Union[Path, List[Path]]]: def get_model_files(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, Union[Path, list[Path]]]:
""" """
Retrieves all the files associated to a model. Retrieves all the files associated to a model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the model files corresponding to the passed frameworks. If passed, will only keep the model files corresponding to the passed frameworks.
Returns: Returns:
`Dict[str, Union[Path, List[Path]]]`: A dictionary with the following keys: `dict[str, Union[Path, list[Path]]]`: A dictionary with the following keys:
- **doc_file** -- The documentation file for the model. - **doc_file** -- The documentation file for the model.
- **model_files** -- All the files in the model module. - **model_files** -- All the files in the model module.
- **test_files** -- The test files for the model. - **test_files** -- The test files for the model.
@ -663,14 +663,14 @@ _re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", fla
def find_base_model_checkpoint( def find_base_model_checkpoint(
model_type: str, model_files: Optional[Dict[str, Union[Path, List[Path]]]] = None model_type: str, model_files: Optional[dict[str, Union[Path, list[Path]]]] = None
) -> str: ) -> str:
""" """
Finds the model checkpoint used in the docstrings for a given model. Finds the model checkpoint used in the docstrings for a given model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
model_files (`Dict[str, Union[Path, List[Path]]]`, *optional*): model_files (`dict[str, Union[Path, list[Path]]]`, *optional*):
The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed. The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed.
Returns: Returns:
@ -713,18 +713,18 @@ def get_default_frameworks():
_re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES") _re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES")
def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, List[str]]: def retrieve_model_classes(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, list[str]]:
""" """
Retrieve the model classes associated to a given model. Retrieve the model classes associated to a given model.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict
the classes returned. the classes returned.
Returns: Returns:
`Dict[str, List[str]]`: A dictionary with one key per framework and the list of model classes associated to `dict[str, list[str]]`: A dictionary with one key per framework and the list of model classes associated to
that framework as values. that framework as values.
""" """
if frameworks is None: if frameworks is None:
@ -754,20 +754,20 @@ def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = No
return model_classes return model_classes
def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None): def retrieve_info_for_model(model_type, frameworks: Optional[list[str]] = None):
""" """
Retrieves all the information from a given model_type. Retrieves all the information from a given model_type.
Args: Args:
model_type (`str`): A valid model type (like "bert" or "gpt2") model_type (`str`): A valid model type (like "bert" or "gpt2")
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the info corresponding to the passed frameworks. If passed, will only keep the info corresponding to the passed frameworks.
Returns: Returns:
`Dict`: A dictionary with the following keys: `Dict`: A dictionary with the following keys:
- **frameworks** (`List[str]`): The list of frameworks that back this model type. - **frameworks** (`list[str]`): The list of frameworks that back this model type.
- **model_classes** (`Dict[str, List[str]]`): The model classes implemented for that model type. - **model_classes** (`dict[str, list[str]]`): The model classes implemented for that model type.
- **model_files** (`Dict[str, Union[Path, List[Path]]]`): The files associated with that model type. - **model_files** (`dict[str, Union[Path, list[Path]]]`): The files associated with that model type.
- **model_patterns** (`ModelPatterns`): The various patterns for the model. - **model_patterns** (`ModelPatterns`): The various patterns for the model.
""" """
if model_type not in auto_module.MODEL_NAMES_MAPPING: if model_type not in auto_module.MODEL_NAMES_MAPPING:
@ -833,7 +833,7 @@ def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None):
def clean_frameworks_in_init( def clean_frameworks_in_init(
init_file: Union[str, os.PathLike], frameworks: Optional[List[str]] = None, keep_processing: bool = True init_file: Union[str, os.PathLike], frameworks: Optional[list[str]] = None, keep_processing: bool = True
): ):
""" """
Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature
@ -841,7 +841,7 @@ def clean_frameworks_in_init(
Args: Args:
init_file (`str` or `os.PathLike`): The path to the init to treat. init_file (`str` or `os.PathLike`): The path to the init to treat.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, this will remove all imports that are subject to a framework not in frameworks If passed, this will remove all imports that are subject to a framework not in frameworks
keep_processing (`bool`, *optional*, defaults to `True`): keep_processing (`bool`, *optional*, defaults to `True`):
Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports
@ -914,7 +914,7 @@ def clean_frameworks_in_init(
def add_model_to_main_init( def add_model_to_main_init(
old_model_patterns: ModelPatterns, old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
with_processing: bool = True, with_processing: bool = True,
): ):
""" """
@ -923,7 +923,7 @@ def add_model_to_main_init(
Args: Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model. old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If specified, only the models implemented in those frameworks will be added. If specified, only the models implemented in those frameworks will be added.
with_processing (`bool`, *optional*, defaults to `True`): with_processing (`bool`, *optional*, defaults to `True`):
Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not. Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
@ -1068,7 +1068,7 @@ AUTO_CLASSES_PATTERNS = {
def add_model_to_auto_classes( def add_model_to_auto_classes(
old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: Dict[str, List[str]] old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: dict[str, list[str]]
): ):
""" """
Add a model to the relevant mappings in the auto module. Add a model to the relevant mappings in the auto module.
@ -1076,7 +1076,7 @@ def add_model_to_auto_classes(
Args: Args:
old_model_patterns (`ModelPatterns`): The patterns for the old model. old_model_patterns (`ModelPatterns`): The patterns for the old model.
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
model_classes (`Dict[str, List[str]]`): A dictionary mapping each framework to the list of model classes implemented for it. model_classes (`dict[str, list[str]]`): A dictionary mapping each framework to the list of model classes implemented for it.
""" """
for filename in AUTO_CLASSES_PATTERNS: for filename in AUTO_CLASSES_PATTERNS:
# Extend patterns with all model classes if necessary # Extend patterns with all model classes if necessary
@ -1169,7 +1169,7 @@ def duplicate_doc_file(
old_model_patterns: ModelPatterns, old_model_patterns: ModelPatterns,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
dest_file: Optional[Union[str, os.PathLike]] = None, dest_file: Optional[Union[str, os.PathLike]] = None,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
): ):
""" """
Duplicate a documentation file and adapt it for a new model. Duplicate a documentation file and adapt it for a new model.
@ -1180,7 +1180,7 @@ def duplicate_doc_file(
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file. dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file.
Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`. Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file. If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file.
""" """
with open(doc_file, "r", encoding="utf-8") as f: with open(doc_file, "r", encoding="utf-8") as f:
@ -1320,7 +1320,7 @@ def create_new_model_like(
model_type: str, model_type: str,
new_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns,
add_copied_from: bool = True, add_copied_from: bool = True,
frameworks: Optional[List[str]] = None, frameworks: Optional[list[str]] = None,
old_checkpoint: Optional[str] = None, old_checkpoint: Optional[str] = None,
create_fast_image_processor: bool = False, create_fast_image_processor: bool = False,
): ):
@ -1332,7 +1332,7 @@ def create_new_model_like(
new_model_patterns (`ModelPatterns`): The patterns for the new model. new_model_patterns (`ModelPatterns`): The patterns for the new model.
add_copied_from (`bool`, *optional*, defaults to `True`): add_copied_from (`bool`, *optional*, defaults to `True`):
Whether or not to add "Copied from" statements to all classes in the new model modeling files. Whether or not to add "Copied from" statements to all classes in the new model modeling files.
frameworks (`List[str]`, *optional*): frameworks (`list[str]`, *optional*):
If passed, will limit the duplicate to the frameworks specified. If passed, will limit the duplicate to the frameworks specified.
old_checkpoint (`str`, *optional*): old_checkpoint (`str`, *optional*):
The name of the base checkpoint for the old model. Should be passed along when it can't be automatically The name of the base checkpoint for the old model. Should be passed along when it can't be automatically

View File

@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from argparse import ArgumentParser, Namespace from argparse import ArgumentParser, Namespace
from typing import Any, List, Optional from typing import Any, Optional
from ..pipelines import Pipeline, get_supported_tasks, pipeline from ..pipelines import Pipeline, get_supported_tasks, pipeline
from ..utils import logging from ..utils import logging
@ -69,8 +69,8 @@ class ServeTokenizeResult(BaseModel):
Tokenize result model Tokenize result model
""" """
tokens: List[str] tokens: list[str]
tokens_ids: Optional[List[int]] tokens_ids: Optional[list[int]]
class ServeDeTokenizeResult(BaseModel): class ServeDeTokenizeResult(BaseModel):
@ -196,7 +196,7 @@ class ServeCommand(BaseTransformersCLICommand):
def detokenize( def detokenize(
self, self,
tokens_ids: List[int] = Body(None, embed=True), tokens_ids: list[int] = Body(None, embed=True),
skip_special_tokens: bool = Body(False, embed=True), skip_special_tokens: bool = Body(False, embed=True),
cleanup_tokenization_spaces: bool = Body(True, embed=True), cleanup_tokenization_spaces: bool = Body(True, embed=True),
): ):

View File

@ -63,13 +63,13 @@ class PretrainedConfig(PushToHubMixin):
Some configurations require inputs to be defined at init and have no default values; these are usually (but not necessarily) Some configurations require inputs to be defined at init and have no default values; these are usually (but not necessarily)
composite configs, such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from composite configs, such as [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`]. They have to be initialized from
two or more configs of type [`~transformers.PretrainedConfig`]. two or more configs of type [`~transformers.PretrainedConfig`].
- **keys_to_ignore_at_inference** (`List[str]`) -- A list of keys to ignore by default when looking at dictionary - **keys_to_ignore_at_inference** (`list[str]`) -- A list of keys to ignore by default when looking at dictionary
outputs of the model during inference. outputs of the model during inference.
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized - **attribute_map** (`dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
naming of attributes. naming of attributes.
- **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor - **base_model_tp_plan** (`dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor
parallel plan applied to the sub-module when `model.tensor_parallel` is called. parallel plan applied to the sub-module when `model.tensor_parallel` is called.
- **base_model_pp_plan** (`Dict[str, Tuple[List[str]]]`) -- A dict that maps child-modules of a base model to a - **base_model_pp_plan** (`dict[str, tuple[list[str]]]`) -- A dict that maps child-modules of a base model to a
pipeline parallel plan that enables users to place the child-module on the appropriate device. pipeline parallel plan that enables users to place the child-module on the appropriate device.
Common attributes (present in all subclasses): Common attributes (present in all subclasses):
@ -115,7 +115,7 @@ class PretrainedConfig(PushToHubMixin):
tie_encoder_decoder (`bool`, *optional*, defaults to `False`): tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
and decoder model to have the exact same parameter names. and decoder model to have the exact same parameter names.
prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`): prune_heads (`dict[int, list[int]]`, *optional*, defaults to `{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
heads to prune in said layer. heads to prune in said layer.
@ -128,17 +128,17 @@ class PretrainedConfig(PushToHubMixin):
> Parameters for fine-tuning tasks > Parameters for fine-tuning tasks
architectures (`List[str]`, *optional*): architectures (`list[str]`, *optional*):
Model architectures that can be used with the model pretrained weights. Model architectures that can be used with the model pretrained weights.
finetuning_task (`str`, *optional*): finetuning_task (`str`, *optional*):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
or PyTorch) checkpoint. or PyTorch) checkpoint.
id2label (`Dict[int, str]`, *optional*): id2label (`dict[int, str]`, *optional*):
A map from index (for instance prediction index, or target index) to label. A map from index (for instance prediction index, or target index) to label.
label2id (`Dict[str, int]`, *optional*): A map from label to index for the model. label2id (`dict[str, int]`, *optional*): A map from label to index for the model.
num_labels (`int`, *optional*): num_labels (`int`, *optional*):
Number of labels to use in the last layer added to the model, typically for a classification task. Number of labels to use in the last layer added to the model, typically for a classification task.
task_specific_params (`Dict[str, Any]`, *optional*): task_specific_params (`dict[str, Any]`, *optional*):
Additional keyword arguments to store for the current task. Additional keyword arguments to store for the current task.
problem_type (`str`, *optional*): problem_type (`str`, *optional*):
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`, Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
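A minimal example of the fine-tuning label mappings described in this hunk, instantiating the base `PretrainedConfig` directly just for illustration:

```python
from transformers import PretrainedConfig

config = PretrainedConfig(
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
print(config.num_labels)  # 2, derived from id2label
```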
@ -394,7 +394,7 @@ class PretrainedConfig(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
self._set_token_in_kwargs(kwargs) self._set_token_in_kwargs(kwargs)
@ -505,7 +505,7 @@ class PretrainedConfig(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -531,7 +531,7 @@ class PretrainedConfig(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter. by the `return_unused_kwargs` keyword parameter.
@ -599,7 +599,7 @@ class PretrainedConfig(PushToHubMixin):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns: Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
""" """
cls._set_token_in_kwargs(kwargs) cls._set_token_in_kwargs(kwargs)
@ -723,10 +723,10 @@ class PretrainedConfig(PushToHubMixin):
Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters. Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
Args: Args:
config_dict (`Dict[str, Any]`): config_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method. retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the configuration object. Additional parameters from which to initialize the configuration object.
Returns: Returns:
@ -816,7 +816,7 @@ class PretrainedConfig(PushToHubMixin):
Python dictionary. Python dictionary.
Returns: Returns:
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance. dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
""" """
config_dict = self.to_dict() config_dict = self.to_dict()
@ -874,7 +874,7 @@ class PretrainedConfig(PushToHubMixin):
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"): if hasattr(self.__class__, "model_type"):
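For context, the serialization round trip these methods support, sketched on the base class with an arbitrary attribute:

```python
from transformers import PretrainedConfig

config = PretrainedConfig(hidden_size=128)
as_dict = config.to_dict()                      # plain dict[str, Any], JSON-serializable
restored = PretrainedConfig.from_dict(as_dict)  # equivalent config instance
```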
@ -940,7 +940,7 @@ class PretrainedConfig(PushToHubMixin):
Updates attributes of this class with attributes from `config_dict`. Updates attributes of this class with attributes from `config_dict`.
Args: Args:
config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. config_dict (`dict[str, Any]`): Dictionary of attributes that should be updated for this class.
""" """
for key, value in config_dict.items(): for key, value in config_dict.items():
setattr(self, key, value) setattr(self, key, value)
@ -1163,7 +1163,7 @@ def get_configuration_file(configuration_files: list[str]) -> str:
Get the configuration file to use for this version of transformers. Get the configuration file to use for this version of transformers.
Args: Args:
configuration_files (`List[str]`): The list of available configuration files. configuration_files (`list[str]`): The list of available configuration files.
Returns: Returns:
`str`: The configuration file to use. `str`: The configuration file to use.

View File

@ -18,7 +18,7 @@ import warnings
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from random import randint from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union from typing import Any, Callable, NewType, Optional, Union
import numpy as np import numpy as np
@ -33,7 +33,7 @@ InputDataClass = NewType("InputDataClass", Any)
A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays. of PyTorch/TensorFlow tensors or NumPy arrays.
""" """
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]]) DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])
class DataCollatorMixin: class DataCollatorMixin:
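The `DataCollator` alias above names a contract rather than a class: any callable from a list of per-example dicts to a batched dict of tensors qualifies. A toy example that satisfies it:

```python
import torch

def toy_collator(features: list[dict]) -> dict:
    # Stack one key across examples into a single batch tensor.
    return {"input_ids": torch.tensor([f["input_ids"] for f in features])}

batch = toy_collator([{"input_ids": [1, 2]}, {"input_ids": [3, 4]}])
print(batch["input_ids"].shape)  # torch.Size([2, 2])
```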
@ -72,7 +72,7 @@ def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
return padded return padded
def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]: def default_data_collator(features: list[InputDataClass], return_tensors="pt") -> dict[str, Any]:
""" """
Very simple data collator that simply collates batches of dict-like objects and performs special handling for Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named: potential keys named:
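A short usage sketch for `default_data_collator`; the special handling mentioned above includes turning a per-example `label` key into a batched `labels` tensor:

```python
from transformers import default_data_collator

features = [
    {"input_ids": [1, 2, 3], "label": 0},
    {"input_ids": [4, 5, 6], "label": 1},
]
batch = default_data_collator(features)
# batch["input_ids"] -> tensor of shape (2, 3); batch["labels"] -> tensor([0, 1])
```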
@ -119,13 +119,13 @@ class DefaultDataCollator(DataCollatorMixin):
return_tensors: str = "pt" return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]: def __call__(self, features: list[dict[str, Any]], return_tensors=None) -> dict[str, Any]:
if return_tensors is None: if return_tensors is None:
return_tensors = self.return_tensors return_tensors = self.return_tensors
return default_data_collator(features, return_tensors) return default_data_collator(features, return_tensors)
def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def torch_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
import torch import torch
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
@ -161,7 +161,7 @@ def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any
return batch return batch
def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def tf_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
@ -202,7 +202,7 @@ def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
return batch return batch
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: def numpy_default_data_collator(features: list[InputDataClass]) -> dict[str, Any]:
if not isinstance(features[0], Mapping): if not isinstance(features[0], Mapping):
features = [vars(f) for f in features] features = [vars(f) for f in features]
first = features[0] first = features[0]
@ -268,7 +268,7 @@ class DataCollatorWithPadding:
pad_to_multiple_of: Optional[int] = None pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt" return_tensors: str = "pt"
def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
batch = pad_without_fast_tokenizer_warning( batch = pad_without_fast_tokenizer_warning(
self.tokenizer, self.tokenizer,
features, features,
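A usage sketch for `DataCollatorWithPadding`; the checkpoint name is only an example:

```python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

features = [tokenizer("short"), tokenizer("a noticeably longer example sentence")]
batch = collator(features)  # input_ids / attention_mask padded to a common length
```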
@ -569,7 +569,7 @@ class DataCollatorForMultipleChoice(DataCollatorMixin):
pad_to_multiple_of: Optional[int] = None pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt" return_tensors: str = "pt"
def torch_call(self, examples: List[Dict[str, Any]]): # Refactored implementation from the docs. def torch_call(self, examples: list[dict[str, Any]]): # Refactored implementation from the docs.
import torch import torch
# Take labels out of the examples beforehand, because they aren't nested. # Take labels out of the examples beforehand, because they aren't nested.
@ -911,7 +911,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
def tf_mask_tokens( def tf_mask_tokens(
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
) -> Tuple[Any, Any]: ) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -956,7 +956,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1002,7 +1002,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor. # Handle dict or lists with proper padding and conversion to tensor.
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1032,7 +1032,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -1081,7 +1081,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
# The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replace_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor. # Handle dict or lists with proper padding and conversion to tensor.
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1111,7 +1111,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
batch["labels"] = labels batch["labels"] = labels
return batch return batch
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
""" """
@ -1193,7 +1193,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
</Tip>""" </Tip>"""
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if self.seed and self.generator is None: if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator. # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG # If no seed supplied, we will use the global RNG
@ -1226,7 +1226,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask) inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels} return {"input_ids": inputs, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
import tensorflow as tf import tensorflow as tf
if self.seed and self.generator is None: if self.seed and self.generator is None:
@ -1261,7 +1261,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask) inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
return {"input_ids": inputs, "labels": labels} return {"input_ids": inputs, "labels": labels}
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if self.seed and self.generator is None: if self.seed and self.generator is None:
# If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator. # If we have a seed, we need to create a generator object. Subsequent calls to this function will use the same generator.
# If no seed supplied, we will use the global RNG # If no seed supplied, we will use the global RNG
@ -1318,7 +1318,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
self.generator.shuffle(cand_indexes) self.generator.shuffle(cand_indexes)
return cand_indexes return cand_indexes
def _whole_word_mask(self, input_tokens: List[str], max_predictions=512): def _whole_word_mask(self, input_tokens: list[str], max_predictions=512):
""" """
Get 0/1 labels for masked tokens with whole word mask proxy Get 0/1 labels for masked tokens with whole word mask proxy
""" """
@ -1358,7 +1358,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))] mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
return mask_labels return mask_labels
def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting
'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref. 'mask_labels' means we use whole word masking (wwm); we directly mask indices according to its ref.
@ -1414,7 +1414,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-random_replacement_prob-mask_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
@ -1474,7 +1474,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged # The rest of the time ((1-mask_replace_prob-random_replace_prob)% of the time) we keep the masked input tokens unchanged
return inputs, labels return inputs, labels
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> tuple[Any, Any]:
""" """
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
@ -1564,7 +1564,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
FutureWarning, FutureWarning,
) )
def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]: def __call__(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
import torch import torch
from torch.nn.utils.rnn import pad_sequence from torch.nn.utils.rnn import pad_sequence
@ -1587,7 +1587,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
"sentence_order_label": sentence_order_label, "sentence_order_label": sentence_order_label,
} }
def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]: def mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any]:
""" """
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet. original. N-gram not applied yet.
@ -1645,28 +1645,28 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
max_span_length: int = 5 # maximum length of a span of masked tokens max_span_length: int = 5 # maximum length of a span of masked tokens
return_tensors: str = "pt" return_tensors: str = "pt"
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _torch_collate_batch(examples, self.tokenizer) batch = _torch_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def tf_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _tf_collate_batch(examples, self.tokenizer) batch = _tf_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: def numpy_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
if isinstance(examples[0], Mapping): if isinstance(examples[0], Mapping):
examples = [e["input_ids"] for e in examples] examples = [e["input_ids"] for e in examples]
batch = _numpy_collate_batch(examples, self.tokenizer) batch = _numpy_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch) inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels} return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def torch_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
@ -1765,7 +1765,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
return inputs.long(), perm_mask, target_mapping, labels.long() return inputs.long(), perm_mask, target_mapping, labels.long()
def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def tf_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
@ -1872,7 +1872,7 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64) return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: def numpy_mask_tokens(self, inputs: Any) -> tuple[Any, Any, Any, Any]:
""" """
The masked tokens to be predicted for a particular sequence are determined by the following algorithm: The masked tokens to be predicted for a particular sequence are determined by the following algorithm:


@@ -17,7 +17,7 @@ import time
import warnings
from dataclasses import dataclass, field
from enum import Enum
-from typing import List, Optional, Union
+from typing import Optional, Union
import torch
from filelock import FileLock
@@ -75,7 +75,7 @@ class GlueDataset(Dataset):
args: GlueDataTrainingArguments
output_mode: str
-features: List[InputFeatures]
+features: list[InputFeatures]
def __init__(
self,


@@ -18,7 +18,7 @@ import pickle
import random
import time
import warnings
-from typing import Dict, List, Optional
+from typing import Optional
import torch
from filelock import FileLock
@@ -139,7 +139,7 @@ class LineByLineTextDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -187,7 +187,7 @@ class LineByLineWithRefDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -339,7 +339,7 @@ class LineByLineWithSOPTextDataset(Dataset):
def __len__(self):
return len(self.examples)
-def __getitem__(self, i) -> Dict[str, torch.tensor]:
+def __getitem__(self, i) -> dict[str, torch.tensor]:
return self.examples[i]
@@ -433,7 +433,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
)
-def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
+def create_examples_from_document(self, document: list[list[int]], doc_index: int, block_size: int):
"""Creates examples for a single document."""
max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)


@@ -16,7 +16,7 @@ import os
import time
from dataclasses import dataclass, field
from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
import torch
from filelock import FileLock
@@ -112,7 +112,7 @@ class SquadDataset(Dataset):
"""
args: SquadDataTrainingArguments
-features: List[SquadFeatures]
+features: list[SquadFeatures]
mode: Split
is_language_sensitive: bool
@@ -195,7 +195,7 @@ class SquadDataset(Dataset):
def __len__(self):
return len(self.features)
-def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+def __getitem__(self, i) -> dict[str, torch.Tensor]:
# Convert to Tensors and build dataset
feature = self.features[i]


@@ -19,7 +19,7 @@ import os
import warnings
from dataclasses import asdict
from enum import Enum
-from typing import List, Optional, Union
+from typing import Optional, Union
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_tf_available, logging
@@ -39,7 +39,7 @@ DEPRECATION_WARNING = (
def glue_convert_examples_to_features(
-examples: Union[List[InputExample], "tf.data.Dataset"],
+examples: Union[list[InputExample], "tf.data.Dataset"],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,
@@ -107,7 +107,7 @@ if is_tf_available():
def _glue_convert_examples_to_features(
-examples: List[InputExample],
+examples: list[InputExample],
tokenizer: PreTrainedTokenizer,
max_length: Optional[int] = None,
task=None,


@@ -18,7 +18,7 @@ import csv
import dataclasses
import json
from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import Optional, Union
from ...utils import is_tf_available, is_torch_available, logging
@@ -67,9 +67,9 @@ class InputFeatures:
float for regression problems.
"""
-input_ids: List[int]
-attention_mask: Optional[List[int]] = None
-token_type_ids: Optional[List[int]] = None
+input_ids: list[int]
+attention_mask: Optional[list[int]] = None
+token_type_ids: Optional[list[int]] = None
label: Optional[Union[int, float]] = None
def to_json_string(self):


@@ -136,7 +136,7 @@ class DebugUnderflowOverflow:
The model to debug.
max_frames_to_save (`int`, *optional*, defaults to 21):
How many frames back to record
-trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
+trace_batch_nums(`list[int]`, *optional*, defaults to `[]`):
Which batch numbers to trace (turns detection off)
abort_after_batch_num (`int``, *optional*):
Whether to abort after a certain batch number has finished


@@ -317,7 +317,7 @@ def get_cached_module_file(
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or *bool*, *optional*):
@@ -507,7 +507,7 @@ def get_class_from_dynamic_module(
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*):
@@ -593,7 +593,7 @@ def custom_object_save(obj: Any, folder: Union[str, os.PathLike], config: Option
A config in which to register the auto_map corresponding to this custom object.
Returns:
-`List[str]`: The list of files saved.
+`list[str]`: The list of files saved.
"""
if obj.__module__ == "__main__":
logger.warning(
@@ -762,7 +762,7 @@ def check_python_requirements(path_or_repo_id, requirements_file="requirements.t
This can be either:
- a string, the *model id* of a model repo on huggingface.co.
- a path to a *directory* potentially containing the file.
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
Additional arguments to pass to `cached_file`.
"""
failed = [] # error messages regarding requirements


@@ -81,13 +81,13 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
</Tip>
Args:
-processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`):
-Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of
-input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str,
-List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
+Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
+input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
+list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
collate function.
-Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
+Instead of `list[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
@@ -235,9 +235,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
Pad inputs (on left/right and up to predefined length or max length in the batch)
Args:
-processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
-Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
-of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
+processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
+Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
+of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see below)
padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
@@ -306,9 +306,9 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
Truncate inputs to predefined length or max length in the batch
Args:
-processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`):
-Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
-of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
+processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
+Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
+of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
max_length (`int`, *optional*):
maximum length of the returned list and optionally padding length (see below)
pad_to_multiple_of (`int`, *optional*) :


@@ -303,7 +303,7 @@ class FeatureExtractionMixin(PushToHubMixin):
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
-proxies (`Dict[str, str]`, *optional*):
+proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*):
@@ -326,7 +326,7 @@ class FeatureExtractionMixin(PushToHubMixin):
functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
`kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
controlled by the `return_unused_kwargs` keyword parameter.
@@ -392,7 +392,7 @@ class FeatureExtractionMixin(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
-kwargs (`Dict[str, Any]`, *optional*):
+kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
use_auth_token = kwargs.pop("use_auth_token", None)
@@ -454,7 +454,7 @@ class FeatureExtractionMixin(PushToHubMixin):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
-`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
+`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
@@ -555,11 +555,11 @@ class FeatureExtractionMixin(PushToHubMixin):
parameters.
Args:
-feature_extractor_dict (`Dict[str, Any]`):
+feature_extractor_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
[`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
-kwargs (`Dict[str, Any]`):
+kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the feature extractor object.
Returns:
@@ -588,7 +588,7 @@ class FeatureExtractionMixin(PushToHubMixin):
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary. Returns:
-`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__


@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import Optional
class Constraint(ABC):
@@ -51,7 +51,7 @@ class Constraint(ABC):
When called, returns the token(s) that would take this constraint one step closer to being fulfilled.
Return:
-token_ids (Union[int, List[int], None]):
+token_ids (Union[int, list[int], None]):
- A single token ID (int) that advances the constraint, or
- A list of token IDs that could advance the constraint
- None if the constraint is completed or cannot be advanced
@@ -134,11 +134,11 @@ class PhrasalConstraint(Constraint):
[`Constraint`] enforcing that an ordered sequence of tokens is included in the output.
Args:
-token_ids (`List[int]`):
+token_ids (`list[int]`):
The id of the token that must be generated by the output.
"""
-def __init__(self, token_ids: List[int]):
+def __init__(self, token_ids: list[int]):
super(Constraint, self).__init__()
if not isinstance(token_ids, list) or len(token_ids) == 0:
@@ -205,7 +205,7 @@ class PhrasalConstraint(Constraint):
class DisjunctiveTrie:
-def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
+def __init__(self, nested_token_ids: list[list[int]], no_subsets=True):
r"""
A helper class that builds a trie with the words represented in `nested_token_ids`.
"""
@@ -266,12 +266,12 @@ class DisjunctiveConstraint(Constraint):
A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints.
Args:
-nested_token_ids (`List[List[int]]`):
+nested_token_ids (`list[list[int]]`):
A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from
the list of words.
"""
-def __init__(self, nested_token_ids: List[List[int]]):
+def __init__(self, nested_token_ids: list[list[int]]):
super(Constraint, self).__init__()
if not isinstance(nested_token_ids, list) or len(nested_token_ids) == 0:
@@ -356,11 +356,11 @@ class ConstraintListState:
A class for beam scorers to track its progress through a list of constraints.
Args:
-constraints (`List[Constraint]`):
+constraints (`list[Constraint]`):
A list of [`Constraint`] objects that must be fulfilled by the beam scorer.
"""
-def __init__(self, constraints: List[Constraint]):
+def __init__(self, constraints: list[Constraint]):
self.constraints = constraints
# max # of steps required to fulfill a given constraint
@@ -418,7 +418,7 @@ class ConstraintListState:
else:
return token_list
-def reset(self, token_ids: Optional[List[int]]):
+def reset(self, token_ids: Optional[list[int]]):
"""
token_ids: the tokens generated thus far to reset the state of the progress through constraints.
"""


@@ -15,7 +15,7 @@
from abc import ABC, abstractmethod
from collections import UserDict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
import numpy as np
import torch
@@ -41,7 +41,7 @@ PROCESS_INPUTS_DOCSTRING = r"""
Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
beam_indices (`torch.LongTensor`, *optional*):
Beam indices indicating to which beam hypothesis each token correspond.
@@ -77,7 +77,7 @@ FINALIZE_INPUTS_DOCSTRING = r"""
The beam indices indicating to which beam the `final_beam_tokens` shall be added.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
Return:
@@ -103,7 +103,7 @@ class BeamScorer(ABC):
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
**kwargs,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
raise NotImplementedError("This is an abstract method.")
@abstractmethod
@@ -219,11 +219,11 @@ class BeamSearchScorer(BeamScorer):
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
group_index: Optional[int] = 0,
decoder_prompt_len: Optional[int] = 0,
-) -> Dict[str, torch.Tensor]:
+) -> dict[str, torch.Tensor]:
# add up to the length which the next_scores is calculated on (including decoder prompt)
cur_len = input_ids.shape[-1] + 1
batch_size = len(self._beam_hyps) // self.num_beam_groups
@@ -325,10 +325,10 @@ class BeamSearchScorer(BeamScorer):
final_beam_indices: torch.LongTensor,
max_length: int,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.LongTensor]:
+) -> tuple[torch.LongTensor]:
batch_size = len(self._beam_hyps) // self.num_beam_groups
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
@@ -426,7 +426,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
num_beams (`int`):
Number of beams for beam search.
-constraints (`List[Constraint]`):
+constraints (`list[Constraint]`):
A list of positive constraints represented as `Constraint` objects that must be fulfilled in the generation
output. For more information, the documentation of [`Constraint`] should be read.
device (`torch.device`):
@@ -457,7 +457,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
self,
batch_size: int,
num_beams: int,
-constraints: List[Constraint],
+constraints: list[Constraint],
device: torch.device,
length_penalty: Optional[float] = 1.0,
do_early_stopping: Optional[Union[bool, str]] = False,
@@ -518,10 +518,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
next_indices: torch.LongTensor,
scores_for_all_vocab: torch.FloatTensor,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.Tensor]:
+) -> tuple[torch.Tensor]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
@@ -541,7 +541,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
The scores of all tokens in the vocabulary for each of the beam hypotheses.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
-eos_token_id (`Union[int, List[int]]`, *optional*):
+eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
beam_indices (`torch.LongTensor`, *optional*):
Beam indices indicating to which beam hypothesis each token correspond.
@@ -818,10 +818,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
final_beam_indices: torch.LongTensor,
max_length: int,
pad_token_id: Optional[Union[int, torch.Tensor]] = None,
-eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
beam_indices: Optional[torch.LongTensor] = None,
decoder_prompt_len: Optional[int] = 0,
-) -> Tuple[torch.LongTensor]:
+) -> tuple[torch.LongTensor]:
batch_size = len(self._beam_hyps)
if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):


@@ -15,7 +15,7 @@
import copy
import weakref
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional
import numpy as np
import torch
@@ -44,7 +44,7 @@ from ..utils.deprecation import deprecate_kwarg
class CandidateGenerator:
"""Abstract base class for all candidate generators that can be applied during assisted generation."""
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -108,7 +108,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -198,7 +198,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
self.probs = []
self.matches = []
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -281,7 +281,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold
-def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]:
+def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> tuple[int, int]:
"""Calculate the minimum and maximum number of new tokens to generate."""
new_cur_len = input_ids.shape[-1]
max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
@@ -305,7 +305,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
return has_past_key_values
-def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict:
+def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> dict:
"""Prepare arguments for the generation call."""
return {
self.input_ids_key: input_ids,
@@ -315,7 +315,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
"logits_processor": self.logits_processor,
}
-def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def _generate_candidates(self, generation_args: dict) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""Generate candidate sequences using the assistant model."""
assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
@@ -374,7 +374,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -495,7 +495,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
return dest_ids.to(input_ids.device)
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -537,7 +537,7 @@ class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
return new_target_ids, None
-def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]:
+def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, int]:
"""Converts target input IDs to assistant input IDs, handling discrepancies."""
convert_kwargs = {
"source_tokenizer": self.target_tokenizer,
@@ -782,7 +782,7 @@ class AssistantToTargetTranslator:
max_assistant_index = max(assistant_vocab.values())
assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.SUPPRESS_TOKEN_ID, dtype=int)
-target_to_assistant_input_ids: Dict[int, int] = {}
+target_to_assistant_input_ids: dict[int, int] = {}
for tok, assistant_id in assistant_vocab.items():
target_id = target_vocab.get(tok)
if target_id is not None:
@@ -909,7 +909,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
atm_translator: AssistantToTargetTranslator,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
@@ -930,7 +930,7 @@ class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentT
self._target_seq_len_with_candidates: int = 0
self._prev_assistant_ids: Optional[torch.LongTensor] = None
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Simplified version of get_candidates that uses the translator cache for token conversion.
"""
@@ -1043,7 +1043,7 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -1153,7 +1153,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
-model_kwargs: Dict,
+model_kwargs: dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
@@ -1170,7 +1170,7 @@ class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
self.assistant_early_exit = self.generation_config.assistant_early_exit
self.generation_config.assistant_early_exit = None
-def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
# Temporarily sets the number of hidden layers to the early exit value
base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
original_num_hidden_layers = base_model.config.num_hidden_layers
@@ -1221,7 +1221,7 @@ def _crop_past_key_values(model, past_key_values, max_length):
return past_key_values
-def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_encoder_decoder: bool) -> Dict[str, Any]:
+def _prepare_attention_mask(model_kwargs: dict[str, Any], new_length: int, is_encoder_decoder: bool) -> dict[str, Any]:
"""Expands or crops the model's mask for decoding purposes, to the defined length"""
mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
@@ -1257,7 +1257,7 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en
return model_kwargs
-def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
+def _prepare_token_type_ids(model_kwargs: dict[str, Any], new_length: int) -> dict[str, Any]:
"""Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
return model_kwargs


@ -20,7 +20,7 @@ import os
import warnings import warnings
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass, is_dataclass from dataclasses import dataclass, is_dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from typing import TYPE_CHECKING, Any, Callable, Optional, Union
from .. import __version__ from .. import __version__
from ..configuration_utils import PretrainedConfig from ..configuration_utils import PretrainedConfig
@ -149,7 +149,7 @@ class GenerationConfig(PushToHubMixin):
max_time (`float`, *optional*): max_time (`float`, *optional*):
The maximum amount of time you allow the computation to run for in seconds. generation will still finish The maximum amount of time you allow the computation to run for in seconds. generation will still finish
the current pass after allocated time has been passed. the current pass after allocated time has been passed.
stop_strings (`str or List[str]`, *optional*): stop_strings (`str or list[str]`, *optional*):
A string or a list of strings that should terminate generation if the model outputs them. A string or a list of strings that should terminate generation if the model outputs them.
> Parameters that control the generation strategy used > Parameters that control the generation strategy used
@ -163,7 +163,7 @@ class GenerationConfig(PushToHubMixin):
[this paper](https://huggingface.co/papers/1610.02424) for more details. [this paper](https://huggingface.co/papers/1610.02424) for more details.
penalty_alpha (`float`, *optional*): penalty_alpha (`float`, *optional*):
The values balance the model confidence and the degeneration penalty in contrastive search decoding. The values balance the model confidence and the degeneration penalty in contrastive search decoding.
dola_layers (`str` or `List[int]`, *optional*): dola_layers (`str` or `list[int]`, *optional*):
The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must
be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively. be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively.
"low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the "low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the
@ -245,26 +245,26 @@ class GenerationConfig(PushToHubMixin):
`length_penalty` < 0.0 encourages shorter sequences. `length_penalty` < 0.0 encourages shorter sequences.
no_repeat_ngram_size (`int`, *optional*, defaults to 0): no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size can only occur once. If set to int > 0, all ngrams of that size can only occur once.
bad_words_ids (`List[List[int]]`, *optional*): bad_words_ids (`list[list[int]]`, *optional*):
List of list of token ids that are not allowed to be generated. Check List of list of token ids that are not allowed to be generated. Check
[`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples. [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*): force_words_ids (`list[list[int]]` or `list[list[list[int]]]`, *optional*):
List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of List of token ids that must be generated. If given a `list[list[int]]`, this is treated as a simple list of
words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this words that must be included, the opposite to `bad_words_ids`. If given `list[list[list[int]]]`, this
triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
can allow different forms of each word. can allow different forms of each word.
renormalize_logits (`bool`, *optional*, defaults to `False`): renormalize_logits (`bool`, *optional*, defaults to `False`):
Whether to renormalize the logits after applying all the logits processors (including the custom Whether to renormalize the logits after applying all the logits processors (including the custom
ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
are normalized but some logit processors break the normalization. are normalized but some logit processors break the normalization.
constraints (`List[Constraint]`, *optional*): constraints (`list[Constraint]`, *optional*):
Custom constraints that can be added to the generation to ensure that the output will contain the use of Custom constraints that can be added to the generation to ensure that the output will contain the use of
certain tokens as defined by `Constraint` objects, in the most sensible way possible. certain tokens as defined by `Constraint` objects, in the most sensible way possible.
forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`): forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
language token. language token.
forced_eos_token_id (`int` or List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`): forced_eos_token_id (`int` or list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
list to set multiple *end-of-sequence* tokens. list to set multiple *end-of-sequence* tokens.
remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`): remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
@ -274,13 +274,13 @@ class GenerationConfig(PushToHubMixin):
This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
penalty starts and `decay_factor` represents the factor of exponential decay penalty starts and `decay_factor` represents the factor of exponential decay
suppress_tokens (`List[int]`, *optional*): suppress_tokens (`list[int]`, *optional*):
A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their
log probs to `-inf` so that they are not sampled. log probs to `-inf` so that they are not sampled.
begin_suppress_tokens (`List[int]`, *optional*): begin_suppress_tokens (`list[int]`, *optional*):
A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit
processor will set their log probs to `-inf` so that they are not sampled. processor will set their log probs to `-inf` so that they are not sampled.
sequence_bias (`Dict[Tuple[int], float]`, *optional*): sequence_bias (`dict[tuple[int], float]`, *optional*):
Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
sequence being selected, while negative biases do the opposite. Check sequence being selected, while negative biases do the opposite. Check
[`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples. [`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
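As a hedged illustration of the `sequence_bias` mapping described above (the token ids below are made up; in practice they would come from the model's tokenizer with `add_special_tokens=False`):

```python
# Illustrative only: keys are tuples of token ids, values are additive bias terms.
sequence_bias = {
    (1042, 318): -10.0,  # strongly discourage this two-token sequence
    (7643,): 2.5,        # mildly encourage this single token
}
# The mapping can then be passed to `model.generate(..., sequence_bias=sequence_bias)` or stored on a
# `GenerationConfig`, where it is consumed by `SequenceBiasLogitsProcessor`.
```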
@ -325,7 +325,7 @@ class GenerationConfig(PushToHubMixin):
The id of the *padding* token. The id of the *padding* token.
bos_token_id (`int`, *optional*): bos_token_id (`int`, *optional*):
The id of the *beginning-of-sequence* token. The id of the *beginning-of-sequence* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
> Generation parameters exclusive to encoder-decoder models > Generation parameters exclusive to encoder-decoder models
@ -333,7 +333,7 @@ class GenerationConfig(PushToHubMixin):
encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0): encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
`decoder_input_ids`. `decoder_input_ids`.
decoder_start_token_id (`int` or `List[int]`, *optional*): decoder_start_token_id (`int` or `list[int]`, *optional*):
If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length
`batch_size`. Passing a list enables different start ids for each element in the batch `batch_size`. Passing a list enables different start ids for each element in the batch
(e.g. multilingual models with different target languages in one batch) (e.g. multilingual models with different target languages in one batch)
@ -846,7 +846,7 @@ class GenerationConfig(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
@ -933,7 +933,7 @@ class GenerationConfig(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -959,7 +959,7 @@ class GenerationConfig(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter. by the `return_unused_kwargs` keyword parameter.
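A short usage sketch of the loading path documented above, using the `openai-community/gpt2` repo as in the library's own examples; `return_unused_kwargs=True` splits the overrides into recognized attributes and leftovers:

```python
from transformers import GenerationConfig

generation_config, unused_kwargs = GenerationConfig.from_pretrained(
    "openai-community/gpt2",
    top_k=1,                      # recognized attribute: applied to the loaded config
    foo="bar",                    # not a config attribute: returned separately
    return_unused_kwargs=True,
)
print(generation_config.top_k)    # 1
print(unused_kwargs)              # {'foo': 'bar'}
```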
@ -1090,14 +1090,14 @@ class GenerationConfig(PushToHubMixin):
return json.loads(text) return json.loads(text)
@classmethod @classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig": def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "GenerationConfig":
""" """
Instantiates a [`GenerationConfig`] from a Python dictionary of parameters. Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.
Args: Args:
config_dict (`Dict[str, Any]`): config_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Dictionary that will be used to instantiate the configuration object.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the configuration object. Additional parameters from which to initialize the configuration object.
Returns: Returns:
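For example (values are illustrative), the dictionary supplies the base attributes and the extra kwargs are applied on top:

```python
from transformers import GenerationConfig

config_dict = {"max_new_tokens": 32, "do_sample": True}
generation_config = GenerationConfig.from_dict(config_dict, temperature=0.7)  # kwargs override/extend the dict
print(generation_config.max_new_tokens, generation_config.temperature)  # 32 0.7
```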
@ -1123,7 +1123,7 @@ class GenerationConfig(PushToHubMixin):
else: else:
return config return config
def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: def dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None:
""" """
Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None, Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into *"float32"* converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into *"float32"*
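A standalone sketch of that conversion (not the library code itself): walk the dict recursively and replace any non-string `torch_dtype` value with the name part of its string form.

```python
import torch

def dtype_to_str(d: dict) -> None:
    # Replace a torch.dtype stored under "torch_dtype" with its short name, e.g. torch.float32 -> "float32".
    if d.get("torch_dtype") is not None and not isinstance(d["torch_dtype"], str):
        d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
    for value in d.values():
        if isinstance(value, dict):
            dtype_to_str(value)  # recurse into nested dicts

cfg = {"torch_dtype": torch.float16, "sub_config": {"torch_dtype": torch.bfloat16}}
dtype_to_str(cfg)
print(cfg)  # {'torch_dtype': 'float16', 'sub_config': {'torch_dtype': 'bfloat16'}}
```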
@ -1135,13 +1135,13 @@ class GenerationConfig(PushToHubMixin):
if isinstance(value, dict): if isinstance(value, dict):
self.dict_torch_dtype_to_str(value) self.dict_torch_dtype_to_str(value)
def to_diff_dict(self) -> Dict[str, Any]: def to_diff_dict(self) -> dict[str, Any]:
""" """
Removes all attributes from config which correspond to the default config attributes for better readability and Removes all attributes from config which correspond to the default config attributes for better readability and
serializes to a Python dictionary. serializes to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
config_dict = self.to_dict() config_dict = self.to_dict()
@ -1158,12 +1158,12 @@ class GenerationConfig(PushToHubMixin):
self.dict_torch_dtype_to_str(serializable_config_dict) self.dict_torch_dtype_to_str(serializable_config_dict)
return serializable_config_dict return serializable_config_dict
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
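The difference between the two serializers above is easiest to see side by side; a minimal sketch (the exact keys kept in the diff dict depend on the installed version):

```python
from transformers import GenerationConfig

config = GenerationConfig(max_new_tokens=64)
full = config.to_dict()        # every attribute, defaults included
diff = config.to_diff_dict()   # only what differs from a freshly constructed default config
print(len(full) > len(diff))         # True
print(diff.get("max_new_tokens"))    # 64
```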
@ -1289,11 +1289,11 @@ class GenerationConfig(PushToHubMixin):
returning all the unused kwargs. returning all the unused kwargs.
Args: Args:
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Dictionary of attributes to tentatively update this class. Dictionary of attributes to tentatively update this class.
Returns: Returns:
`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
""" """
to_remove = [] to_remove = []
for key, value in kwargs.items(): for key, value in kwargs.items():
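A minimal sketch of the update contract described above (the unknown key name is illustrative): matching keys are applied in place, everything else comes back untouched.

```python
from transformers import GenerationConfig

config = GenerationConfig()
leftover = config.update(max_new_tokens=128, not_a_generation_field=1)
print(config.max_new_tokens)  # 128
print(leftover)               # {'not_a_generation_field': 1}
```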
@ -1319,7 +1319,7 @@ class BaseWatermarkingConfig(ABC):
Constructs a BaseWatermarkingConfig instance from a dictionary of parameters. Constructs a BaseWatermarkingConfig instance from a dictionary of parameters.
Args: Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters. config_dict (dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values. **kwargs: Additional keyword arguments to override dictionary values.
Returns: Returns:
@ -1348,12 +1348,12 @@ class BaseWatermarkingConfig(ABC):
writer.write(json_string) writer.write(json_string)
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
""" """
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance. dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
return output return output
@ -1479,7 +1479,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
Args: Args:
ngram_len (`int`): ngram_len (`int`):
Ngram length. Ngram length.
keys (`List[int]`): keys (`list[int]`):
A sequence of watermarking keys, one for each depth. A sequence of watermarking keys, one for each depth.
context_history_size (`int`, *optional*, defaults to 1024): context_history_size (`int`, *optional*, defaults to 1024):
Size of the tensor to keep track of seen contexts. Size of the tensor to keep track of seen contexts.
@ -1518,7 +1518,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
def __init__( def __init__(
self, self,
ngram_len: int, ngram_len: int,
keys: List[int], keys: list[int],
context_history_size: int = 1024, context_history_size: int = 1024,
sampling_table_seed: int = 0, sampling_table_seed: int = 0,
sampling_table_size: int = 2**16, sampling_table_size: int = 2**16,
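A hedged end-to-end sketch of wiring this config into `generate`; the checkpoint and key values are placeholders, and watermarking requires sampling to be enabled:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

watermarking_config = SynthIDTextWatermarkingConfig(
    ngram_len=5,                                              # context window used for hashing
    keys=[654, 400, 836, 123, 340, 443, 597, 160, 57, 29],    # one placeholder key per depth
)

inputs = tokenizer(["A short prompt"], return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=16,
    watermarking_config=watermarking_config,
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```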
@ -1605,6 +1605,6 @@ class CompileConfig:
# Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes. # Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
_compile_all_devices = None _compile_all_devices = None
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> dict[str, Any]:
"""Serializes this instance to a Python dictionary.""" """Serializes this instance to a Python dictionary."""
return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"}) return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})

View File

@ -23,7 +23,7 @@ from collections import deque
from dataclasses import dataclass, field from dataclasses import dataclass, field
from enum import Enum from enum import Enum
from functools import partial from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union from typing import Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
@ -59,16 +59,16 @@ class GenerationOutput:
Attributes: Attributes:
request_id (str): The ID of the generation request. request_id (str): The ID of the generation request.
prompt_ids (List[int]): The IDs of the prompt tokens. prompt_ids (list[int]): The IDs of the prompt tokens.
generated_tokens (List[int]): The generated tokens. generated_tokens (list[int]): The generated tokens.
logprobs (List[float]): The log probabilities of the generated tokens. logprobs (list[float]): The log probabilities of the generated tokens.
error (Optional[str]): Any error message associated with the request. When None, the request was successful. error (Optional[str]): Any error message associated with the request. When None, the request was successful.
""" """
request_id: str request_id: str
prompt_ids: List[int] = field(default_factory=list) prompt_ids: list[int] = field(default_factory=list)
generated_tokens: List[int] = field(default_factory=list) generated_tokens: list[int] = field(default_factory=list)
logprobs: List[float] = field(default_factory=list) logprobs: list[float] = field(default_factory=list)
error: Optional[str] = None error: Optional[str] = None
status: RequestStatus = RequestStatus.PENDING status: RequestStatus = RequestStatus.PENDING
created_time: float = field(default_factory=time.time) created_time: float = field(default_factory=time.time)
@ -85,11 +85,11 @@ class RequestState:
# Required fields # Required fields
request_id: str request_id: str
prompt_ids: Optional[List[int]] = None # the one being processed prompt_ids: Optional[list[int]] = None # the one being processed
full_prompt_ids: Optional[List[int]] = None # the full prompt full_prompt_ids: Optional[list[int]] = None # the full prompt
remaining_prompt_ids: List[int] = field(default_factory=list) # For split requests remaining_prompt_ids: list[int] = field(default_factory=list) # For split requests
static_outputs: List[int] = field(default_factory=list) static_outputs: list[int] = field(default_factory=list)
allocated_blocks: List[int] = field(default_factory=list) allocated_blocks: list[int] = field(default_factory=list)
position_offset: int = 0 # Current position in the sequence for position_ids position_offset: int = 0 # Current position in the sequence for position_ids
status: RequestStatus = RequestStatus.PENDING status: RequestStatus = RequestStatus.PENDING
max_new_tokens: int = 20 max_new_tokens: int = 20
@ -150,8 +150,8 @@ class PagedAttentionCache(Cache):
generation_config: GenerationConfig, generation_config: GenerationConfig,
device: torch.device, device: torch.device,
dtype: torch.dtype = torch.float16, dtype: torch.dtype = torch.float16,
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
initial_prompt_shapes: Optional[List[List[int]]] = None, initial_prompt_shapes: Optional[list[list[int]]] = None,
) -> None: ) -> None:
"""Initialize a paged attention cache for efficient memory usage. """Initialize a paged attention cache for efficient memory usage.
@ -191,8 +191,8 @@ class PagedAttentionCache(Cache):
self.dtype = dtype self.dtype = dtype
self.device = device self.device = device
self.key_cache: List[torch.Tensor] = [] self.key_cache: list[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = [] self.value_cache: list[torch.Tensor] = []
for idx in range(config.num_hidden_layers): for idx in range(config.num_hidden_layers):
layer_device = layer_device_map[idx] if layer_device_map is not None else device layer_device = layer_device_map[idx] if layer_device_map is not None else device
new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device) new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device)
@ -206,10 +206,10 @@ class PagedAttentionCache(Cache):
# Block management data structures # Block management data structures
self._free_blocks = deque(range(num_blocks)) self._free_blocks = deque(range(num_blocks))
self._block_tables: Dict[str, List[int]] = {} self._block_tables: dict[str, list[int]] = {}
@traced @traced
def allocate_blocks(self, n_blocks: int, request_id: str) -> List[int]: def allocate_blocks(self, n_blocks: int, request_id: str) -> list[int]:
"""Allocates n_blocks for a given request_id.""" """Allocates n_blocks for a given request_id."""
if len(self._free_blocks) < n_blocks: if len(self._free_blocks) < n_blocks:
return False return False
@ -236,12 +236,12 @@ class PagedAttentionCache(Cache):
"""Returns the number of free blocks available.""" """Returns the number of free blocks available."""
return len(self._free_blocks) return len(self._free_blocks)
def get_block_table(self, request_id: str) -> List[int]: def get_block_table(self, request_id: str) -> list[int]:
"""Returns the block table for a request.""" """Returns the block table for a request."""
return self._block_tables.get(request_id, []) return self._block_tables.get(request_id, [])
@traced @traced
def _get_physical_indices(self, state: RequestState, logical_indices: List[int]) -> List[int]: def _get_physical_indices(self, state: RequestState, logical_indices: list[int]) -> list[int]:
""" """
Maps logical sequence indices to physical cache indices using the block table, using PyTorch. Maps logical sequence indices to physical cache indices using the block table, using PyTorch.
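The logical-to-physical mapping is just block-table arithmetic; a standalone sketch of the idea (not the exact implementation in this file):

```python
def physical_indices(block_table: list[int], logical_indices: list[int], block_size: int) -> list[int]:
    # Each logical position falls in block `i // block_size` at offset `i % block_size`;
    # the block table says which physical block that logical block was allocated.
    return [block_table[i // block_size] * block_size + (i % block_size) for i in logical_indices]

# The request owns physical blocks 7 and 2; with block_size=4, logical token 5 maps to slot 9.
print(physical_indices([7, 2], [0, 5], block_size=4))  # [28, 9]
```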
@ -289,7 +289,7 @@ class PagedAttentionCache(Cache):
read_index, read_index,
write_index, write_index,
**kwargs, **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# Reshape cache for easier indexing # Reshape cache for easier indexing
total_slots = self.num_blocks * self.block_size total_slots = self.num_blocks * self.block_size
k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim) k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim)
@ -306,9 +306,9 @@ class Scheduler(ABC):
""" """
def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False): def __init__(self, cache: PagedAttentionCache, retain_cache_on_finish: bool = False):
self.active_requests: Dict[str, RequestState] = {} self.active_requests: dict[str, RequestState] = {}
self.waiting_requests: Dict[str, RequestState] = {} self.waiting_requests: dict[str, RequestState] = {}
self.waiting_requests_order: Deque[str] = deque() self.waiting_requests_order: deque[str] = deque()
self.cache = cache self.cache = cache
self.retain_cache_on_finish = retain_cache_on_finish self.retain_cache_on_finish = retain_cache_on_finish
@ -318,7 +318,7 @@ class Scheduler(ABC):
pass pass
@abstractmethod @abstractmethod
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
pass pass
@traced @traced
@ -332,7 +332,7 @@ class Scheduler(ABC):
pass pass
@traced @traced
def get_active_request_static_outputs(self, request_id: str) -> List[int]: def get_active_request_static_outputs(self, request_id: str) -> list[int]:
if request_id in self.active_requests: if request_id in self.active_requests:
return self.active_requests[request_id].static_outputs return self.active_requests[request_id].static_outputs
return [] return []
@ -356,7 +356,7 @@ class FIFOScheduler(Scheduler):
@traced(span_name="prepare_request") @traced(span_name="prepare_request")
def _prepare_request_for_processing( def _prepare_request_for_processing(
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str] self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
): ):
"""Prepare a request for processing in the current batch.""" """Prepare a request for processing in the current batch."""
request_tokens = ( request_tokens = (
@ -395,9 +395,9 @@ class FIFOScheduler(Scheduler):
self.waiting_requests_order.append(state.request_id) self.waiting_requests_order.append(state.request_id)
@traced @traced
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
priority_states: List[RequestState] = [] priority_states: list[RequestState] = []
second_priority_states: List[RequestState] = [] second_priority_states: list[RequestState] = []
scheduled_requests = [] scheduled_requests = []
for state in self.active_requests.values(): for state in self.active_requests.values():
@ -475,7 +475,7 @@ class PrefillFirstScheduler(Scheduler):
@traced(span_name="prepare_request") @traced(span_name="prepare_request")
def _prepare_request_for_processing( def _prepare_request_for_processing(
self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: Set[str] self, state: RequestState, token_budget: int, request_ids_to_remove_from_waiting: set[str]
): ):
"""Prepare a request for processing in the current batch.""" """Prepare a request for processing in the current batch."""
request_tokens = ( request_tokens = (
@ -514,9 +514,9 @@ class PrefillFirstScheduler(Scheduler):
self.waiting_requests_order.append(state.request_id) self.waiting_requests_order.append(state.request_id)
@traced @traced
def schedule_batch(self, token_budget: int) -> List[RequestState]: def schedule_batch(self, token_budget: int) -> list[RequestState]:
priority_states: List[RequestState] = [] priority_states: list[RequestState] = []
second_priority_states: List[RequestState] = [] second_priority_states: list[RequestState] = []
scheduled_requests = [] scheduled_requests = []
for state in self.active_requests.values(): for state in self.active_requests.values():
@ -581,7 +581,7 @@ def compute_optimal_blocks(
device: torch.device, device: torch.device,
config: PretrainedConfig, config: PretrainedConfig,
generation_config: GenerationConfig, generation_config: GenerationConfig,
inputs: List[List[int]], inputs: list[list[int]],
dtype: torch.dtype = torch.bfloat16, dtype: torch.dtype = torch.bfloat16,
safety_margin: float = 0.9, safety_margin: float = 0.9,
median_prefill_length: Optional[int] = None, median_prefill_length: Optional[int] = None,
@ -678,7 +678,7 @@ class PagedAttentionArgs:
write_index: torch.Tensor write_index: torch.Tensor
read_index: torch.Tensor read_index: torch.Tensor
logits_indices: torch.Tensor logits_indices: torch.Tensor
block_tables: Dict[str, List[int]] block_tables: dict[str, list[int]]
cache: PagedAttentionCache cache: PagedAttentionCache
use_cache: bool = False use_cache: bool = False
@ -754,7 +754,7 @@ class ContinuousBatchProcessor:
self.streaming = streaming self.streaming = streaming
self.manual_eviction = manual_eviction self.manual_eviction = manual_eviction
self.requests_in_batch: List[RequestState] = [] self.requests_in_batch: list[RequestState] = []
# Get batch size parameters from generation config # Get batch size parameters from generation config
self._configure_batch_parameters() self._configure_batch_parameters()
@ -1152,7 +1152,7 @@ class ContinuousBatchingManager:
self._generation_thread = None self._generation_thread = None
def add_request( def add_request(
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None self, input_ids: list[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
) -> str: ) -> str:
"""Add a new generation request to the queue. """Add a new generation request to the queue.
@ -1184,7 +1184,7 @@ class ContinuousBatchingManager:
logger.debug(f"Added request {request_id} to queue.") logger.debug(f"Added request {request_id} to queue.")
return request_id return request_id
def add_requests(self, inputs: List[List[int]], **kwargs): def add_requests(self, inputs: list[list[int]], **kwargs):
for i, input_ids in enumerate(inputs): for i, input_ids in enumerate(inputs):
# Assign a predictable request ID for ordering results later # Assign a predictable request ID for ordering results later
req_id = f"batch_req_{i}" req_id = f"batch_req_{i}"
@ -1428,11 +1428,11 @@ class ContinuousMixin:
@torch.inference_mode() @torch.inference_mode()
def generate_batch( def generate_batch(
self, self,
inputs: List[List[int]], inputs: list[list[int]],
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
progress_bar: bool = True, progress_bar: bool = True,
**kwargs, **kwargs,
) -> List[List[int]]: ) -> list[list[int]]:
"""Generate sequences for a batch of prompts using continuous batching. """Generate sequences for a batch of prompts using continuous batching.
Args: Args:
@ -1441,7 +1441,7 @@ class ContinuousMixin:
**kwargs: Additional generation parameters **kwargs: Additional generation parameters
Returns: Returns:
`List[List[int]]`: A list containing the generated sequences (including prompt tokens `list[list[int]]`: A list containing the generated sequences (including prompt tokens
if not handled otherwise) for each input prompt, in the same order. if not handled otherwise) for each input prompt, in the same order.
Returns an empty list `[]` for requests that failed. Returns an empty list `[]` for requests that failed.
""" """

View File

@ -39,7 +39,7 @@ LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`): scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
search or log softmax for each vocabulary token when using beam search search or log softmax for each vocabulary token when using beam search
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs. Additional logits processor specific kwargs.
Return: Return:
@ -276,7 +276,7 @@ class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
beginning of the generation. beginning of the generation.
Args: Args:
begin_suppress_tokens (`List[int]`): begin_suppress_tokens (`list[int]`):
Tokens to not sample. Tokens to not sample.
begin_index (`int`): begin_index (`int`):
Index where the tokens are suppressed. Index where the tokens are suppressed.

View File

@ -19,7 +19,7 @@ import copy
import inspect import inspect
import warnings import warnings
from functools import partial from functools import partial
from typing import Any, Dict, Optional, Union from typing import Any, Optional, Union
import flax import flax
import jax import jax
@ -103,7 +103,7 @@ class GreedyState:
sequences: jnp.ndarray sequences: jnp.ndarray
running_token: jnp.ndarray running_token: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
@flax.struct.dataclass @flax.struct.dataclass
@ -113,7 +113,7 @@ class SampleState:
running_token: jnp.ndarray running_token: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
prng_key: jnp.ndarray prng_key: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
@flax.struct.dataclass @flax.struct.dataclass
@ -124,7 +124,7 @@ class BeamSearchState:
sequences: jnp.ndarray sequences: jnp.ndarray
scores: jnp.ndarray scores: jnp.ndarray
is_sent_finished: jnp.ndarray is_sent_finished: jnp.ndarray
model_kwargs: Dict[str, jnp.ndarray] model_kwargs: dict[str, jnp.ndarray]
class FlaxGenerationMixin: class FlaxGenerationMixin:
@ -173,7 +173,7 @@ class FlaxGenerationMixin:
batch_size: int, batch_size: int,
decoder_start_token_id: Optional[int] = None, decoder_start_token_id: Optional[int] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
) -> jnp.ndarray: ) -> jnp.ndarray:
if model_kwargs is not None and "decoder_input_ids" in model_kwargs: if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
# Only use this arg if not None, otherwise just remove from model_kwargs # Only use this arg if not None, otherwise just remove from model_kwargs
@ -249,7 +249,7 @@ class FlaxGenerationMixin:
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}" exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
raise TypeError(exception_message) raise TypeError(exception_message)
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
unused_model_args = [] unused_model_args = []
model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters) model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
@ -273,7 +273,7 @@ class FlaxGenerationMixin:
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
prng_key: Optional[jnp.ndarray] = None, prng_key: Optional[jnp.ndarray] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
**kwargs, **kwargs,
): ):
@ -293,13 +293,13 @@ class FlaxGenerationMixin:
trace (`bool`, *optional*, defaults to `True`): trace (`bool`, *optional*, defaults to `True`):
Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
considerably slower runtime. considerably slower runtime.
params (`Dict[str, jnp.ndarray]`, *optional*): params (`dict[str, jnp.ndarray]`, *optional*):
Optionally the model parameters can be passed. Can be useful for parallelized generation. Optionally the model parameters can be passed. Can be useful for parallelized generation.
logits_processor (`FlaxLogitsProcessorList`, *optional*): logits_processor (`FlaxLogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users. generation config an error is thrown. This feature is intended for advanced users.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
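A hedged Flax sketch, assuming `openai-community/gpt2` Flax weights as a placeholder; here `params` is simply the model's own parameters, but the same hook is what parallelized generation relies on:

```python
from transformers import AutoTokenizer, FlaxAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = FlaxAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["Hello"], return_tensors="np")
out = model.generate(inputs.input_ids, params=model.params, max_length=16)
print(tokenizer.batch_decode(out.sequences, skip_special_tokens=True))
```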
@ -580,8 +580,8 @@ class FlaxGenerationMixin:
eos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
# init values # init values
max_length = max_length if max_length is not None else self.generation_config.max_length max_length = max_length if max_length is not None else self.generation_config.max_length
@ -668,8 +668,8 @@ class FlaxGenerationMixin:
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
logits_warper: Optional[FlaxLogitsProcessorList] = None, logits_warper: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
# init values # init values
max_length = max_length if max_length is not None else self.generation_config.max_length max_length = max_length if max_length is not None else self.generation_config.max_length
@ -765,9 +765,9 @@ class FlaxGenerationMixin:
early_stopping: Optional[Union[bool, str]] = None, early_stopping: Optional[Union[bool, str]] = None,
logits_processor: Optional[FlaxLogitsProcessorList] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None,
trace: bool = True, trace: bool = True,
params: Optional[Dict[str, jnp.ndarray]] = None, params: Optional[dict[str, jnp.ndarray]] = None,
num_return_sequences: Optional[int] = None, num_return_sequences: Optional[int] = None,
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[dict[str, jnp.ndarray]] = None,
): ):
""" """
This beam search function is heavily inspired by Flax's official example: This beam search function is heavily inspired by Flax's official example:

View File

@ -16,7 +16,7 @@
import inspect import inspect
import math import math
from collections.abc import Iterable from collections.abc import Iterable
from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Callable, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -72,7 +72,7 @@ class LogitsProcessorList(list):
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`): scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be logits for each vocabulary when not using Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
beam search or log softmax for each vocabulary token when using beam search beam search or log softmax for each vocabulary token when using beam search
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional kwargs that are specific to a logits processor. Additional kwargs that are specific to a logits processor.
Return: Return:
@ -103,7 +103,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
Args: Args:
min_length (`int`): min_length (`int`):
The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`. The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
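A small runnable sketch of the behaviour these arguments describe (the vocabulary size and token ids are made up): while fewer than `min_length` tokens exist, the EOS logit is forced to `-inf`.

```python
import torch
from transformers import MinLengthLogitsProcessor

processor = MinLengthLogitsProcessor(min_length=5, eos_token_id=2)

input_ids = torch.tensor([[0, 10, 11]])  # only 3 tokens so far, below min_length
scores = torch.zeros(1, 32)              # fake logits over a 32-token vocabulary
out = processor(input_ids, scores)
print(out[0, 2])  # tensor(-inf) -> EOS cannot be sampled yet
```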
@ -134,7 +134,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, min_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"): def __init__(self, min_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
if not isinstance(min_length, int) or min_length < 0: if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}") raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")
@ -167,7 +167,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
input length. input length.
min_new_tokens (`int`): min_new_tokens (`int`):
The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`. The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
@ -197,7 +197,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
self, self,
prompt_length_to_skip: int, prompt_length_to_skip: int,
min_new_tokens: int, min_new_tokens: int,
eos_token_id: Union[int, List[int], torch.Tensor], eos_token_id: Union[int, list[int], torch.Tensor],
device: str = "cpu", device: str = "cpu",
): ):
for arg_name, arg_value in [ for arg_name, arg_value in [
@ -917,7 +917,7 @@ def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
def _calc_banned_ngram_tokens( def _calc_banned_ngram_tokens(
ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int
) -> List[Iterable[int]]: ) -> list[Iterable[int]]:
"""Copied from fairseq for no_repeat_ngram in beam_search""" """Copied from fairseq for no_repeat_ngram in beam_search"""
if cur_len + 1 < ngram_size: if cur_len + 1 < ngram_size:
# return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
@ -1074,7 +1074,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
</Tip> </Tip>
Args: Args:
sequence_bias (`List[List[Union[List[int], float]]]`): sequence_bias (`list[list[Union[list[int], float]]]`):
List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0], List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0],
[[64], -7.5]]`). Positive biases increase the odds of the [[64], -7.5]]`). Positive biases increase the odds of the
sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias
@ -1123,7 +1123,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, sequence_bias: List[List[Union[List[int], float]]]): def __init__(self, sequence_bias: list[list[Union[list[int], float]]]):
self.sequence_bias = sequence_bias self.sequence_bias = sequence_bias
self._validate_arguments() self._validate_arguments()
self._convert_list_arguments_into_dict() self._convert_list_arguments_into_dict()
@ -1250,9 +1250,9 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
</Tip> </Tip>
Args: Args:
bad_words_ids (`List[List[int]]`): bad_words_ids (`list[list[int]]`):
List of list of token ids that are not allowed to be generated. List of list of token ids that are not allowed to be generated.
eos_token_id (`Union[int, List[int], torch.Tensor]`, *optional*): eos_token_id (`Union[int, list[int], torch.Tensor]`, *optional*):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
Examples: Examples:
@ -1291,7 +1291,7 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
""" """
def __init__( def __init__(
self, bad_words_ids: List[List[int]], eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None self, bad_words_ids: list[list[int]], eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None
): ):
self.bad_word_ids = bad_words_ids self.bad_word_ids = bad_words_ids
self._validate_arguments() self._validate_arguments()
@ -1332,7 +1332,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
generation. See [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904) for more information. generation. See [Autoregressive Entity Retrieval](https://huggingface.co/papers/2010.00904) for more information.
Args: Args:
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`):
This function constrains the beam search to allowed tokens only at each step. This function takes 2 This function constrains the beam search to allowed tokens only at each step. This function takes 2
arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
@ -1373,7 +1373,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int): def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]], num_beams: int):
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self._num_beams = num_beams self._num_beams = num_beams
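A hedged sketch of a constraint function with this signature (placeholder checkpoint; a realistic function would inspect `input_ids` rather than return a fixed set):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

allowed_ids = tokenizer(" yes no maybe", add_special_tokens=False).input_ids

def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor) -> list[int]:
    # `input_ids` holds the tokens generated so far for this beam; ignored in this toy constraint.
    return allowed_ids

inputs = tokenizer(["The answer is"], return_tensors="pt")
out = model.generate(**inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_new_tokens=3)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```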
@ -1586,7 +1586,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
Args: Args:
max_length (`int`): max_length (`int`):
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
device (`str`, *optional*, defaults to `"cpu"`): device (`str`, *optional*, defaults to `"cpu"`):
The device to allocate the tensors. The device to allocate the tensors.
@ -1613,7 +1613,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
``` ```
""" """
def __init__(self, max_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"): def __init__(self, max_length: int, eos_token_id: Union[int, list[int], torch.Tensor], device: str = "cpu"):
self.max_length = max_length self.max_length = max_length
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
@ -1666,7 +1666,7 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
exponential_decay_length_penalty (`tuple(int, float)`): exponential_decay_length_penalty (`tuple(int, float)`):
This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty
starts and `decay_factor` represents the factor of exponential decay starts and `decay_factor` represents the factor of exponential decay
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
input_ids_seq_length (`int`): input_ids_seq_length (`int`):
The length of the input sequence. The length of the input sequence.
@ -1726,8 +1726,8 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
def __init__( def __init__(
self, self,
exponential_decay_length_penalty: Tuple[int, float], exponential_decay_length_penalty: tuple[int, float],
eos_token_id: Union[int, List[int], torch.Tensor], eos_token_id: Union[int, list[int], torch.Tensor],
input_ids_seq_length: int, input_ids_seq_length: int,
): ):
self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
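A hedged usage sketch of the `(start_index, decay_factor)` tuple from `generate` (placeholder checkpoint): after roughly 20 new tokens, the EOS score gets an exponentially growing boost, nudging generation to finish.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["Once upon a time"], return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=64,
    exponential_decay_length_penalty=(20, 1.5),  # (start_index, decay_factor) as described above
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```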
@ -2326,13 +2326,13 @@ class BarkEosPrioritizerLogitsProcessor(LogitsProcessor):
</Tip> </Tip>
Args: Args:
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
min_eos_p (`float`, *optional*): min_eos_p (`float`, *optional*):
Minimum end of speech threshold. Minimum end of speech threshold.
""" """
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor], min_eos_p: float, device: str = "cpu"): def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor], min_eos_p: float, device: str = "cpu"):
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
@ -2569,7 +2569,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
Args: Args:
ngram_len (`int`): ngram_len (`int`):
Ngram length. Ngram length.
keys (`List[int]`): keys (`list[int]`):
A sequence of watermarking keys, one for each depth. A sequence of watermarking keys, one for each depth.
sampling_table_size (`int`): sampling_table_size (`int`):
Size of the sampling table. Size of the sampling table.
@ -2610,7 +2610,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
def __init__( def __init__(
self, self,
ngram_len: int, ngram_len: int,
keys: List[int], keys: list[int],
sampling_table_size: int, sampling_table_size: int,
sampling_table_seed: int, sampling_table_seed: int,
context_history_size: int, context_history_size: int,
@ -2808,7 +2808,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
def _compute_keys( def _compute_keys(
self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor
) -> Tuple[torch.LongTensor, torch.LongTensor]: ) -> tuple[torch.LongTensor, torch.LongTensor]:
"""Computes random keys for each ngram and depth. """Computes random keys for each ngram and depth.
Args: Args:

View File

@ -3,7 +3,7 @@ import warnings
from abc import ABC from abc import ABC
from collections import OrderedDict from collections import OrderedDict
from copy import deepcopy from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union from typing import Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -33,7 +33,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input, or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional stopping criteria specific kwargs. Additional stopping criteria specific kwargs.
Return: Return:
@ -209,7 +209,7 @@ class StopStringCriteria(StoppingCriteria):
Args: Args:
tokenizer (`PreTrainedTokenizer`): tokenizer (`PreTrainedTokenizer`):
The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences) The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences)
stop_strings (`Union[str, List[str]]`): stop_strings (`Union[str, list[str]]`):
A list of strings that should end generation. If a string is passed, it will be treated like a A list of strings that should end generation. If a string is passed, it will be treated like a
list with a single element. list with a single element.
@ -239,10 +239,10 @@ class StopStringCriteria(StoppingCriteria):
``` ```
""" """
def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, List[str]]): def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, list[str]]):
if isinstance(stop_strings, str): if isinstance(stop_strings, str):
stop_strings = [stop_strings] stop_strings = [stop_strings]
self.stop_strings: Tuple[str, ...] = tuple(stop_strings) self.stop_strings: tuple[str, ...] = tuple(stop_strings)
vocab = tokenizer.get_vocab() vocab = tokenizer.get_vocab()
token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values()) token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache( self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
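A hedged sketch using the `stop_strings` shortcut on `generate`, which builds this criteria internally (placeholder checkpoint; the tokenizer must be passed so the stop strings can be matched against the vocabulary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer(["The three primary colors are"], return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=30,
    stop_strings=[" and"],   # generation halts once this string appears in the decoded text
    tokenizer=tokenizer,     # needed so the criteria can map strings onto token sequences
)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```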
@ -298,7 +298,7 @@ class StopStringCriteria(StoppingCriteria):
@staticmethod @staticmethod
def _stop_string_get_matching_positions( def _stop_string_get_matching_positions(
token_list, token_indices, stop_strings token_list, token_indices, stop_strings
) -> Tuple[Dict[str, Dict[str, List[int]]], Dict[str, Dict[str, List[int]]]]: ) -> tuple[dict[str, dict[str, list[int]]], dict[str, dict[str, list[int]]]]:
"""This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can """This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can
validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the
token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters
@ -337,7 +337,7 @@ class StopStringCriteria(StoppingCriteria):
return token_valid_positions, token_end_overlaps return token_valid_positions, token_end_overlaps
@staticmethod @staticmethod
def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> Dict[str, torch.tensor]: def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> dict[str, torch.tensor]:
"""This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs """This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs
them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values
that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!""" that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!"""
@ -455,11 +455,11 @@ class EosTokenCriteria(StoppingCriteria):
By default, it uses the `model.generation_config.eos_token_id`. By default, it uses the `model.generation_config.eos_token_id`.
Args: Args:
eos_token_id (`Union[int, List[int], torch.Tensor]`): eos_token_id (`Union[int, list[int], torch.Tensor]`):
The id(s) of the *end-of-sequence* token. The id(s) of the *end-of-sequence* token.
""" """
def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor]): def __init__(self, eos_token_id: Union[int, list[int], torch.Tensor]):
if not isinstance(eos_token_id, torch.Tensor): if not isinstance(eos_token_id, torch.Tensor):
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]

View File

@ -14,7 +14,6 @@
# limitations under the License. # limitations under the License.
import inspect import inspect
from typing import List, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@ -42,7 +41,7 @@ TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
cur_len (`int`): cur_len (`int`):
The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
is the maximum length generate can produce, and we need to know which of its tokens are valid. is the maximum length generate can produce, and we need to know which of its tokens are valid.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional logits processor specific kwargs. Additional logits processor specific kwargs.
Return: Return:
@ -290,7 +289,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
[`TFLogitsProcessor`] that enforces that specified sequences will never be sampled. [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
Args: Args:
bad_words_ids (`List[List[int]]`): bad_words_ids (`list[list[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space` the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
@ -300,8 +299,8 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
The id of the *end-of-sequence* token. The id of the *end-of-sequence* token.
""" """
def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): def __init__(self, bad_words_ids: list[list[int]], eos_token_id: int):
if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
@ -370,7 +369,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
# To remain simple and XLA-compatible, we work on a per-row fashion. # To remain simple and XLA-compatible, we work on a per-row fashion.
# TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes # TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
# a frequent choke point. (make `cur_len` a tensor?) # a frequent choke point. (make `cur_len` a tensor?)
def _get_row_updated_score(row_inputs: Tuple[tf.Tensor]) -> tf.Tensor: def _get_row_updated_score(row_inputs: tuple[tf.Tensor]) -> tf.Tensor:
row_input_ids, row_score = row_inputs row_input_ids, row_score = row_inputs
banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len]) banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
banned_tokens_mask = tf.scatter_nd( banned_tokens_mask = tf.scatter_nd(
@ -565,7 +564,7 @@ class TFForceTokensLogitsProcessor(TFLogitsProcessor):
indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
`-inf` so that they are sampled at their corresponding index.""" `-inf` so that they are sampled at their corresponding index."""
def __init__(self, force_token_map: List[List[int]]): def __init__(self, force_token_map: list[list[int]]):
force_token_map = dict(force_token_map) force_token_map = dict(force_token_map)
# Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
# index of the array corresponds to the index of the token to be forced, for XLA compatibility. # index of the array corresponds to the index of the token to be forced, for XLA compatibility.

View File

@ -18,7 +18,7 @@ import copy
import inspect import inspect
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union from typing import Any, Optional, Union
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@ -77,9 +77,9 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -116,12 +116,12 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -147,9 +147,9 @@ class TFSampleDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -186,12 +186,12 @@ class TFSampleEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -223,10 +223,10 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -270,13 +270,13 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -308,10 +308,10 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -354,13 +354,13 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput):
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
sequences_scores: Optional[tf.Tensor] = None sequences_scores: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
beam_indices: Optional[tf.Tensor] = None beam_indices: Optional[tf.Tensor] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -385,9 +385,9 @@ class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None attentions: Optional[tuple[tuple[tf.Tensor]]] = None
hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
@dataclass @dataclass
@ -423,12 +423,12 @@ class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
""" """
sequences: Optional[tf.Tensor] = None sequences: Optional[tf.Tensor] = None
scores: Optional[Tuple[tf.Tensor]] = None scores: Optional[tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None encoder_attentions: Optional[tuple[tf.Tensor]] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None encoder_hidden_states: Optional[tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None cross_attentions: Optional[tuple[tuple[tf.Tensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None decoder_hidden_states: Optional[tuple[tuple[tf.Tensor]]] = None
TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput] TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
@ -477,7 +477,7 @@ class TFGenerationMixin:
def compute_transition_scores( def compute_transition_scores(
self, self,
sequences: tf.Tensor, sequences: tf.Tensor,
scores: Tuple[tf.Tensor], scores: tuple[tf.Tensor],
beam_indices: Optional[tf.Tensor] = None, beam_indices: Optional[tf.Tensor] = None,
normalize_logits: bool = False, normalize_logits: bool = False,
) -> tf.Tensor: ) -> tf.Tensor:
@ -619,7 +619,7 @@ class TFGenerationMixin:
exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}" exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
raise TypeError(exception_message) raise TypeError(exception_message)
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
# Excludes arguments that are handled before calling any model function # Excludes arguments that are handled before calling any model function
if self.config.is_encoder_decoder: if self.config.is_encoder_decoder:
@ -681,10 +681,10 @@ class TFGenerationMixin:
Custom logits processors that complement the default logits processors built from arguments and Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users. generation config an error is thrown. This feature is intended for advanced users.
seed (`List[int]`, *optional*): seed (`list[int]`, *optional*):
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
`seed` argument from stateless functions in `tf.random`. `seed` argument from stateless functions in `tf.random`.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
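A minimal sketch of the two-integer stateless `seed` documented above for the TF `generate` API (checkpoint, prompt, and seed values are illustrative assumptions):

```python
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Once upon a time", return_tensors="tf")
out_a = model.generate(**inputs, do_sample=True, seed=[42, 0], max_new_tokens=20)
out_b = model.generate(**inputs, do_sample=True, seed=[42, 0], max_new_tokens=20)
# Reusing the same two-integer seed should reproduce the same sampled continuation.
```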
@ -1044,7 +1044,7 @@ class TFGenerationMixin:
def _prepare_encoder_decoder_kwargs_for_generation( def _prepare_encoder_decoder_kwargs_for_generation(
self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
) -> Dict[str, Any]: ) -> dict[str, Any]:
# 1. get encoder and store encoder outputs # 1. get encoder and store encoder outputs
encoder = self.get_encoder() encoder = self.get_encoder()
@ -1076,10 +1076,10 @@ class TFGenerationMixin:
self, self,
batch_size: int, batch_size: int,
model_input_name: str, model_input_name: str,
model_kwargs: Dict[str, tf.Tensor], model_kwargs: dict[str, tf.Tensor],
decoder_start_token_id: Optional[int] = None, decoder_start_token_id: Optional[int] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]: ) -> tuple[tf.Tensor, dict[str, tf.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models""" """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
@ -1138,7 +1138,7 @@ class TFGenerationMixin:
input_ids: Optional[tf.Tensor] = None, input_ids: Optional[tf.Tensor] = None,
expand_in_new_axis: bool = False, expand_in_new_axis: bool = False,
**model_kwargs, **model_kwargs,
) -> Tuple[tf.Tensor, Dict[str, Any]]: ) -> tuple[tf.Tensor, dict[str, Any]]:
""" """
Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...], Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
@ -1174,8 +1174,8 @@ class TFGenerationMixin:
self, self,
inputs: Optional[tf.Tensor] = None, inputs: Optional[tf.Tensor] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, tf.Tensor]] = None, model_kwargs: Optional[dict[str, tf.Tensor]] = None,
) -> Tuple[tf.Tensor, Optional[str], Dict[str, tf.Tensor]]: ) -> tuple[tf.Tensor, Optional[str], dict[str, tf.Tensor]]:
""" """
This function extracts the model-specific `inputs` for generation. This function extracts the model-specific `inputs` for generation.
""" """
@ -1240,7 +1240,7 @@ class TFGenerationMixin:
self, self,
inputs: Optional[tf.Tensor] = None, inputs: Optional[tf.Tensor] = None,
bos_token_id: Optional[int] = None, bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, tf.Tensor]] = None, model_kwargs: Optional[dict[str, tf.Tensor]] = None,
) -> tf.Tensor: ) -> tf.Tensor:
"""Initializes input ids for generation, if necessary.""" """Initializes input ids for generation, if necessary."""
if inputs is not None: if inputs is not None:
@ -1276,8 +1276,8 @@ class TFGenerationMixin:
return past_key_values return past_key_values
def _update_model_kwargs_for_generation( def _update_model_kwargs_for_generation(
self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False self, outputs: ModelOutput, model_kwargs: dict[str, Any], is_encoder_decoder: bool = False
) -> Dict[str, Any]: ) -> dict[str, Any]:
# update past_key_values # update past_key_values
model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs) model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs)
@ -1294,7 +1294,7 @@ class TFGenerationMixin:
def _update_model_kwargs_for_xla_generation( def _update_model_kwargs_for_xla_generation(
self, self,
model_outputs: ModelOutput, model_outputs: ModelOutput,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
cur_len: int, cur_len: int,
max_length: int, max_length: int,
batch_size: int, batch_size: int,
@ -1550,7 +1550,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@ -1794,7 +1794,7 @@ class TFGenerationMixin:
max_length: Optional[int] = None, max_length: Optional[int] = None,
pad_token_id: Optional[int] = None, pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None,
seed: Optional[Tuple[int, int]] = None, seed: Optional[tuple[int, int]] = None,
output_attentions: Optional[bool] = None, output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None, output_scores: Optional[bool] = None,
@ -1818,9 +1818,9 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
seed (`List[int]`, *optional*): seed (`list[int]`, *optional*):
Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
`seed` argument from stateless functions in `tf.random`. `seed` argument from stateless functions in `tf.random`.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
@ -2128,7 +2128,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
length_penalty (`float`, *optional*, defaults to 1.0): length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
@ -2719,7 +2719,7 @@ class TFGenerationMixin:
The maximum length of the sequence to be generated. The maximum length of the sequence to be generated.
pad_token_id (`int`, *optional*): pad_token_id (`int`, *optional*):
The id of the *padding* token. The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*): eos_token_id (`Union[int, list[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`): output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under

View File

@ -18,7 +18,7 @@ import inspect
import os import os
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Callable, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -169,11 +169,11 @@ class GenerateDecoderOnlyOutput(ModelOutput):
""" """
sequences: torch.LongTensor sequences: torch.LongTensor
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -214,14 +214,14 @@ class GenerateEncoderDecoderOutput(ModelOutput):
""" """
sequences: torch.LongTensor sequences: torch.LongTensor
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -260,12 +260,12 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
sequences: torch.LongTensor sequences: torch.LongTensor
sequences_scores: Optional[torch.FloatTensor] = None sequences_scores: Optional[torch.FloatTensor] = None
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
beam_indices: Optional[torch.LongTensor] = None beam_indices: Optional[torch.LongTensor] = None
attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
@dataclass @dataclass
@ -314,15 +314,15 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
sequences: torch.LongTensor sequences: torch.LongTensor
sequences_scores: Optional[torch.FloatTensor] = None sequences_scores: Optional[torch.FloatTensor] = None
scores: Optional[Tuple[torch.FloatTensor]] = None scores: Optional[tuple[torch.FloatTensor]] = None
logits: Optional[Tuple[torch.FloatTensor]] = None logits: Optional[tuple[torch.FloatTensor]] = None
beam_indices: Optional[torch.LongTensor] = None beam_indices: Optional[torch.LongTensor] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
# TODO (joao): remove the equivalent classes and typing shortcuts below in v5 # TODO (joao): remove the equivalent classes and typing shortcuts below in v5
@ -457,7 +457,7 @@ class GenerationMixin(ContinuousMixin):
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
inputs_embeds: Optional[torch.FloatTensor], inputs_embeds: Optional[torch.FloatTensor],
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.FloatTensor, torch.LongTensor]: ) -> tuple[torch.FloatTensor, torch.LongTensor]:
""" """
Generic cache-dependent input preparation Generic cache-dependent input preparation
The code is put in a separate function to allow granular unit testing The code is put in a separate function to allow granular unit testing
@ -491,7 +491,7 @@ class GenerationMixin(ContinuousMixin):
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
inputs_embeds: Optional[torch.FloatTensor], inputs_embeds: Optional[torch.FloatTensor],
cache_position: Optional[torch.LongTensor], cache_position: Optional[torch.LongTensor],
) -> Tuple[torch.FloatTensor, torch.LongTensor]: ) -> tuple[torch.FloatTensor, torch.LongTensor]:
""" """
This method implements method ``_cache_dependant_input_preparation`` This method implements method ``_cache_dependant_input_preparation``
with :func:`torch.cond` to make it exportable with :func:`torch.export.export`. with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
@ -697,8 +697,8 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs: Optional[torch.Tensor] = None, inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[torch.Tensor] = None, bos_token_id: Optional[torch.Tensor] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None, model_kwargs: Optional[dict[str, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]: ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
""" """
This function extracts the model-specific `inputs` for generation. This function extracts the model-specific `inputs` for generation.
""" """
@ -761,7 +761,7 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs: Optional[torch.Tensor] = None, inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[torch.Tensor] = None, bos_token_id: Optional[torch.Tensor] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None, model_kwargs: Optional[dict[str, torch.Tensor]] = None,
) -> torch.LongTensor: ) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary.""" """Initializes input ids for generation, if necessary."""
if inputs is not None: if inputs is not None:
@ -793,7 +793,7 @@ class GenerationMixin(ContinuousMixin):
self, self,
inputs_tensor: torch.Tensor, inputs_tensor: torch.Tensor,
generation_config: GenerationConfig, generation_config: GenerationConfig,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
) -> torch.LongTensor: ) -> torch.LongTensor:
pad_token_id = generation_config._pad_token_tensor pad_token_id = generation_config._pad_token_tensor
eos_token_id = generation_config._eos_token_tensor eos_token_id = generation_config._eos_token_tensor
@ -831,7 +831,7 @@ class GenerationMixin(ContinuousMixin):
model_kwargs, model_kwargs,
model_input_name: Optional[str], model_input_name: Optional[str],
generation_config: GenerationConfig, generation_config: GenerationConfig,
) -> Dict[str, Any]: ) -> dict[str, Any]:
# 1. get encoder # 1. get encoder
encoder = self.get_encoder() encoder = self.get_encoder()
# Compatibility with Accelerate big model inference: we need the encoder to output stuff on the same device # Compatibility with Accelerate big model inference: we need the encoder to output stuff on the same device
@ -870,10 +870,10 @@ class GenerationMixin(ContinuousMixin):
self, self,
batch_size: int, batch_size: int,
model_input_name: str, model_input_name: str,
model_kwargs: Dict[str, torch.Tensor], model_kwargs: dict[str, torch.Tensor],
decoder_start_token_id: torch.Tensor, decoder_start_token_id: torch.Tensor,
device: Optional[torch.device] = None, device: Optional[torch.device] = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models""" """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
@ -931,7 +931,7 @@ class GenerationMixin(ContinuousMixin):
is_encoder_decoder: bool = False, is_encoder_decoder: bool = False,
input_ids: Optional[torch.LongTensor] = None, input_ids: Optional[torch.LongTensor] = None,
**model_kwargs, **model_kwargs,
) -> Tuple[torch.LongTensor, Dict[str, Any]]: ) -> tuple[torch.LongTensor, dict[str, Any]]:
"""Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]""" """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
# Do not call torch.repeat_interleave if expand_size is 1 because it clones # Do not call torch.repeat_interleave if expand_size is 1 because it clones
# the input tensor and thus requires more memory although no change is applied # the input tensor and thus requires more memory although no change is applied
@ -963,10 +963,10 @@ class GenerationMixin(ContinuousMixin):
def _update_model_kwargs_for_generation( def _update_model_kwargs_for_generation(
self, self,
outputs: ModelOutput, outputs: ModelOutput,
model_kwargs: Dict[str, Any], model_kwargs: dict[str, Any],
is_encoder_decoder: bool = False, is_encoder_decoder: bool = False,
num_new_tokens: int = 1, num_new_tokens: int = 1,
) -> Dict[str, Any]: ) -> dict[str, Any]:
# update past_key_values keeping its naming used in model code # update past_key_values keeping its naming used in model code
for possible_cache_name in ALL_CACHE_NAMES: for possible_cache_name in ALL_CACHE_NAMES:
if possible_cache_name in outputs: if possible_cache_name in outputs:
@ -1024,7 +1024,7 @@ class GenerationMixin(ContinuousMixin):
logits_processor: LogitsProcessorList, logits_processor: LogitsProcessorList,
target_tokenizer: "PreTrainedTokenizerBase", target_tokenizer: "PreTrainedTokenizerBase",
assistant_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase",
model_kwargs: Dict, model_kwargs: dict,
) -> CandidateGenerator: ) -> CandidateGenerator:
""" """
Returns the candidate generator to be used in `assisted_generation` Returns the candidate generator to be used in `assisted_generation`
@ -1100,10 +1100,10 @@ class GenerationMixin(ContinuousMixin):
generation_config: GenerationConfig, generation_config: GenerationConfig,
input_ids_seq_length: Optional[int] = None, input_ids_seq_length: Optional[int] = None,
encoder_input_ids: torch.LongTensor = None, encoder_input_ids: torch.LongTensor = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
device: Optional[str] = None, device: Optional[str] = None,
model_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[dict[str, Any]] = None,
negative_prompt_ids: Optional[torch.Tensor] = None, negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None, negative_prompt_attention_mask: Optional[torch.Tensor] = None,
) -> LogitsProcessorList: ) -> LogitsProcessorList:
@ -1403,7 +1403,7 @@ class GenerationMixin(ContinuousMixin):
def compute_transition_scores( def compute_transition_scores(
self, self,
sequences: torch.Tensor, sequences: torch.Tensor,
scores: Tuple[torch.Tensor], scores: tuple[torch.Tensor],
beam_indices: Optional[torch.Tensor] = None, beam_indices: Optional[torch.Tensor] = None,
normalize_logits: bool = False, normalize_logits: bool = False,
) -> torch.Tensor: ) -> torch.Tensor:
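`compute_transition_scores` is the public API whose annotation changes here; for reference, a short usage sketch (model name and prompt are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Today is", return_tensors="pt")
outputs = model.generate(
    **inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True
)
# One log-probability per generated token, shape (batch_size, num_generated_tokens).
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)
print(transition_scores.shape)
```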
@ -1552,7 +1552,7 @@ class GenerationMixin(ContinuousMixin):
f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}." f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}."
) )
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_model_kwargs(self, model_kwargs: dict[str, Any]):
"""Validates model kwargs for generation. Generate argument typos will also be caught here.""" """Validates model kwargs for generation. Generate argument typos will also be caught here."""
# If a `Cache` instance is passed, checks whether the model is compatible with it # If a `Cache` instance is passed, checks whether the model is compatible with it
if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class: if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
@ -1709,8 +1709,8 @@ class GenerationMixin(ContinuousMixin):
return generation_config return generation_config
def _prepare_generation_config( def _prepare_generation_config(
self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Dict self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: dict
) -> Tuple[GenerationConfig, Dict]: ) -> tuple[GenerationConfig, dict]:
""" """
Prepares the base generation config, then applies any generation configuration options from kwargs. This Prepares the base generation config, then applies any generation configuration options from kwargs. This
function handles retrocompatibility with respect to configuration files. function handles retrocompatibility with respect to configuration files.
@ -1821,7 +1821,7 @@ class GenerationMixin(ContinuousMixin):
model_kwargs["cache_position"] = cache_position model_kwargs["cache_position"] = cache_position
return model_kwargs return model_kwargs
def _get_layer_device_map_for_cache_init(self) -> Optional[Dict[int, Union[str, int]]]: def _get_layer_device_map_for_cache_init(self) -> Optional[dict[int, Union[str, int]]]:
""" """
Returns the device map for each decoder layer, to allocate the cache on the right device. Returns the device map for each decoder layer, to allocate the cache on the right device.
Inspired from `dispatch_model` in accelerate. Inspired from `dispatch_model` in accelerate.
@ -1982,7 +1982,7 @@ class GenerationMixin(ContinuousMixin):
def _prepare_cache_for_generation( def _prepare_cache_for_generation(
self, self,
generation_config: GenerationConfig, generation_config: GenerationConfig,
model_kwargs: Dict, model_kwargs: dict,
assistant_model: "PreTrainedModel", assistant_model: "PreTrainedModel",
batch_size: int, batch_size: int,
max_cache_length: int, max_cache_length: int,
@ -2191,7 +2191,7 @@ class GenerationMixin(ContinuousMixin):
generation_config._pad_token_tensor = pad_token_tensor generation_config._pad_token_tensor = pad_token_tensor
generation_config._decoder_start_token_tensor = decoder_start_token_tensor generation_config._decoder_start_token_tensor = decoder_start_token_tensor
def _valid_auto_compile_criteria(self, model_kwargs: Dict, generation_config: GenerationConfig) -> bool: def _valid_auto_compile_criteria(self, model_kwargs: dict, generation_config: GenerationConfig) -> bool:
""" """
Determines whether to trigger auto-compilation of the model's forward pass at generation time. Determines whether to trigger auto-compilation of the model's forward pass at generation time.
""" """
@ -2239,7 +2239,7 @@ class GenerationMixin(ContinuousMixin):
generation_config: Optional[GenerationConfig] = None, generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
synced_gpus: Optional[bool] = None, synced_gpus: Optional[bool] = None,
assistant_model: Optional["PreTrainedModel"] = None, assistant_model: Optional["PreTrainedModel"] = None,
streamer: Optional["BaseStreamer"] = None, streamer: Optional["BaseStreamer"] = None,
@ -2287,7 +2287,7 @@ class GenerationMixin(ContinuousMixin):
generation config an error is thrown. If your stopping criteria depends on the `scores` input, make generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
intended for advanced users. intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
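A short sketch of a callable satisfying the `prefix_allowed_tokens_fn` contract described above (the checkpoint and the particular restriction are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Paris is the capital of", return_tensors="pt")


def prefix_allowed_tokens_fn(batch_id, input_ids):
    # Constrain every step to an arbitrary slice of the vocabulary (illustration only).
    return list(range(1000))


outputs = model.generate(
    **inputs, num_beams=4, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_new_tokens=10
)
```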
@ -2321,7 +2321,7 @@ class GenerationMixin(ContinuousMixin):
function defined in that repository's `custom_generate/generate.py` file will be executed instead of the function defined in that repository's `custom_generate/generate.py` file will be executed instead of the
standard `generate` method. Note that the logic for generation is entirely defined in that standard `generate` method. Note that the logic for generation is entirely defined in that
repository, and the return type may be different from the standard `generate` method. repository, and the return type may be different from the standard `generate` method.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
@ -2695,7 +2695,7 @@ class GenerationMixin(ContinuousMixin):
def typeerror(): def typeerror():
raise ValueError( raise ValueError(
"`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` " "`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]` "
f"of positive integers, but is {generation_config.force_words_ids}." f"of positive integers, but is {generation_config.force_words_ids}."
) )
@ -2871,7 +2871,7 @@ class GenerationMixin(ContinuousMixin):
def _dola_decoding( def _dola_decoding(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
dola_layers: Union[str, List[int]], dola_layers: Union[str, list[int]],
logits_processor: LogitsProcessorList, logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList, stopping_criteria: StoppingCriteriaList,
generation_config: GenerationConfig, generation_config: GenerationConfig,
@ -2888,7 +2888,7 @@ class GenerationMixin(ContinuousMixin):
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
dola_layers (`Union[str, List[int]]`): dola_layers (`Union[str, list[int]]`):
The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
to be used for candidate layers. The 0-th layer is the word embedding layer of the model. to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
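A hedged sketch of DoLa decoding as described by the `dola_layers` docstring above, assuming `generate` exposes the argument as documented (checkpoint and prompt are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The tallest mountain on Earth is", return_tensors="pt")

# "high" contrasts the final layer with the higher part of the model's layers;
# a list such as dola_layers=[4, 6, 8] would name candidate layers explicitly.
outputs = model.generate(**inputs, dola_layers="high", do_sample=False, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```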
@ -3806,7 +3806,7 @@ class GenerationMixin(ContinuousMixin):
num_beams: int, num_beams: int,
vocab_size: int, vocab_size: int,
batch_size: int, batch_size: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Get top-K continuations given the accumulated log probs on the next token. Get top-K continuations given the accumulated log probs on the next token.
@ -3855,7 +3855,7 @@ class GenerationMixin(ContinuousMixin):
topk_running_beam_indices: torch.Tensor, topk_running_beam_indices: torch.Tensor,
next_token_hits_stopping_criteria: torch.Tensor, next_token_hits_stopping_criteria: torch.Tensor,
num_beams: int, num_beams: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
best non-finished beams to continue beam search in the next iteration. best non-finished beams to continue beam search in the next iteration.
@ -3886,7 +3886,7 @@ class GenerationMixin(ContinuousMixin):
decoder_prompt_len: int, decoder_prompt_len: int,
length_penalty: float, length_penalty: float,
early_stopping: Union[bool, str], early_stopping: Union[bool, str],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
""" """
Updates the finished beams if (and only if) there are new completed sequences that have a higher score than Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
the current finished sequences. the current finished sequences.
@ -5236,8 +5236,8 @@ def _split(data, full_batch_size: int, split_size: int):
def _split_model_inputs( def _split_model_inputs(
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig model_input: Union[ModelOutput, dict], split_size: int, full_batch_size: int, config: PretrainedConfig
) -> List[Union[ModelOutput, Dict]]: ) -> list[Union[ModelOutput, dict]]:
""" """
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
@ -5292,14 +5292,14 @@ def _split_model_inputs(
] ]
# Convert each dictionary in the list to an object of the inferred class # Convert each dictionary in the list to an object of the inferred class
split_model_inputs: List[Union[ModelOutput, Dict]] = [ split_model_inputs: list[Union[ModelOutput, dict]] = [
model_output_cls(**data_split, **bool_data) for data_split in data_split_list model_output_cls(**data_split, **bool_data) for data_split in data_split_list
] ]
return split_model_inputs return split_model_inputs
def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput: def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConfig) -> ModelOutput:
""" """
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
specific ModelOutput subclass from the list provided. specific ModelOutput subclass from the list provided.
@ -5379,8 +5379,8 @@ def _relative_top_filter(
def _dola_select_contrast( def _dola_select_contrast(
candidate_premature_layers: List[int], candidate_premature_layers: list[int],
candidate_premature_logits: Dict[int, torch.FloatTensor], candidate_premature_logits: dict[int, torch.FloatTensor],
final_logits: torch.FloatTensor, final_logits: torch.FloatTensor,
) -> torch.FloatTensor: ) -> torch.FloatTensor:
if len(candidate_premature_layers) == 1: if len(candidate_premature_layers) == 1:

View File

@ -16,7 +16,7 @@
import collections import collections
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from typing import Any, Dict, Optional, Tuple, Union from typing import Any, Optional, Union
import numpy as np import numpy as np
import torch import torch
@ -126,7 +126,7 @@ class WatermarkDetector:
self, self,
model_config: PretrainedConfig, model_config: PretrainedConfig,
device: str, device: str,
watermarking_config: Union[WatermarkingConfig, Dict], watermarking_config: Union[WatermarkingConfig, dict],
ignore_repeated_ngrams: bool = False, ignore_repeated_ngrams: bool = False,
max_cache_size: int = 128, max_cache_size: int = 128,
): ):
@ -300,7 +300,7 @@ class BayesianDetectorWatermarkedLikelihood(nn.Module):
self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth)) self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth))
self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth)) self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth))
def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: def _compute_latents(self, g_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Computes the unique token probability distribution given g-values. """Computes the unique token probability distribution given g-values.
Args: Args:

View File

@ -81,7 +81,7 @@ def HfArg(
``` ```
Args: Args:
aliases (Union[str, List[str]], optional): aliases (Union[str, list[str]], optional):
Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`. Single string or list of strings of aliases to pass on to argparse, e.g. `aliases=["--example", "-e"]`.
Defaults to None. Defaults to None.
help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None. help (str, optional): Help string to pass on to argparse that can be displayed with --help. Defaults to None.
@ -119,7 +119,7 @@ class HfArgumentParser(ArgumentParser):
Args: Args:
dataclass_types (`DataClassType` or `Iterable[DataClassType]`, *optional*): dataclass_types (`DataClassType` or `Iterable[DataClassType]`, *optional*):
Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Passed to `argparse.ArgumentParser()` in the regular way. Passed to `argparse.ArgumentParser()` in the regular way.
""" """

View File

@ -127,7 +127,7 @@ class ImageProcessingMixin(PushToHubMixin):
resume_download: resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible. Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers. Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*): proxies (`dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*): token (`str` or `bool`, *optional*):
@ -153,7 +153,7 @@ class ImageProcessingMixin(PushToHubMixin):
subfolder (`str`, *optional*, defaults to `""`): subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here. specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
The values in kwargs of any keys which are image processor attributes will be used to override the The values in kwargs of any keys which are image processor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
controlled by the `return_unused_kwargs` keyword parameter. controlled by the `return_unused_kwargs` keyword parameter.
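A hedged sketch of `from_pretrained` with the kwargs documented above; attributes passed as keyword arguments override the values loaded from the checkpoint (checkpoint and override value are illustrative):

```python
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    do_resize=False,  # image-processor attribute supplied via kwargs overrides the loaded value
)
print(image_processor.do_resize)  # False
```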
@ -219,7 +219,7 @@ class ImageProcessingMixin(PushToHubMixin):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace). namespace).
kwargs (`Dict[str, Any]`, *optional*): kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
""" """
use_auth_token = kwargs.pop("use_auth_token", None) use_auth_token = kwargs.pop("use_auth_token", None)
@ -286,7 +286,7 @@ class ImageProcessingMixin(PushToHubMixin):
The name of the file in the model directory to use for the image processor config. The name of the file in the model directory to use for the image processor config.
Returns: Returns:
`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
""" """
cache_dir = kwargs.pop("cache_dir", None) cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False) force_download = kwargs.pop("force_download", False)
@ -387,11 +387,11 @@ class ImageProcessingMixin(PushToHubMixin):
Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters. Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
Args: Args:
image_processor_dict (`Dict[str, Any]`): image_processor_dict (`dict[str, Any]`):
Dictionary that will be used to instantiate the image processor object. Such a dictionary can be Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the retrieved from a pretrained checkpoint by leveraging the
[`~image_processing_utils.ImageProcessingMixin.to_dict`] method. [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
kwargs (`Dict[str, Any]`): kwargs (`dict[str, Any]`):
Additional parameters from which to initialize the image processor object. Additional parameters from which to initialize the image processor object.
Returns: Returns:
@ -431,7 +431,7 @@ class ImageProcessingMixin(PushToHubMixin):
Serializes this instance to a Python dictionary. Serializes this instance to a Python dictionary.
Returns: Returns:
`Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. `dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
""" """
output = copy.deepcopy(self.__dict__) output = copy.deepcopy(self.__dict__)
output["image_processor_type"] = self.__class__.__name__ output["image_processor_type"] = self.__class__.__name__

View File

@ -130,7 +130,7 @@ class BaseImageProcessor(ImageProcessingMixin):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to center crop. Image to center crop.
size (`Dict[str, int]`): size (`dict[str, int]`):
Size of the output image. Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*): data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input The channel dimension format for the output image. If unset, the channel dimension format of the input
@ -227,7 +227,7 @@ def get_size_dict(
is set, it is added to the dict as `{"longest_edge": max_size}`. is set, it is added to the dict as `{"longest_edge": max_size}`.
Args: Args:
size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): size (`Union[int, Iterable[int], dict[str, int]]`, *optional*):
The `size` parameter to be cast into a size dictionary. The `size` parameter to be cast into a size dictionary.
max_size (`Optional[int]`, *optional*): max_size (`Optional[int]`, *optional*):
The `max_size` parameter to be cast into a size dictionary. The `max_size` parameter to be cast into a size dictionary.
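An illustrative sketch of the normalization `get_size_dict` performs, assuming the helper behaves as its docstring above describes (the concrete values are made up):

```python
from transformers.image_processing_utils import get_size_dict

print(get_size_dict(224))                           # {"height": 224, "width": 224}
print(get_size_dict(224, default_to_square=False))  # {"shortest_edge": 224}
print(get_size_dict(224, max_size=448, default_to_square=False))
# {"shortest_edge": 224, "longest_edge": 448}
```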

View File

@ -382,7 +382,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
Args: Args:
image (`"torch.Tensor"`): image (`"torch.Tensor"`):
Image to center crop. Image to center crop.
size (`Dict[str, int]`): size (`dict[str, int]`):
Size of the output image. Size of the output image.
Returns: Returns:
@ -666,12 +666,12 @@ class SemanticSegmentationMixin:
Args: Args:
outputs ([`MobileNetV2ForSemanticSegmentation`]): outputs ([`MobileNetV2ForSemanticSegmentation`]):
Raw outputs of the model. Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*): target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
predictions will not be resized. predictions will not be resized.
Returns: Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id. specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
""" """

View File

@@ -217,7 +217,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, in
Computes the output image size given the input image size and the desired output size.
Args:
- image_size (`Tuple[int, int]`):
+ image_size (`tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
@@ -266,7 +266,7 @@ def get_resize_output_image_size(
Args:
input_image (`np.ndarray`):
The image to resize.
- size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
+ size (`int` or `tuple[int, int]` or list[int] or `tuple[int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
this.
@@ -334,7 +334,7 @@ def resize(
Args:
image (`np.ndarray`):
The image to resize.
- size (`Tuple[int, int]`):
+ size (`tuple[int, int]`):
The size to use for resizing the image.
resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
The filter to user for resampling.
@@ -464,7 +464,7 @@ def center_crop(
Args:
image (`np.ndarray`):
The image to crop.
- size (`Tuple[int, int]`):
+ size (`tuple[int, int]`):
The target size for the cropped image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. Can be one of:
@@ -704,7 +704,7 @@ def pad(
Args:
image (`np.ndarray`):
The image to pad.
- padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
+ padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
Padding to apply to the edges of the height, width axes. Can be one of three formats:
- `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
- `((before, after),)` yields same before and after pad for height and width.
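The `padding` formats listed for `pad` mirror what `np.pad` accepts on the spatial axes. A small plain-NumPy sketch of the first two forms (not the library function itself):

```python
import numpy as np

image = np.zeros((4, 4, 3))  # (height, width, channels), illustrative

# ((before_height, after_height), (before_width, after_width)): distinct pads per spatial axis.
padded = np.pad(image, ((1, 2), (3, 4), (0, 0)), mode="constant", constant_values=0)
print(padded.shape)  # (7, 11, 3)

# ((before, after),) means the same pad for height and width -> equivalent to ((1, 2), (1, 2)).
padded_same = np.pad(image, ((1, 2), (1, 2), (0, 0)), mode="constant")
print(padded_same.shape)  # (7, 7, 3)
```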

@@ -218,7 +218,7 @@ def make_flat_list_of_images(
Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
If the input is a nested list of images, it is converted to a flat list of images.
Args:
- images (`Union[List[ImageInput], ImageInput]`):
+ images (`Union[list[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images or a 4d array of images.
@@ -252,7 +252,7 @@ def make_nested_list_of_images(
"""
Ensure that the output is a nested list of images.
Args:
- images (`Union[List[ImageInput], ImageInput]`):
+ images (`Union[list[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of list of images or a list of 4d array of images.
@@ -300,7 +300,7 @@ def infer_channel_dimension_format(
Args:
image (`np.ndarray`):
The image to infer the channel dimension of.
- num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
+ num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
The number of channels of the image.
Returns:
@@ -393,7 +393,7 @@ def get_image_size_for_max_height_width(
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
- image_size (`Tuple[int, int]`):
+ image_size (`tuple[int, int]`):
The image to resize.
max_height (`int`):
The maximum allowed height.
@@ -678,9 +678,9 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize.
- mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
+ mean (`list[float]` or `np.ndarray` or `torch.Tensor`):
The mean (per channel) to use for normalization.
- std (`List[float]` or `np.ndarray` or `torch.Tensor`):
+ std (`list[float]` or `np.ndarray` or `torch.Tensor`):
The standard deviation (per channel) to use for normalization.
rescale (`bool`, *optional*, defaults to `False`):
Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
@@ -729,7 +729,7 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
- size (`int` or `Tuple[int, int]`):
+ size (`int` or `tuple[int, int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
matched to this.
@@ -797,7 +797,7 @@ class ImageFeatureExtractionMixin:
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
The image to resize.
- size (`int` or `Tuple[int, int]`):
+ size (`int` or `tuple[int, int]`):
The size to which crop the image.
Returns:
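`infer_channel_dimension_format` decides whether an array is channels-first or channels-last by checking which axis length appears in `num_channels` (default `(1, 3)`). A naive sketch of that check, independent of the library code and ignoring ambiguous cases:

```python
import numpy as np


def guess_channel_axis(image: np.ndarray, num_channels: tuple[int, ...] = (1, 3)) -> str:
    """Very rough guess: 'channels_first' if axis 0 looks like channels, else 'channels_last'."""
    if image.ndim != 3:
        raise ValueError(f"Expected a 3D array, got shape {image.shape}")
    if image.shape[0] in num_channels:
        return "channels_first"
    if image.shape[-1] in num_channels:
        return "channels_last"
    raise ValueError(f"Could not infer channel axis for shape {image.shape}")


print(guess_channel_axis(np.zeros((3, 224, 224))))  # channels_first
print(guess_channel_axis(np.zeros((224, 224, 3))))  # channels_last
```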

@@ -156,7 +156,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
model (`torch.nn.Module`): The model to inspect.
Returns:
- List[List[str]]: A list of lists of parameter names being all tied together.
+ list[list[str]]: A list of lists of parameter names being all tied together.
Example:
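`find_tied_parameters` returns a `list[list[str]]`, each inner list naming parameters that share storage. A naive way to surface such groups (illustrative only, not the accelerate/transformers implementation):

```python
from collections import defaultdict

from torch import nn

model = nn.Linear(4, 4)
model.tied_copy = nn.Linear(4, 4, bias=False)
model.tied_copy.weight = model.weight  # tie the two weight tensors

groups: dict[int, list[str]] = defaultdict(list)
for name, param in model.named_parameters(remove_duplicate=False):
    groups[param.data_ptr()].append(name)  # parameters sharing a data pointer are tied

tied: list[list[str]] = [names for names in groups.values() if len(names) > 1]
print(tied)  # [['weight', 'tied_copy.weight']]
```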

@@ -306,7 +306,7 @@ def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
Fuse the LayerNorm layers into a target class using autoawq
Args:
- fuse_module_names (`List[str]`):
+ fuse_module_names (`list[str]`):
The list of module names to fuse
module (`nn.Module`):
The pytorch parent module that has layernorm modules to fuse
@@ -333,7 +333,7 @@ def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_
The input pretrained model
current_module_name (`str`):
The current submodule name
- fuse_module_names (`List[str]`):
+ fuse_module_names (`list[str]`):
The list of module names to fuse. For the MLP layers it has to be an array
of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
module (`nn.Module`):
@@ -374,7 +374,7 @@ def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_na
The input pretrained model
module (`nn.Module`):
The pytorch parent module that has layernorm modules to fuse
- modules_to_fuse (`List[str]`):
+ modules_to_fuse (`list[str]`):
The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
in the correct order: q, k, v, o layer
current_module_name (`str`):
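The `modules_to_fuse` mapping described above is a plain dictionary of module-name lists. An illustrative shape for such a mapping, with hypothetical names for a LLaMA-like block layout (the exact keys expected by the AWQ fusing code live in the library, not here):

```python
# Hypothetical fusing map, for illustration only.
modules_to_fuse: dict[str, list[str]] = {
    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],        # q, k, v, o order as documented
    "mlp": ["gate_proj", "up_proj", "down_proj"],                 # gate / up / down, length 3
    "layernorm": ["input_layernorm", "post_attention_layernorm"],
}
```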

@@ -398,10 +398,10 @@ def replace_with_bitnet_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `BitLinear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -243,10 +243,10 @@ def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -93,10 +93,10 @@ def replace_with_eetq_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `EetqLinear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).

@@ -251,10 +251,10 @@ def replace_with_fbgemm_fp8_linear(
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
- modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
+ modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `FP8Linear`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
- current_key_name (`List[`str`]`, *optional*):
+ current_key_name (`list[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).
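The four `replace_with_*_linear` helpers touched above share one shape: walk the module tree recursively, track `current_key_name`, and skip anything listed in `modules_to_not_convert`. A generic sketch of that traversal pattern (simplified, not any of the actual quantizer implementations, which also convert the weights):

```python
from typing import Optional

from torch import nn


def replace_linear(
    model: nn.Module,
    new_cls: type,
    modules_to_not_convert: list[str] = ["lm_head"],
    current_key_name: Optional[list[str]] = None,
) -> nn.Module:
    """Recursively swap nn.Linear children for `new_cls`, skipping names in `modules_to_not_convert`."""
    current_key_name = current_key_name if current_key_name is not None else []
    for name, child in model.named_children():
        current_key_name.append(name)
        full_name = ".".join(current_key_name)
        if isinstance(child, nn.Linear) and not any(skip in full_name for skip in modules_to_not_convert):
            # A real quantizer would also copy/convert the existing weights here.
            setattr(model, name, new_cls(child.in_features, child.out_features, bias=child.bias is not None))
        else:
            replace_linear(child, new_cls, modules_to_not_convert, current_key_name)
        current_key_name.pop()
    return model


model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
print(replace_linear(model, nn.Linear))  # both Linear layers re-created via `new_cls`
```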

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import List, Optional, Tuple
+ from typing import Optional
from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging
@@ -45,7 +45,7 @@ def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
tl.store(s_ptr + pid, s)
- def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
+ def act_quant(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
assert x.is_contiguous()
assert x.shape[-1] % block_size == 0
y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
@@ -149,7 +149,7 @@ def w8a8_block_fp8_matmul_triton(
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
- block_size: List[int],
+ block_size: list[int],
output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""This function performs matrix multiplication with block-wise
@@ -231,7 +231,7 @@ def w8a8_block_fp8_matmul_compile(
weight_q: torch.Tensor, # [out_features, hidden_dim]
input_scale: torch.Tensor, # [batch * seq_len, num_input_groups]
weight_scale: torch.Tensor, # [num_weight_blocks_m, num_weight_blocks_n]
- block_size: Optional[Tuple[int, int]] = None, # (M=128, N=128) for weights for example
+ block_size: Optional[tuple[int, int]] = None, # (M=128, N=128) for weights for example
output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
"""
@@ -300,7 +300,7 @@ class FP8Linear(nn.Linear):
out_features: int,
bias: bool = False,
dtype=None,
- block_size: Optional[Tuple[int, int]] = None,
+ block_size: Optional[tuple[int, int]] = None,
device=None,
activation_scheme="dynamic",
):
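`act_quant` above quantizes activations in groups of `block_size` along the last dimension and returns the quantized tensor plus one scale per block, hence the `tuple[torch.Tensor, torch.Tensor]` return. A pure-PyTorch sketch of the same idea, without the Triton kernel (assumes a PyTorch build with `float8_e4m3fn`; the 448 constant is that format's max value):

```python
import torch


def act_quant_reference(x: torch.Tensor, block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
    assert x.shape[-1] % block_size == 0
    blocks = x.reshape(*x.shape[:-1], -1, block_size)           # (..., num_blocks, block_size)
    scales = blocks.abs().amax(dim=-1, keepdim=True) / 448.0    # one scale per block
    y = (blocks / scales).reshape_as(x).to(torch.float8_e4m3fn)
    return y, scales.squeeze(-1)


x = torch.randn(2, 256)
y, s = act_quant_reference(x)
print(y.shape, s.shape)  # torch.Size([2, 256]) torch.Size([2, 2])
```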

@@ -1,4 +1,4 @@
- from typing import Optional, Tuple
+ from typing import Optional
import torch
@@ -22,7 +22,7 @@ def flash_attention_forward(
sliding_window: Optional[int] = None,
softcap: Optional[float] = None,
**kwargs,
- ) -> Tuple[torch.Tensor, None]:
+ ) -> tuple[torch.Tensor, None]:
if kwargs.get("output_attentions", False) or kwargs.get("head_mask", None) is not None:
logger.warning_once(
"`flash_attention_2` does not support `output_attentions=True` or `head_mask`."

@@ -26,7 +26,7 @@ Citation:
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import Optional, Tuple, Union
+ from typing import Optional, Union
import torch
from packaging import version
@@ -106,7 +106,7 @@ def make_flex_block_causal_mask(
attention_chunk_size: Optional[int] = None,
query_length=None,
key_length=None,
- offsets: Optional[Tuple[Offset, Offset]] = None,
+ offsets: Optional[tuple[Offset, Offset]] = None,
is_causal: Optional[bool] = True,
) -> "BlockMask":
"""
@@ -234,7 +234,7 @@ def flex_attention_forward(
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
**kwargs,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
if head_mask is not None:
logger.warning_once(
"`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."

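`make_flex_block_causal_mask` wraps the `BlockMask` construction from PyTorch's FlexAttention API. As a rough idea of what a causal block mask looks like with the upstream API (PyTorch >= 2.5 assumed; this is not the transformers helper itself, and on GPU you would normally leave the default device and pass the mask to `flex_attention`):

```python
from torch.nn.attention.flex_attention import create_block_mask


def causal(batch, head, q_idx, kv_idx):
    return q_idx >= kv_idx  # keep only keys at or before the query position


# Build a 128x128 causal BlockMask on CPU just to inspect it.
block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=128, KV_LEN=128, device="cpu")
print(block_mask)
```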
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import Dict, Union
+ from typing import Union
from ..utils import is_torchdynamo_compiling
@@ -29,7 +29,7 @@ try:
_hub_kernels_available = True
- _KERNEL_MAPPING: Dict[str, Dict[Union[Device, str], LayerRepository]] = {
+ _KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
"MultiScaleDeformableAttention": {
"cuda": LayerRepository(
repo_id="kernels-community/deformable-detr",

@@ -29,7 +29,7 @@ import tempfile
from dataclasses import asdict, fields
from enum import Enum
from pathlib import Path
- from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
import numpy as np
import packaging.version
@@ -1692,7 +1692,7 @@ class NeptuneCallback(TrainerCallback):
raise Exception("The trainer doesn't have a NeptuneCallback configured.")
- def on_log(self, args, state, control, logs: Optional[Dict[str, float]] = None, **kwargs):
+ def on_log(self, args, state, control, logs: Optional[dict[str, float]] = None, **kwargs):
if not state.is_world_process_zero:
return

@@ -16,7 +16,7 @@ import importlib
import inspect
import re
import warnings
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Optional, Union
from packaging import version
@@ -100,11 +100,11 @@ class PeftAdapterMixin:
max_memory: Optional[str] = None,
offload_folder: Optional[str] = None,
offload_index: Optional[int] = None,
- peft_config: Optional[Dict[str, Any]] = None,
+ peft_config: Optional[dict[str, Any]] = None,
- adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None,
+ adapter_state_dict: Optional[dict[str, "torch.Tensor"]] = None,
low_cpu_mem_usage: bool = False,
is_trainable: bool = False,
- adapter_kwargs: Optional[Dict[str, Any]] = None,
+ adapter_kwargs: Optional[dict[str, Any]] = None,
) -> None:
"""
Load adapter weights from file or remote Hub folder. If you are not familiar with adapters and PEFT methods, we
@@ -133,7 +133,7 @@ class PeftAdapterMixin:
Whether to use authentication token to load the remote folder. Useful to load private repositories
that are on HuggingFace Hub. You might need to call `huggingface-cli login` and paste your tokens to
cache it.
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
+ device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
A map that specifies where each submodule should go. It doesn't need to be refined to each
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
@@ -150,10 +150,10 @@ class PeftAdapterMixin:
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
offload_index (`int`, `optional`):
`offload_index` argument to be passed to `accelerate.dispatch_model` method.
- peft_config (`Dict[str, Any]`, *optional*):
+ peft_config (`dict[str, Any]`, *optional*):
The configuration of the adapter to add, supported adapters are non-prefix tuning and adaption prompts
methods. This argument is used in case users directly pass PEFT state dicts
- adapter_state_dict (`Dict[str, torch.Tensor]`, *optional*):
+ adapter_state_dict (`dict[str, torch.Tensor]`, *optional*):
The state dict of the adapter to load. This argument is used in case users directly pass PEFT state
dicts
low_cpu_mem_usage (`bool`, *optional*, defaults to `False`):
@@ -162,7 +162,7 @@ class PeftAdapterMixin:
is_trainable (`bool`, *optional*, defaults to `False`):
Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
used for inference.
- adapter_kwargs (`Dict[str, Any]`, *optional*):
+ adapter_kwargs (`dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and
`find_adapter_config_file` method.
"""
@@ -348,7 +348,7 @@ class PeftAdapterMixin:
self.set_adapter(adapter_name)
- def set_adapter(self, adapter_name: Union[List[str], str]) -> None:
+ def set_adapter(self, adapter_name: Union[list[str], str]) -> None:
"""
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
official documentation: https://huggingface.co/docs/peft
@@ -356,7 +356,7 @@ class PeftAdapterMixin:
Sets a specific adapter by forcing the model to use a that adapter and disable the other adapters.
Args:
- adapter_name (`Union[List[str], str]`):
+ adapter_name (`Union[list[str], str]`):
The name of the adapter to set. Can be also a list of strings to set multiple adapters.
"""
check_peft_version(min_version=MIN_PEFT_VERSION)
@@ -438,7 +438,7 @@ class PeftAdapterMixin:
else:
module.disable_adapters = False
- def active_adapters(self) -> List[str]:
+ def active_adapters(self) -> list[str]:
"""
If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
official documentation: https://huggingface.co/docs/peft
@@ -518,7 +518,7 @@ class PeftAdapterMixin:
accelerate (i.e. with `device_map=xxx`)
Args:
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
+ device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
A map that specifies where each submodule should go. It doesn't need to be refined to each
parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
@@ -562,12 +562,12 @@ class PeftAdapterMixin:
**dispatch_model_kwargs,
)
- def delete_adapter(self, adapter_names: Union[List[str], str]) -> None:
+ def delete_adapter(self, adapter_names: Union[list[str], str]) -> None:
"""
Delete an adapter's LoRA layers from the underlying model.
Args:
- adapter_names (`Union[List[str], str]`):
+ adapter_names (`Union[list[str], str]`):
The name(s) of the adapter(s) to delete.
Example:
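The diff is cut off right after the `Example:` marker above, so purely as an illustrative sketch of the `PeftAdapterMixin` surface touched in this file (checkpoint and adapter ids are placeholders, and `peft` must be installed):

```python
from transformers import AutoModelForCausalLM

# Placeholder repo ids, for illustration only.
model = AutoModelForCausalLM.from_pretrained("base-model-id")
model.load_adapter("user/lora-adapter-id", adapter_name="my_adapter")

model.set_adapter("my_adapter")     # accepts a str or a list[str]
print(model.active_adapters())      # -> list[str], e.g. ['my_adapter']

model.delete_adapter("my_adapter")  # remove the LoRA layers again
```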

Some files were not shown because too many files have changed in this diff.