Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 12:50:06 +06:00)
Post-PR fixes! (#38868)
* Post-PR fixes!
* make fix-copies
This commit is contained in:
parent 508a704055
commit d058f81e5b
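The diff below (partly produced by `make fix-copies`) is mostly a mechanical typing cleanup: the legacy `typing.List`/`Dict`/`Tuple` aliases are replaced by the builtin generics `list`/`dict`/`tuple` (PEP 585) across the LightGlue conversion script, image processor, modeling code, and modular definition, and the now-unused `typing` imports are trimmed. A minimal sketch of the before/after pattern, using an illustrative helper that is not taken from the diff:

```python
from typing import Optional

# After the change, container annotations use the builtin generics directly (PEP 585),
# so `from typing import Dict, List, Tuple` is no longer needed for them.
def normalize_size(size: Optional[dict[str, int]] = None) -> tuple[int, int]:
    """Illustrative helper (not part of the diff): return (height, width) from a size dict."""
    size = size if size is not None else {"height": 480, "width": 640}
    return size["height"], size["width"]


print(normalize_size())  # (480, 640)
```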
@@ -15,7 +15,6 @@ import argparse
 import gc
 import os
 import re
-from typing import List

 import torch
 from datasets import load_dataset
@@ -90,7 +89,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
 }


-def convert_old_keys_to_new_keys(state_dict_keys: List[str]):
+def convert_old_keys_to_new_keys(state_dict_keys: list[str]):
     """
     This function should be applied only once, on the concatenated keys to efficiently rename using
     the key mappings.
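Conversion scripts in transformers commonly implement this helper by joining all checkpoint keys into one newline-separated string and applying each regex substitution once over it, which is why the docstring says it should be applied only once on the concatenated keys. A rough sketch of that pattern, assuming a placeholder mapping (the entries below are illustrative, not the real `ORIGINAL_TO_CONVERTED_KEY_MAPPING`):

```python
import re

# Placeholder mapping from regex pattern to replacement -- illustrative entries only.
KEY_MAPPING = {
    r"^backbone\.": "keypoint_detector.",
    r"self_attn\.merge\.": "self_attn.o_proj.",
}


def convert_old_keys_to_new_keys(state_dict_keys: list[str]) -> dict[str, str]:
    """Rename checkpoint keys in one pass by running every regex over the concatenated key list."""
    old_text = "\n".join(state_dict_keys)
    new_text = old_text
    for pattern, replacement in KEY_MAPPING.items():
        new_text = re.sub(pattern, replacement, new_text, flags=re.MULTILINE)
    return dict(zip(state_dict_keys, new_text.split("\n")))


print(convert_old_keys_to_new_keys(["backbone.conv1.weight"]))
# {'backbone.conv1.weight': 'keypoint_detector.conv1.weight'}
```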
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union

 import numpy as np
 import torch
@@ -139,7 +139,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         do_resize (`bool`, *optional*, defaults to `True`):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
             by `do_resize` in the `preprocess` method.
-        size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
+        size (`dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
             Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
             `True`. Can be overridden by `size` in the `preprocess` method.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
@@ -159,7 +159,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def __init__(
         self,
         do_resize: bool = True,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
         do_rescale: bool = True,
         rescale_factor: float = 1 / 255,
@@ -180,7 +180,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def resize(
         self,
         image: np.ndarray,
-        size: Dict[str, int],
+        size: dict[str, int],
         data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
@@ -191,7 +191,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         Args:
             image (`np.ndarray`):
                 Image to resize.
-            size (`Dict[str, int]`):
+            size (`dict[str, int]`):
                 Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the output image. If not provided, it will be inferred from the input
@@ -220,7 +220,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         self,
         images,
         do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
         resample: PILImageResampling = None,
         do_rescale: Optional[bool] = None,
         rescale_factor: Optional[float] = None,
@@ -240,7 +240,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
                 `do_rescale=False`.
             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                 Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                 Size of the output image after `resize` has been applied. If `size["shortest_edge"]` >= 384, the image
                 is resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the
                 image will be matched to `int(size["shortest_edge"]/ crop_pct)`, after which the image is cropped to
@@ -337,23 +337,23 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def post_process_keypoint_matching(
         self,
         outputs: LightGlueKeypointMatchingOutput,
-        target_sizes: Union[TensorType, List[Tuple]],
+        target_sizes: Union[TensorType, list[tuple]],
         threshold: float = 0.0,
-    ) -> List[Dict[str, torch.Tensor]]:
+    ) -> list[dict[str, torch.Tensor]]:
         """
         Converts the raw output of [`KeypointMatchingOutput`] into lists of keypoints, scores and descriptors
         with coordinates absolute to the original image sizes.
         Args:
             outputs ([`KeypointMatchingOutput`]):
                 Raw outputs of the model.
-            target_sizes (`torch.Tensor` or `List[Tuple[Tuple[int, int]]]`, *optional*):
-                Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`Tuple[int, int]`) containing the
+            target_sizes (`torch.Tensor` or `list[tuple[tuple[int, int]]]`, *optional*):
+                Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`tuple[int, int]`) containing the
                 target size `(height, width)` of each image in the batch. This must be the original image size (before
                 any processing).
             threshold (`float`, *optional*, defaults to 0.0):
                 Threshold to filter out the matches with low scores.
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image
+            `list[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image
             of the pair, the matching scores and the matching indices.
         """
         if outputs.mask.shape[0] != len(target_sizes):
@@ -361,7 +361,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         if not all(len(target_size) == 2 for target_size in target_sizes):
             raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

-        if isinstance(target_sizes, List):
+        if isinstance(target_sizes, list):
             image_pair_sizes = torch.tensor(target_sizes, device=outputs.mask.device)
         else:
             if target_sizes.shape[1] != 2 or target_sizes.shape[2] != 2:
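For orientation, the post-processing method documented above is typically called on the image processor after a forward pass. A short usage sketch; the checkpoint name and threshold here are assumptions for illustration, not taken from the diff:

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, LightGlueForKeypointMatching

# Assumed checkpoint name for illustration; substitute the LightGlue checkpoint you actually use.
checkpoint = "ETH-CVG/lightglue_superpoint"
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = LightGlueForKeypointMatching.from_pretrained(checkpoint)

image0 = Image.open("image_a.jpg")
image1 = Image.open("image_b.jpg")

# The processor consumes pairs of images; a single pair here.
inputs = processor([[image0, image1]], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes must hold the original (height, width) of each image in every pair.
target_sizes = [[(image0.height, image0.width), (image1.height, image1.width)]]
matches = processor.post_process_keypoint_matching(outputs, target_sizes, threshold=0.2)
print(matches[0].keys())  # per-pair keypoints, matching scores and matching indices
```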
@@ -18,7 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import numpy as np
 import torch
@@ -74,8 +74,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
     keypoints: Optional[torch.FloatTensor] = None
     prune: Optional[torch.IntTensor] = None
     mask: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None


 class LightGluePositionalEncoder(nn.Module):
@@ -85,7 +85,7 @@ class LightGluePositionalEncoder(nn.Module):

     def forward(
         self, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         projected_keypoints = self.projector(keypoints)
         embeddings = projected_keypoints.repeat_interleave(2, dim=-1)
         cosines = torch.cos(embeddings)
@@ -200,12 +200,12 @@ class LightGlueAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)

@@ -274,7 +274,7 @@ class LightGlueTransformerLayer(nn.Module):
         attention_mask: torch.Tensor,
         output_hidden_states: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor]], Optional[tuple[torch.Tensor]]]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

@@ -435,7 +435,7 @@ class LightGluePreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)


-def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> Tuple[torch.Tensor, torch.Tensor]:
+def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> tuple[torch.Tensor, torch.Tensor]:
     """obtain matches from a score matrix [Bx M+1 x N+1]"""
     batch_size, _, _ = scores.shape
     # For each keypoint, get the best match
@@ -548,7 +548,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):

     def _keypoint_processing(
         self, descriptors: torch.Tensor, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         descriptors = descriptors.detach().contiguous()
         projected_descriptors = self.input_projection(descriptors)
         keypoint_encoding_output = self.positional_encoder(keypoints, output_hidden_states=output_hidden_states)
@@ -659,7 +659,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         matches: torch.Tensor,
         matching_scores: torch.Tensor,
         num_keypoints: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # (batch_size, num_keypoints) -> (batch_size // 2, 2, num_keypoints) -> 2 * (batch_size // 2, num_keypoints) to
         # have tensors from
         batch_size, _ = indices.shape
@@ -699,7 +699,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         mask: torch.Tensor = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple, Tuple]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, tuple, tuple]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

@@ -875,7 +875,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple, LightGlueKeypointMatchingOutput]:
+    ) -> Union[tuple, LightGlueKeypointMatchingOutput]:
         loss = None
         if labels is not None:
             raise ValueError("LightGlue is not trainable, no labels should be provided.")
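The `get_matches_from_scores` helper touched above is where matches are read out of the `[B x M+1 x N+1]` score matrix (the extra row and column typically act as "no match" bins). As a simplified re-implementation for intuition only, not the library code, mutual best matches above a threshold could be extracted like this:

```python
import torch


def mutual_matches(scores: torch.Tensor, threshold: float) -> tuple[torch.Tensor, torch.Tensor]:
    """Simplified sketch: mutual nearest-neighbour matching on a (batch, M+1, N+1) score matrix."""
    inner = scores[:, :-1, :-1]   # drop the "unmatched" row and column
    best0 = inner.argmax(dim=2)   # preferred keypoint in image 1 for each keypoint of image 0
    best1 = inner.argmax(dim=1)   # preferred keypoint in image 0 for each keypoint of image 1
    idx0 = torch.arange(inner.shape[1], device=scores.device).unsqueeze(0)
    mutual = best1.gather(1, best0) == idx0    # keep only pairs that pick each other
    matching_scores = inner.max(dim=2).values  # confidence of each tentative match
    valid = mutual & (matching_scores > threshold)
    matches = torch.where(valid, best0, torch.full_like(best0, -1))
    return matches, torch.where(valid, matching_scores, torch.zeros_like(matching_scores))


# Tiny smoke test with two keypoints per image.
scores = torch.log_softmax(torch.randn(1, 3, 3), dim=-1)
print(mutual_matches(scores, threshold=-10.0))
```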
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union

 import numpy as np
 import torch
@@ -196,17 +196,17 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
     keypoints: Optional[torch.FloatTensor] = None
     prune: Optional[torch.IntTensor] = None
     mask: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None


 class LightGlueImageProcessor(SuperGlueImageProcessor):
     def post_process_keypoint_matching(
         self,
         outputs: LightGlueKeypointMatchingOutput,
-        target_sizes: Union[TensorType, List[Tuple]],
+        target_sizes: Union[TensorType, list[tuple]],
         threshold: float = 0.0,
-    ) -> List[Dict[str, torch.Tensor]]:
+    ) -> list[dict[str, torch.Tensor]]:
         return super().post_process_keypoint_matching(outputs, target_sizes, threshold)

     def plot_keypoint_matching(self, images: ImageInput, keypoint_matching_output: LightGlueKeypointMatchingOutput):
@@ -263,7 +263,7 @@ class LightGluePositionalEncoder(nn.Module):

     def forward(
         self, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         projected_keypoints = self.projector(keypoints)
         embeddings = projected_keypoints.repeat_interleave(2, dim=-1)
         cosines = torch.cos(embeddings)
@@ -277,12 +277,12 @@ class LightGlueAttention(LlamaAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)

@@ -348,7 +348,7 @@ class LightGlueTransformerLayer(nn.Module):
         attention_mask: torch.Tensor,
         output_hidden_states: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor]], Optional[tuple[torch.Tensor]]]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

@@ -509,7 +509,7 @@ class LightGluePreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)


-def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> Tuple[torch.Tensor, torch.Tensor]:
+def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> tuple[torch.Tensor, torch.Tensor]:
     """obtain matches from a score matrix [Bx M+1 x N+1]"""
     batch_size, _, _ = scores.shape
     # For each keypoint, get the best match
@@ -622,7 +622,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):

     def _keypoint_processing(
         self, descriptors: torch.Tensor, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         descriptors = descriptors.detach().contiguous()
         projected_descriptors = self.input_projection(descriptors)
         keypoint_encoding_output = self.positional_encoder(keypoints, output_hidden_states=output_hidden_states)
@@ -733,7 +733,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         matches: torch.Tensor,
         matching_scores: torch.Tensor,
         num_keypoints: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # (batch_size, num_keypoints) -> (batch_size // 2, 2, num_keypoints) -> 2 * (batch_size // 2, num_keypoints) to
         # have tensors from
         batch_size, _ = indices.shape
@@ -773,7 +773,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         mask: torch.Tensor = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple, Tuple]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, tuple, tuple]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

@@ -949,7 +949,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple, LightGlueKeypointMatchingOutput]:
+    ) -> Union[tuple, LightGlueKeypointMatchingOutput]:
         loss = None
         if labels is not None:
             raise ValueError("LightGlue is not trainable, no labels should be provided.")
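The last file in the diff is the modular definition for LightGlue, which is why the same annotation changes appear a second time: the standalone modeling and image-processing files are generated from it, and rerunning the code generation (presumably what the `make fix-copies` in the commit message refreshed) propagates the edits. The pattern it relies on is plain subclass delegation, roughly like this sketch with illustrative stand-in classes (not the transformers classes):

```python
class SuperGlueStyleProcessor:
    def post_process_keypoint_matching(self, outputs, target_sizes, threshold=0.0):
        # The parent class owns the real conversion from raw outputs to per-pair match dicts.
        return [{"target_sizes": target_sizes, "threshold": threshold}]


class LightGlueStyleProcessor(SuperGlueStyleProcessor):
    # The modular subclass only restates the method to adjust its signature and docs,
    # then hands the actual work back to the parent.
    def post_process_keypoint_matching(self, outputs, target_sizes, threshold=0.0):
        return super().post_process_keypoint_matching(outputs, target_sizes, threshold)


print(LightGlueStyleProcessor().post_process_keypoint_matching(None, [(480, 640)]))
```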