diff --git a/src/transformers/models/lightglue/convert_lightglue_to_hf.py b/src/transformers/models/lightglue/convert_lightglue_to_hf.py
index c1cb2ce5870..feb7c790113 100644
--- a/src/transformers/models/lightglue/convert_lightglue_to_hf.py
+++ b/src/transformers/models/lightglue/convert_lightglue_to_hf.py
@@ -15,7 +15,6 @@ import argparse
 import gc
 import os
 import re
-from typing import List
 
 import torch
 from datasets import load_dataset
@@ -90,7 +89,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
 }
 
 
-def convert_old_keys_to_new_keys(state_dict_keys: List[str]):
+def convert_old_keys_to_new_keys(state_dict_keys: list[str]):
     """
     This function should be applied only once, on the concatenated keys to efficiently rename using
     the key mappings.
diff --git a/src/transformers/models/lightglue/image_processing_lightglue.py b/src/transformers/models/lightglue/image_processing_lightglue.py
index fea0b32df33..ca9189210ba 100644
--- a/src/transformers/models/lightglue/image_processing_lightglue.py
+++ b/src/transformers/models/lightglue/image_processing_lightglue.py
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -139,7 +139,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         do_resize (`bool`, *optional*, defaults to `True`):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
             overridden by `do_resize` in the `preprocess` method.
-        size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
+        size (`dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
             Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
             `True`. Can be overridden by `size` in the `preprocess` method.
         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
@@ -159,7 +159,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def __init__(
         self,
         do_resize: bool = True,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
         do_rescale: bool = True,
         rescale_factor: float = 1 / 255,
@@ -180,7 +180,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def resize(
         self,
         image: np.ndarray,
-        size: Dict[str, int],
+        size: dict[str, int],
         data_format: Optional[Union[str, ChannelDimension]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
@@ -191,7 +191,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         Args:
             image (`np.ndarray`):
                 Image to resize.
-            size (`Dict[str, int]`):
+            size (`dict[str, int]`):
                 Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the output image. If not provided, it will be inferred from the input
@@ -220,7 +220,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         self,
         images,
         do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
+        size: Optional[dict[str, int]] = None,
         resample: PILImageResampling = None,
         do_rescale: Optional[bool] = None,
         rescale_factor: Optional[float] = None,
@@ -240,7 +240,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
                 `do_rescale=False`.
             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                 Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                 Size of the output image after `resize` has been applied. If `size["shortest_edge"]` >= 384, the image
                 is resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the
                 image will be matched to `int(size["shortest_edge"]/ crop_pct)`, after which the image is cropped to
@@ -337,23 +337,23 @@ class LightGlueImageProcessor(BaseImageProcessor):
     def post_process_keypoint_matching(
         self,
         outputs: LightGlueKeypointMatchingOutput,
-        target_sizes: Union[TensorType, List[Tuple]],
+        target_sizes: Union[TensorType, list[tuple]],
         threshold: float = 0.0,
-    ) -> List[Dict[str, torch.Tensor]]:
+    ) -> list[dict[str, torch.Tensor]]:
         """
         Converts the raw output of [`KeypointMatchingOutput`] into lists of keypoints, scores and descriptors
         with coordinates absolute to the original image sizes.
         Args:
             outputs ([`KeypointMatchingOutput`]):
                 Raw outputs of the model.
-            target_sizes (`torch.Tensor` or `List[Tuple[Tuple[int, int]]]`, *optional*):
-                Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`Tuple[int, int]`) containing the
+            target_sizes (`torch.Tensor` or `list[tuple[tuple[int, int]]]`, *optional*):
+                Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`tuple[int, int]`) containing the
                 target size `(height, width)` of each image in the batch. This must be the original image size (before
                 any processing).
             threshold (`float`, *optional*, defaults to 0.0):
                 Threshold to filter out the matches with low scores.
         Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image
+            `list[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image
             of the pair, the matching scores and the matching indices.
         """
         if outputs.mask.shape[0] != len(target_sizes):
@@ -361,7 +361,7 @@ class LightGlueImageProcessor(BaseImageProcessor):
         if not all(len(target_size) == 2 for target_size in target_sizes):
             raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
-        if isinstance(target_sizes, List):
+        if isinstance(target_sizes, list):
             image_pair_sizes = torch.tensor(target_sizes, device=outputs.mask.device)
         else:
             if target_sizes.shape[1] != 2 or target_sizes.shape[2] != 2:
diff --git a/src/transformers/models/lightglue/modeling_lightglue.py b/src/transformers/models/lightglue/modeling_lightglue.py
index 2cd8b0732f5..4df4888621e 100644
--- a/src/transformers/models/lightglue/modeling_lightglue.py
+++ b/src/transformers/models/lightglue/modeling_lightglue.py
@@ -18,7 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 
 import numpy as np
 import torch
@@ -74,8 +74,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
     keypoints: Optional[torch.FloatTensor] = None
     prune: Optional[torch.IntTensor] = None
     mask: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
 
 
 class LightGluePositionalEncoder(nn.Module):
@@ -85,7 +85,7 @@ class LightGluePositionalEncoder(nn.Module):
 
     def forward(
         self, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         projected_keypoints = self.projector(keypoints)
         embeddings = projected_keypoints.repeat_interleave(2, dim=-1)
         cosines = torch.cos(embeddings)
@@ -200,12 +200,12 @@ class LightGlueAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
 
@@ -274,7 +274,7 @@ class LightGlueTransformerLayer(nn.Module):
         attention_mask: torch.Tensor,
         output_hidden_states: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor]], Optional[tuple[torch.Tensor]]]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
@@ -435,7 +435,7 @@ class LightGluePreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
-def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> Tuple[torch.Tensor, torch.Tensor]:
+def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> tuple[torch.Tensor, torch.Tensor]:
     """obtain matches from a score matrix [Bx M+1 x N+1]"""
     batch_size, _, _ = scores.shape
     # For each keypoint, get the best match
@@ -548,7 +548,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
 
     def _keypoint_processing(
         self, descriptors: torch.Tensor, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         descriptors = descriptors.detach().contiguous()
         projected_descriptors = self.input_projection(descriptors)
         keypoint_encoding_output = self.positional_encoder(keypoints, output_hidden_states=output_hidden_states)
@@ -659,7 +659,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         matches: torch.Tensor,
         matching_scores: torch.Tensor,
         num_keypoints: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # (batch_size, num_keypoints) -> (batch_size // 2, 2, num_keypoints) -> 2 * (batch_size // 2, num_keypoints) to
         # have tensors from
         batch_size, _ = indices.shape
@@ -699,7 +699,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         mask: torch.Tensor = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple, Tuple]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, tuple, tuple]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
@@ -875,7 +875,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple, LightGlueKeypointMatchingOutput]:
+    ) -> Union[tuple, LightGlueKeypointMatchingOutput]:
         loss = None
         if labels is not None:
             raise ValueError("LightGlue is not trainable, no labels should be provided.")
diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py
index 482c230fb82..96a389194b4 100644
--- a/src/transformers/models/lightglue/modular_lightglue.py
+++ b/src/transformers/models/lightglue/modular_lightglue.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Optional, Union
 
 import numpy as np
 import torch
@@ -196,17 +196,17 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
     keypoints: Optional[torch.FloatTensor] = None
     prune: Optional[torch.IntTensor] = None
     mask: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
 
 
 class LightGlueImageProcessor(SuperGlueImageProcessor):
     def post_process_keypoint_matching(
         self,
         outputs: LightGlueKeypointMatchingOutput,
-        target_sizes: Union[TensorType, List[Tuple]],
+        target_sizes: Union[TensorType, list[tuple]],
         threshold: float = 0.0,
-    ) -> List[Dict[str, torch.Tensor]]:
+    ) -> list[dict[str, torch.Tensor]]:
         return super().post_process_keypoint_matching(outputs, target_sizes, threshold)
 
     def plot_keypoint_matching(self, images: ImageInput, keypoint_matching_output: LightGlueKeypointMatchingOutput):
@@ -263,7 +263,7 @@ class LightGluePositionalEncoder(nn.Module):
 
     def forward(
         self, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         projected_keypoints = self.projector(keypoints)
         embeddings = projected_keypoints.repeat_interleave(2, dim=-1)
         cosines = torch.cos(embeddings)
@@ -277,12 +277,12 @@ class LightGlueAttention(LlamaAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
 
@@ -348,7 +348,7 @@ class LightGlueTransformerLayer(nn.Module):
         attention_mask: torch.Tensor,
         output_hidden_states: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor]], Optional[tuple[torch.Tensor]]]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
@@ -509,7 +509,7 @@ class LightGluePreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
-def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> Tuple[torch.Tensor, torch.Tensor]:
+def get_matches_from_scores(scores: torch.Tensor, threshold: float) -> tuple[torch.Tensor, torch.Tensor]:
     """obtain matches from a score matrix [Bx M+1 x N+1]"""
     batch_size, _, _ = scores.shape
     # For each keypoint, get the best match
@@ -622,7 +622,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
 
     def _keypoint_processing(
         self, descriptors: torch.Tensor, keypoints: torch.Tensor, output_hidden_states: Optional[bool] = False
-    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         descriptors = descriptors.detach().contiguous()
         projected_descriptors = self.input_projection(descriptors)
         keypoint_encoding_output = self.positional_encoder(keypoints, output_hidden_states=output_hidden_states)
@@ -733,7 +733,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         matches: torch.Tensor,
         matching_scores: torch.Tensor,
         num_keypoints: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # (batch_size, num_keypoints) -> (batch_size // 2, 2, num_keypoints) -> 2 * (batch_size // 2, num_keypoints) to
         # have tensors from
         batch_size, _ = indices.shape
@@ -773,7 +773,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         mask: torch.Tensor = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple, Tuple]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, tuple, tuple]:
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
@@ -949,7 +949,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple, LightGlueKeypointMatchingOutput]:
+    ) -> Union[tuple, LightGlueKeypointMatchingOutput]:
         loss = None
         if labels is not None:
             raise ValueError("LightGlue is not trainable, no labels should be provided.")
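Note (not part of the patch): apart from the annotation swaps, the one line whose runtime behaviour is touched is the `isinstance(target_sizes, List)` check in `post_process_keypoint_matching`, which now tests against the builtin `list`. Below is a minimal sketch of the PEP 585 style this diff standardises on, assuming Python 3.9+; the helper name is illustrative and not part of the LightGlue API.

```python
# Sketch only: PEP 585 builtin generics (Python 3.9+), as used throughout this diff.
# `normalize_target_sizes` is a hypothetical helper, not a transformers function.
from typing import Union

import torch


def normalize_target_sizes(target_sizes: Union[torch.Tensor, list[tuple[int, int]]]) -> torch.Tensor:
    # The builtins `list`, `dict` and `tuple` are subscriptable in annotations on 3.9+,
    # so the `typing.List`/`Dict`/`Tuple` imports removed above are no longer needed.
    if isinstance(target_sizes, list):  # same check as the patched line, against the builtin
        return torch.tensor(target_sizes)
    return target_sizes


print(normalize_target_sizes([(480, 640), (480, 640)]).shape)  # torch.Size([2, 2])
```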
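For reviewers who want to exercise the re-annotated `post_process_keypoint_matching` end to end, a hedged usage sketch follows; the checkpoint id and image paths are illustrative placeholders, not taken from this diff.

```python
# Hedged usage sketch for the signature changed above; checkpoint id and image
# paths are assumptions, not part of this patch.
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoModel

processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")  # assumed checkpoint id
model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")

images = [Image.open("image_0.jpg"), Image.open("image_1.jpg")]  # one image pair (placeholder files)
inputs = processor(images, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# `target_sizes` is a list[tuple[tuple[int, int]]]: original (height, width) of each image in each pair,
# matching the docstring above (sizes before any processing).
target_sizes = [[(image.height, image.width) for image in images]]
results = processor.post_process_keypoint_matching(outputs, target_sizes, threshold=0.2)
print(sorted(results[0].keys()))
```

The `target_sizes` structure mirrors the docstring: one `(height, width)` tuple per image of each pair, so the post-processing can map keypoints back to absolute coordinates.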