Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-19 20:48:22 +06:00)

Fix typos in strings and comments (#37799)

parent f466603963
commit d5fa7d2d19
@@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
 tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
 fft_window_size (`int`, *optional*):
 Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
-spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
+spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
 frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
-`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
+`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.

 Example:

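The `(1+fft_window_size)//2` relationship in the docstring above is easy to sanity-check; a minimal sketch (illustrative only, not part of the patch):

    # Number of frequency bins implied by an FFT window size, per the docstring above.
    fft_window_size = 400
    nb_frequency_bins = (1 + fft_window_size) // 2
    print(nb_frequency_bins)  # 200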
@@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
 beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
 ids_collect.append(beam_id)

-# due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
+# due to overly complex constraints or other factors, sometimes we can't guarantee a successful
 # generation. In these cases we simply return the highest scoring outputs.
 if len(ids_collect) < self.num_beam_hyps_to_keep:
 for beam_id in range(self.num_beams):
@@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin):
 our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
 cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
 Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
-it will be converted to its repsective `CacheConfig` internally.
+it will be converted to its respective `CacheConfig` internally.
 Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
 return_legacy_cache (`bool`, *optional*, default to `True`):
 Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
@@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin):
 The parameter for repetition penalty. 1.0 means no penalty. See [this
 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
 encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
-The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
+The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
 original input. 1.0 means no penalty.
 length_penalty (`float`, *optional*, defaults to 1.0):
 Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
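As context for the penalty parameters touched in the hunk above, a minimal sketch (illustrative values, not part of the patch) of how they are set on a `GenerationConfig`:

    from transformers import GenerationConfig

    generation_config = GenerationConfig(
        repetition_penalty=1.2,          # >1.0 discourages repeating previously generated tokens
        encoder_repetition_penalty=1.0,  # 1.0 means no penalty on tokens absent from the input
        length_penalty=1.0,              # exponent on sequence length during beam-based generation
    )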
@@ -385,7 +385,7 @@ class GenerationConfig(PushToHubMixin):
 inference.
 disable_compile (`bool`, *optional*):
 Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
-specific criteria are met, including using a compileable cache. Please open an issue if you find the
+specific criteria are met, including using a compilable cache. Please open an issue if you find the
 need to use this flag.

 > Wild card
@@ -710,7 +710,7 @@ class GenerationConfig(PushToHubMixin):
 UserWarning,
 )

-# 3. detect incorrect paramaterization specific to advanced beam modes
+# 3. detect incorrect parameterization specific to advanced beam modes
 else:
 # constrained beam search
 if self.constraints is not None or self.force_words_ids is not None:
@@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):

 class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
 r"""
-[`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
+[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
 `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
 beginning of the generation.

@@ -543,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor):
 class MinPLogitsWarper(LogitsProcessor):
 """
 [`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
-probability of the most likely token. As a result, the filter becomes more agressive in the presence of
+probability of the most likely token. As a result, the filter becomes more aggressive in the presence of
 high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.

 Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
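The min-p behaviour described above is exposed through `generate`; a minimal sketch (the model choice and the `min_p` value are arbitrary, not part of the patch):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")

    # Keep only tokens whose probability is at least min_p * p(most likely token).
    outputs = model.generate(**inputs, do_sample=True, min_p=0.1)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])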
@@ -738,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor):

 >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
 >>> # Top P sampling, which restricts tokens based on their cumulative probability.
->>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
+>>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4
 >>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -819,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor):

 >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
 >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
->>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
+>>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 to 4e-3
 >>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@@ -1348,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
 Alice and Bob are friends

->>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
+>>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
 >>> # For instance, we can force an entire entity to be generated when its beginning is detected.
 >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens
 >>> def prefix_allowed_tokens_fn(batch_id, input_ids):
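The doctest above is cut off at the definition of `prefix_allowed_tokens_fn`; the following is only a sketch of what such a callback typically looks like (an assumption for illustration, not the verbatim docstring), forcing the " Bob Marley" entity to be completed once its first token appears:

    def prefix_allowed_tokens_fn(batch_id, input_ids):
        # If the tail of the generated ids matches a prefix of `entity`, force the next entity token.
        for i in range(1, len(entity)):
            if input_ids[-i:].tolist() == entity[:i].tolist():
                return [entity[i].item()]
        return list(range(tokenizer.vocab_size))  # otherwise, allow any token

    outputs = model.generate(**inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)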
@@ -1791,7 +1791,7 @@ class LogitNormalization(LogitsProcessor):

 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
 r"""
-[`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
+[`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
 generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
 not generated at the beginning. Originally created for
 [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
@@ -2642,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 We assume that the scores are in the log space.
 Args:
 scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
-g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
+g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).

 Returns:
 Updated scores (batch_size, vocab_size).
@@ -2668,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
 if self.debug_mode:
 scores = torch.ones_like(scores)

-# Currently indices is just a arange to compute watermarking on the desnse logits.
+# Currently indices is just a arange to compute watermarking on the dense logits.
 all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])

 if self.state is None:
@@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
 )

 def _match_found():
-# Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
+# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
 # an answer (otherwise we get indexing exceptions)
 compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
 return tf.cond(
@@ -962,7 +962,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. broadcast inputs to the desired number of beams
@@ -994,7 +994,7 @@ class TFGenerationMixin:
 raise ValueError(
 "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
 f" num_return_sequences, got {generation_config.num_beams} and"
-f" {generation_config.num_return_sequences} (respectivelly)"
+f" {generation_config.num_return_sequences} (respectively)"
 )

 # 11. prepare logits warper
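The `ValueError` corrected in the two hunks above enforces that beam search can only return as many sequences as it has beams; a minimal sketch of the constraint (written against the PyTorch `generate` API for brevity, with an arbitrary model and values, not part of the patch):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")

    # Fine: num_return_sequences <= num_beams.
    outputs = model.generate(**inputs, num_beams=4, num_return_sequences=4)
    # num_return_sequences=8 with num_beams=4 would raise an error like the one quoted above.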
@@ -1626,7 +1626,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -1910,7 +1910,7 @@ class TFGenerationMixin:
 )
 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2253,7 +2253,7 @@ class TFGenerationMixin:

 use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -2788,7 +2788,7 @@ class TFGenerationMixin:
 model_kwargs.pop("use_cache", None)

 use_xla = not tf.executing_eagerly()
-# TODO (Joao): fix cache format or find programatic way to detect cache index
+# TODO (Joao): fix cache format or find programmatic way to detect cache index
 # GPT2 and other models has a slightly different cache structure, with a different batch axis
 model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
 cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@@ -362,7 +362,7 @@ class GenerationMixin:
 inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
 - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
 However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
-`BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
+`BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

 The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
 - *greedy decoding* if `num_beams=1` and `do_sample=False`
@@ -392,7 +392,7 @@ class GenerationMixin:
 - Exception 1: when passing input_embeds, input_ids may be missing entries
 - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
 - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-- Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
+- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
 generate the first token for each sequence. Later use the generated Input ids for continuation.

 The current implementation does not rely on ``self`` and could be
@@ -967,7 +967,7 @@ class GenerationMixin:
 assistant_model=assistant_model,
 assistant_prune_lm_head=True, # prune LM head of assistant model
 )
-# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index
+# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index
 assistant_model.generation_config.repetition_penalty = None
 candidate_generator = UniversalSpeculativeDecodingGenerator(
 input_ids=input_ids,
@@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 ```
 """

-# get ALL model parameters and thier names
+# get ALL model parameters and their names
 all_named_parameters = dict(model.named_parameters(remove_duplicate=False))

 # get ONLY unique named parameters,
@@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
 for tied_param_name in tied_param_names:
 tied_param = all_named_parameters[tied_param_name]
 for param_name, param in no_duplicate_named_parameters.items():
-# compare if parameters are the same, if so, group thier names together
+# compare if parameters are the same, if so, group their names together
 if param is tied_param:
 if param_name not in tied_param_groups:
 tied_param_groups[param_name] = []
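For context on what the two `find_tied_parameters` hunks above operate on, a minimal sketch of a model with tied weights (the exact import path and return format of the helper are assumptions, not part of the patch):

    import torch.nn as nn

    class TiedModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.embed = nn.Linear(16, 16, bias=False)
            self.head = nn.Linear(16, 16, bias=False)
            self.head.weight = self.embed.weight  # the two modules now share one parameter

    model = TiedModel()
    # find_tied_parameters(model) would group the names of the shared tensor,
    # e.g. something like [["embed.weight", "head.weight"]].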
@@ -329,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
 This util function is designed to test exported models by simulating the generation process.
 It processes the input prompt tokens sequentially (no parallel prefill).
 This generate function is not intended to replace the original `generate` method, and the support
-for leveraging the original `generate` is potentially planed!
+for leveraging the original `generate` is potentially planned!

 Args:
 exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
@@ -28,7 +28,7 @@ def autoname_modules(model):
 module.name = name


-# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
+# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
 def name_to_linear_tag(name):
 return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

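The `name_to_linear_tag` helper shown in the hunk above can be exercised on its own; a minimal sketch (the sample module name is simply the one from the comment above):

    def name_to_linear_tag(name):
        # Drop "model", "layers" and purely numeric path components, keep the rest.
        return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

    print(name_to_linear_tag("model.layers.31.self_attn.k_proj"))  # -> self_attn.k_proj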
@@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
 """
 Prepares nn.Linear layers for HQQ quantization.
 Since each layer type can have separate quantization parameters, we need to do the following:
-1- tag each module with its neme via autoname_modules()
+1- tag each module with its name via autoname_modules()
 2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
-3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params
+3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
 """

 modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert
@@ -160,7 +160,7 @@ def distribute_module(
 output_fn=None,
 ) -> nn.Module:
 """
-Copy pasted from torch's function but we remove the communications (partitionning)
+Copy pasted from torch's function but we remove the communications (partitioning)
 as well as buffer registering that is similarly not efficient.
 """
 if len(module._forward_pre_hooks) == 0:
@@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer):

 @staticmethod
 def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-# this op cannot be asynch, otherwise it completely breaks the outputs of models
+# this op cannot be async, otherwise it completely breaks the outputs of models
 torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False)
 return outputs

@@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module):

 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 class_cost = -out_prob[:, target_ids]

 # Compute the L1 cost between boxes
@@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module):
 target_bbox = torch.cat([v["boxes"] for v in targets])
 # Compute the classification cost. Contrary to the loss, we don't use the NLL,
 # but approximate it in 1 - proba[target class].
-# The 1 is a constant that doesn't change the matching, it can be ommitted.
+# The 1 is a constant that doesn't change the matching, it can be omitted.
 if self.use_focal_loss:
 out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
 out_prob = out_prob[:, target_ids]
@@ -593,7 +593,7 @@ class AlignVisionBlock(nn.Module):

 class AlignVisionEncoder(nn.Module):
 r"""
-Forward propogates the embeddings through each vision encoder (EfficientNet) block.
+Forward propagates the embeddings through each vision encoder (EfficientNet) block.

 Args:
 config ([`AlignVisionConfig`]):
@@ -36,7 +36,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False):
 class AlignProcessor(ProcessorMixin):
 r"""
 Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
-[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and
+[`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
 tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
 information.
 The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
@@ -1936,7 +1936,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
 params = None
 if future_values is not None:
 # outputs.last_hidden_state and trend
-# loc is 4rd last and scale is 3rd last output
+# loc is 4th last and scale is 3rd last output
 params = self.output_params(outputs[0] + outputs[1])
 distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2])

@@ -164,7 +164,7 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt
 new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path)
 print(new_model.eval())

-print("Model conversion was done sucessfully!")
+print("Model conversion was done successfully!")


 if __name__ == "__main__":
@@ -235,7 +235,7 @@ class Blip2Config(PretrainedConfig):
 num_query_tokens (`int`, *optional*, defaults to 32):
 The number of query tokens passed through the Transformer.
 image_text_hidden_size (`int`, *optional*, defaults to 256):
-Dimentionality of the hidden state of the image-text fusion layer.
+Dimensionality of the hidden state of the image-text fusion layer.

 image_token_index (`int`, *optional*):
 Token index of special image token.
@@ -899,7 +899,7 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
 use_cache=True,
 **kwargs,
 ):
-# Overwriten because of the fixed-shape attention mask creation
+# Overwritten because of the fixed-shape attention mask creation

 # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
 # Exception 1: when passing input_embeds, input_ids may be missing entries
@@ -49,17 +49,17 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
 to warn users if the audio fed to the feature extractor does not have the same sampling rate.
 hop_length (`int`,*optional*, defaults to 480):
-Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
+Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
 in smaller `frames` with a step of `hop_length` between each frame.
 max_length_s (`int`, *optional*, defaults to 10):
 The maximum input length of the model in seconds. This is used to pad the audio.
 fft_window_size (`int`, *optional*, defaults to 1024):
 Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
-resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples.
+resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples.
 padding_value (`float`, *optional*, defaults to 0.0):
 Padding value used to pad the audio. Should correspond to silences.
 return_attention_mask (`bool`, *optional*, defaults to `False`):
-Whether or not the model should return the attention masks coresponding to the input.
+Whether or not the model should return the attention masks corresponding to the input.
 frequency_min (`float`, *optional*, defaults to 0):
 The lowest frequency of interest. The STFT will not be computed for values below this.
 frequency_max (`float`, *optional*, defaults to 14000):
@@ -141,7 +141,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
 Serializes this instance to a Python dictionary.

 Returns:
-`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the
+`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
 mel filter banks, which do not need to be saved or printed as they are too long.
 """
 output = copy.deepcopy(self.__dict__)
@@ -1067,7 +1067,7 @@ CLAP_TEXT_INPUTS_DOCSTRING = r"""
 CLAP_AUDIO_INPUTS_DOCSTRING = r"""
 Args:
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
 Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1105,7 +1105,7 @@ CLAP_INPUTS_DOCSTRING = r"""

 [What are position IDs?](../glossary#position-ids)
 input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
-Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also
+Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
 retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
 return_loss (`bool`, *optional*):
 Whether or not to return the contrastive loss.
@@ -127,7 +127,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
 if kwargs.pop("add_bos_token", False):
 model_id = kwargs.pop("name_or_path", "")
 raise ValueError(
-"Currenty GPT2's fast tokenizer does NOT support adding a BOS token. "
+"Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
 "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
 f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
 f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
@@ -277,7 +277,7 @@ def final():

 def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
 """
-Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint
+Function to convert the microsoft cvt checkpoint to huggingface checkpoint
 """
 img_labels_file = "imagenet-1k-id2label.json"
 num_labels = 1000
@@ -58,7 +58,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
 # activation function weight
 r"transformer\.encoder\.layers\.(\d+)\.activation\.weight": r"encoder.layers.\1.activation_fn.weight",
 #########################################################################################################################################
-# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activiation function weight
+# decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activation function weight
 r"transformer\.decoder\.layers\.(\d+)\.self_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.self_attn.self_attn.output_proj.\2",
 r"transformer\.decoder\.layers\.(\d+)\.cross_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.cross_attn.cross_attn.output_proj.\2",
 # FFNs
@@ -144,7 +144,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p
 config.label2id = {v: k for k, v in id2label.items()}
 # load original model from local path
 loaded = torch.load(pretrained_model_weights_path, map_location=torch.device("cpu"), weights_only=True)["model"]
-# Renaming the original model state dictionary to HF compatibile
+# Renaming the original model state dictionary to HF compatible
 all_keys = list(loaded.keys())
 new_keys = convert_old_keys_to_new_keys(all_keys)
 state_dict = {}
@@ -1297,7 +1297,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
 ):
 """
 Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
-set fro each configuration.
+set for each configuration.

 Args:
 config (`JukeboxPriorConfig`):
@@ -142,7 +142,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch
 return patches

 if n_patches_per_batch < 4:
-# for each batch, atleast 4 small patches are required to
+# for each batch, at least 4 small patches are required to
 # recreate a large square patch from merging them and later padding is applied
 # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
 # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
@@ -118,7 +118,7 @@ class Embeddings(nn.Module):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :seq_length]
 else:
@@ -72,7 +72,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `True`):
 Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a
-random amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 do_rescale (`bool`, *optional*, defaults to `True`):
 Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@@ -349,7 +349,7 @@ class DonutImageProcessor(BaseImageProcessor):
 Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
 do_pad (`bool`, *optional*, defaults to `self.do_pad`):
 Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random
-amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are
+amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are
 padded to the largest image size in the batch.
 random_padding (`bool`, *optional*, defaults to `self.random_padding`):
 Whether to use random padding when padding the image. If `True`, each image in the batch with be padded
@@ -142,7 +142,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig):
 speaker id embedding layer.
 num_languages (`int`, *optional*):
 Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the
-languge id embedding layer.
+language id embedding layer.
 speaker_embed_dim (`int`, *optional*):
 Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input.
 is_encoder_decoder (`bool`, *optional*, defaults to `True`):
@@ -391,7 +391,7 @@ class FastSpeech2ConformerVariancePredictor(nn.Module):
 dropout_rate=0.5,
 ):
 """
-Initilize variance predictor module.
+Initialize variance predictor module.

 Args:
 input_dim (`int`): Input dimension.
@@ -948,7 +948,7 @@ class FlaubertModel(FlaubertPreTrainedModel):

 # Setting the position-ids to the registered buffer in constructor, it helps
 # when tracing the model without passing position-ids, solves
-# isues similar to issue #5664
+# issues similar to issue #5664
 if position_ids is None:
 if hasattr(self, "position_ids"):
 position_ids = self.position_ids[:, :slen]
@@ -360,7 +360,7 @@ class FocalNetModulation(nn.Module):
 x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
 q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

-# context aggreation
+# context aggregation
 ctx_all = 0
 for level in range(self.focal_level):
 ctx = self.focal_layers[level](ctx)
@@ -379,7 +379,7 @@ class FocalNetModulation(nn.Module):
 if self.use_post_layernorm_in_modulation:
 x_out = self.layernorm(x_out)

-# post linear porjection
+# post linear projection
 x_out = self.projection_out(x_out)
 x_out = self.projection_dropout(x_out)
 return x_out
@@ -415,7 +415,7 @@ class FocalNetLayer(nn.Module):
 dim (`int`):
 Number of input channels.
 input_resolution (`Tuple[int]`):
-Input resulotion.
+Input resolution.
 drop_path (`float`, *optional*, defaults to 0.0):
 Stochastic depth rate.
 """
@@ -244,7 +244,7 @@ def _tokenize_prompts_with_image_and_batch(
 - pad all the sequences to this length so we can convert them into a 3D tensor.
 """

-# If not tool use, tranform the coordinates while tokenizing
+# If not tool use, transform the coordinates while tokenizing
 if scale_factors is not None:
 transformed_prompt_tokens = []
 for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
@@ -96,7 +96,7 @@ class Gemma3TextConfig(PretrainedConfig):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@@ -140,7 +140,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
 ):
 """
 Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`np.ndarray`):
@@ -108,7 +108,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
 ):
 """
 Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
-minumum allowed ratio.
+minimum allowed ratio.

 Args:
 image (`torch.Tensor`):
@@ -1270,7 +1270,7 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):

 is_training = token_type_ids is not None and labels is not None

-# Replace image id woth PAD if the image token if OOV, to avoid index-errors
+# Replace image id with PAD if the image token if OOV, to avoid index-errors
 if input_ids is not None and self.config.image_token_id >= self.vocab_size:
 special_image_mask = input_ids == self.config.image_token_id
 llm_input_ids = input_ids.clone()
@@ -128,7 +128,7 @@ class Gemma3TextConfig(Gemma2Config):
 Scaling factor when applying tanh softcapping on the attention scores.
 cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
 rope_scaling (`Dict`, *optional*):
-Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
+Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
 and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
 accordingly.
 Expected contents:
@ -926,7 +926,7 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
|
|||||||
|
|
||||||
is_training = token_type_ids is not None and labels is not None
|
is_training = token_type_ids is not None and labels is not None
|
||||||
|
|
||||||
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
|
# Replace image id with PAD if the image token if OOV, to avoid index-errors
|
||||||
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
llm_input_ids = input_ids.clone()
|
llm_input_ids = input_ids.clone()
|
||||||
|
@ -1495,7 +1495,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
|
|||||||
>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
|
>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
|
||||||
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
|
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
|
||||||
|
|
||||||
>>> # set seed for reproducability
|
>>> # set seed for reproducibility
|
||||||
>>> np.random.seed(45)
|
>>> np.random.seed(45)
|
||||||
|
|
||||||
|
|
||||||
|
@ -199,7 +199,7 @@ class GPTNeoXConfig(PretrainedConfig):
|
|||||||
|
|
||||||
if self.hidden_size % self.num_attention_heads != 0:
|
if self.hidden_size % self.num_attention_heads != 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The hidden size is not divisble by the number of attention heads! Make sure to update them!"
|
"The hidden size is not divisible by the number of attention heads! Make sure to update them!"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
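The check above exists because each attention head operates on `hidden_size // num_attention_heads` channels, so the division must be exact; a tiny sketch with illustrative numbers:

```python
# Illustrative numbers: the per-head dimension must be an integer, which is
# exactly what the ValueError above guards against.
hidden_size, num_attention_heads = 768, 12
assert hidden_size % num_attention_heads == 0, "hidden size must be divisible by the number of attention heads"
head_dim = hidden_size // num_attention_heads
print(head_dim)  # 64
```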
@ -402,7 +402,7 @@ def convert_grounding_dino_checkpoint(args):
|
|||||||
"grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
|
"grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
|
||||||
"grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
|
"grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
|
||||||
}
|
}
|
||||||
# Define default GroundingDino configuation
|
# Define default GroundingDino configuration
|
||||||
config = get_grounding_dino_config(model_name)
|
config = get_grounding_dino_config(model_name)
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -1850,7 +1850,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel):
|
|||||||
|
|
||||||
# In original implementation they apply layer norm before outputting intermediate hidden states
|
# In original implementation they apply layer norm before outputting intermediate hidden states
|
||||||
# Though that's not through between layers so the layers use as input the output of the previous layer
|
# Though that's not through between layers so the layers use as input the output of the previous layer
|
||||||
# withtout layer norm
|
# without layer norm
|
||||||
if output_hidden_states:
|
if output_hidden_states:
|
||||||
all_hidden_states += (self.layer_norm(hidden_states),)
|
all_hidden_states += (self.layer_norm(hidden_states),)
|
||||||
|
|
||||||
|
@ -1425,7 +1425,7 @@ HUBERT_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
HUBERT_START_DOCSTRING,
|
HUBERT_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFHubertModel(TFHubertPreTrainedModel):
|
class TFHubertModel(TFHubertPreTrainedModel):
|
||||||
|
@ -74,8 +74,8 @@ class IBertConfig(PretrainedConfig):
|
|||||||
quant_mode (`bool`, *optional*, defaults to `False`):
|
quant_mode (`bool`, *optional*, defaults to `False`):
|
||||||
Whether to quantize the model or not.
|
Whether to quantize the model or not.
|
||||||
force_dequant (`str`, *optional*, defaults to `"none"`):
|
force_dequant (`str`, *optional*, defaults to `"none"`):
|
||||||
Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
|
Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision.
|
||||||
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As deafult, it is set as
|
`"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as
|
||||||
`"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
|
`"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to
|
||||||
dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
|
dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers,
|
||||||
i.e., GELU, Softmax, and LayerNorm.
|
i.e., GELU, Softmax, and LayerNorm.
|
||||||
|
@ -276,7 +276,7 @@ class InternVLProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
metadata (`VideoMetadata`):
|
metadata (`VideoMetadata`):
|
||||||
`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
|
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
|
||||||
num_frames (`int`, *optional*):
|
num_frames (`int`, *optional*):
|
||||||
Number of frames to sample uniformly. If None, all frames are sampled.
|
Number of frames to sample uniformly. If None, all frames are sampled.
|
||||||
initial_shift (`bool`, `float` or `int`, defaults to `0`):
|
initial_shift (`bool`, `float` or `int`, defaults to `0`):
|
||||||
|
@ -246,7 +246,7 @@ class LevitAttentionSubsample(nn.Module):
|
|||||||
self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
|
self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads
|
||||||
self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
|
self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
|
||||||
self.resolution_out = resolution_out
|
self.resolution_out = resolution_out
|
||||||
# resolution_in is the intial resolution, resoloution_out is final resolution after downsampling
|
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
|
||||||
self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
|
self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values)
|
||||||
self.queries_subsample = LevitSubsample(stride, resolution_in)
|
self.queries_subsample = LevitSubsample(stride, resolution_in)
|
||||||
self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
|
self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads)
|
||||||
@ -370,7 +370,7 @@ class LevitStage(nn.Module):
|
|||||||
self.layers = []
|
self.layers = []
|
||||||
self.config = config
|
self.config = config
|
||||||
self.resolution_in = resolution_in
|
self.resolution_in = resolution_in
|
||||||
# resolution_in is the intial resolution, resolution_out is final resolution after downsampling
|
# resolution_in is the initial resolution, resolution_out is final resolution after downsampling
|
||||||
for _ in range(depths):
|
for _ in range(depths):
|
||||||
self.layers.append(
|
self.layers.append(
|
||||||
LevitResidualLayer(
|
LevitResidualLayer(
|
||||||
|
@ -55,7 +55,7 @@ if is_torchvision_available():
|
|||||||
|
|
||||||
def get_factors(dividend: int) -> Set[int]:
|
def get_factors(dividend: int) -> Set[int]:
|
||||||
"""
|
"""
|
||||||
Calculate all factors of a given number, i.e. a dividor that leaves
|
Calculate all factors of a given number, i.e. a divisor that leaves
|
||||||
no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.
|
no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
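A straightforward sketch of the factor computation described in the docstring above (not necessarily the library's exact code):

```python
from typing import Set

def get_factors_sketch(dividend: int) -> Set[int]:
    """Return every divisor of `dividend` that leaves no remainder."""
    factors = set()
    for candidate in range(1, int(dividend**0.5) + 1):
        if dividend % candidate == 0:
            factors.add(candidate)
            factors.add(dividend // candidate)
    return factors

print(get_factors_sketch(12))  # {1, 2, 3, 4, 6, 12}
```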
@ -60,7 +60,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor):
|
|||||||
image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
|
image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
|
||||||
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
|
||||||
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
|
||||||
method. Not used for processinf videos.
|
method. Not used for processing videos.
|
||||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||||
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||||
do_center_crop (`bool`, *optional*, defaults to `True`):
|
do_center_crop (`bool`, *optional*, defaults to `True`):
|
||||||
|
@ -405,7 +405,7 @@ class Mask2FormerHungarianMatcher(nn.Module):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
||||||
raise ValueError("All costs cant be 0")
|
raise ValueError("All costs can't be 0")
|
||||||
|
|
||||||
self.num_points = num_points
|
self.num_points = num_points
|
||||||
self.cost_class = cost_class
|
self.cost_class = cost_class
|
||||||
|
@ -829,7 +829,7 @@ class MaskFormerHungarianMatcher(nn.Module):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
|
||||||
raise ValueError("All costs cant be 0")
|
raise ValueError("All costs can't be 0")
|
||||||
self.cost_class = cost_class
|
self.cost_class = cost_class
|
||||||
self.cost_mask = cost_mask
|
self.cost_mask = cost_mask
|
||||||
self.cost_dice = cost_dice
|
self.cost_dice = cost_dice
|
||||||
|
@ -98,7 +98,7 @@ def add_megatron_checkpoint_args(parser):
|
|||||||
default=128,
|
default=128,
|
||||||
help=(
|
help=(
|
||||||
"Pad the vocab size to be divisible by this value. "
|
"Pad the vocab size to be divisible by this value. "
|
||||||
"This is added for computational efficieny reasons. "
|
"This is added for computational efficiency reasons. "
|
||||||
"Only used when converting a Transformers checkpoint to a Megatron checkpoint."
|
"Only used when converting a Transformers checkpoint to a Megatron checkpoint."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@ -235,7 +235,7 @@ def transformers_to_megatron_fix_query_key_value_ordering(
|
|||||||
param, checkpoint_version, num_splits, num_heads, hidden_size
|
param, checkpoint_version, num_splits, num_heads, hidden_size
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input
|
Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM checkpoint versions. Input
|
||||||
is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
|
is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version
|
||||||
1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
|
1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the
|
||||||
self-attention block, the param needs to be already transposed before calling this function.
|
self-attention block, the param needs to be already transposed before calling this function.
|
||||||
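For checkpoint version 2.0 and later, the permutation described above swaps the split axis and the head axis; a self-contained sketch with small illustrative sizes (the real conversion helper handles more cases):

```python
import torch

def to_megatron_qkv_ordering_v2(param, num_splits, num_heads, hidden_size):
    # [num_splits * num_heads * hidden_size, :] -> [num_heads * num_splits * hidden_size, :]
    input_shape = param.size()
    param = param.view((num_splits, num_heads, hidden_size) + input_shape[1:])
    param = param.transpose(0, 1).contiguous()  # swap the splits and heads axes
    return param.view(*input_shape)

qkv = torch.arange(3 * 2 * 4 * 5, dtype=torch.float32).reshape(3 * 2 * 4, 5)
print(to_megatron_qkv_ordering_v2(qkv, num_splits=3, num_heads=2, hidden_size=4).shape)
# torch.Size([24, 5]) -- same shape, rows regrouped head-major instead of split-major
```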
@ -348,7 +348,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
|
"Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints"
|
||||||
" containing all the megatron arguments. This is because it loads all config related to model"
|
" containing all the megatron arguments. This is because it loads all config related to model"
|
||||||
" architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to"
|
" architecture, the tensor and pipeline model parallel size from the checkpoint instead of user having to"
|
||||||
" manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
|
" manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron"
|
||||||
" arguments to use this utility."
|
" arguments to use this utility."
|
||||||
)
|
)
|
||||||
|
@ -1601,7 +1601,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
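The classifier-free guidance enabled above (when `guidance_scale > 1`) blends conditional and unconditional logits; a minimal sketch of that blend, assuming the batch holds the conditional half first, with the actual logits processor possibly differing in details:

```python
import torch

def cfg_blend(scores: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # First half of the batch: conditional logits, second half: unconditional logits.
    cond, uncond = scores.chunk(2, dim=0)
    return uncond + guidance_scale * (cond - uncond)

scores = torch.randn(4, 10)  # 2 conditional rows + 2 unconditional rows
print(cfg_blend(scores, guidance_scale=3.0).shape)  # torch.Size([2, 10])
```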
@ -2617,7 +2617,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
|
@ -54,7 +54,7 @@ class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
|
|||||||
sampling_rate (`int`, *optional*, defaults to 32000):
|
sampling_rate (`int`, *optional*, defaults to 32000):
|
||||||
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
||||||
hop_length (`int`, *optional*, defaults to 4096):
|
hop_length (`int`, *optional*, defaults to 4096):
|
||||||
Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
|
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
|
||||||
chunk_length (`int`, *optional*, defaults to 30):
|
chunk_length (`int`, *optional*, defaults to 30):
|
||||||
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
|
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
|
||||||
sequences.
|
sequences.
|
||||||
|
@ -92,7 +92,7 @@ class MusicgenMelodyOutputWithPast(ModelOutput):
|
|||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||||
heads.
|
heads.
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
|
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
|
||||||
Used as a conditional signal.
|
Used as a conditional signal.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -757,8 +757,8 @@ MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
|
|||||||
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||||
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output.
|
Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output.
|
||||||
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
|
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
|
||||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
||||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
@ -818,7 +818,7 @@ MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
|
|||||||
[What are attention masks?](../glossary#attention-mask)
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
|
Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output.
|
||||||
Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`.
|
Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`.
|
||||||
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
||||||
Mask to avoid performing attention on conditional hidden states. Mask values
|
Mask to avoid performing attention on conditional hidden states. Mask values
|
||||||
selected in `[0, 1]`:
|
selected in `[0, 1]`:
|
||||||
@ -1522,7 +1522,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
@ -2478,7 +2478,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
|
|||||||
# 7. determine generation mode
|
# 7. determine generation mode
|
||||||
generation_mode = generation_config.get_generation_mode()
|
generation_mode = generation_config.get_generation_mode()
|
||||||
|
|
||||||
# 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG)
|
# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
|
||||||
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
|
||||||
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
|
||||||
generation_config.guidance_scale = None
|
generation_config.guidance_scale = None
|
||||||
|
@ -425,7 +425,7 @@ class NllbMoeSparseMLP(nn.Module):
|
|||||||
r"""
|
r"""
|
||||||
The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
|
The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense`
|
||||||
(mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
|
(mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a
|
||||||
top_2 gating mecanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
|
top_2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
|
||||||
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
|
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
|
||||||
|
|
||||||
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
|
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
|
||||||
|
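The top-2 routing described above can be sketched as follows: each token is dispatched to at most two experts, so every hidden state is processed at most twice regardless of `num_experts`. Shapes and expert layers here are illustrative, not the NLLB-MoE implementation:

```python
import torch
import torch.nn as nn

num_experts, hidden_dim, num_tokens = 4, 8, 6
router = nn.Linear(hidden_dim, num_experts)
experts = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)])

hidden_states = torch.randn(num_tokens, hidden_dim)
router_probs = router(hidden_states).softmax(dim=-1)
top2_probs, top2_experts = router_probs.topk(2, dim=-1)  # (num_tokens, 2)

output = torch.zeros_like(hidden_states)
for expert_idx, expert in enumerate(experts):
    for slot in range(2):  # each token contributes to at most two experts
        token_mask = top2_experts[:, slot] == expert_idx
        if token_mask.any():
            weight = top2_probs[token_mask, slot].unsqueeze(-1)
            output[token_mask] += weight * expert(hidden_states[token_mask])
print(output.shape)  # torch.Size([6, 8])
```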
@ -376,7 +376,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
contains everything needed to load the tokenizer.
|
contains everything needed to load the tokenizer.
|
||||||
|
|
||||||
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
|
clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
|
||||||
Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
|
Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
|
||||||
spaces.
|
spaces.
|
||||||
|
|
||||||
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
||||||
|
@ -268,7 +268,7 @@ def convert_omdet_turbo_checkpoint(args):
|
|||||||
"https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
|
"https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
# Define default OmDetTurbo configuation
|
# Define default OmDetTurbo configuration
|
||||||
config = get_omdet_turbo_config(model_name, use_timm_backbone)
|
config = get_omdet_turbo_config(model_name, use_timm_backbone)
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -471,7 +471,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
|
|||||||
|
|
||||||
is_training = token_type_ids is not None and labels is not None
|
is_training = token_type_ids is not None and labels is not None
|
||||||
|
|
||||||
# Replace image id woth PAD if the image token if OOV, to avoid index-errors
|
# Replace image id with PAD if the image token if OOV, to avoid index-errors
|
||||||
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
|
||||||
special_image_mask = input_ids == self.config.image_token_id
|
special_image_mask = input_ids == self.config.image_token_id
|
||||||
llm_input_ids = input_ids.clone()
|
llm_input_ids = input_ids.clone()
|
||||||
|
@ -1807,7 +1807,7 @@ class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
|
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
|
||||||
Prediction output from the classfication head.
|
Prediction output from the classification head.
|
||||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
|
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
|
||||||
Backbone embeddings before passing through the head.
|
Backbone embeddings before passing through the head.
|
||||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
|
||||||
|
@ -1487,7 +1487,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
|
|||||||
|
|
||||||
Summarization example:
|
Summarization example:
|
||||||
|
|
||||||
```pyton
|
```python
|
||||||
>>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
|
>>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
|
||||||
|
|
||||||
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
|
>>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
|
||||||
|
@ -127,7 +127,7 @@ def get_resize_output_image_size(
|
|||||||
ratio = max(height / max_height, width / max_width)
|
ratio = max(height / max_height, width / max_width)
|
||||||
|
|
||||||
if ratio > 1:
|
if ratio > 1:
|
||||||
# Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results
|
# Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
|
||||||
# Here we use floor to ensure the image is always smaller than the given "longest_edge"
|
# Here we use floor to ensure the image is always smaller than the given "longest_edge"
|
||||||
height = int(math.floor(height / ratio))
|
height = int(math.floor(height / ratio))
|
||||||
width = int(math.floor(width / ratio))
|
width = int(math.floor(width / ratio))
|
||||||
|
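A small illustration of the rounding note above: Python's `round` uses banker's rounding (ties go to the even side), while `math.floor` never exceeds the target size:

```python
import math

print(round(2.5), round(3.5))            # 2 4  -> ties resolved to the even side
print(math.floor(2.5), math.floor(3.5))  # 2 3  -> always rounds down
```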
@ -35,7 +35,7 @@ logger = logging.get_logger(__name__)
|
|||||||
def create_rename_keys(config):
|
def create_rename_keys(config):
|
||||||
rename_keys = []
|
rename_keys = []
|
||||||
for i in range(config.num_encoder_blocks):
|
for i in range(config.num_encoder_blocks):
|
||||||
# Remane embedings' paramters
|
# Rename embeddings' parameters
|
||||||
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
|
rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings"))
|
||||||
|
|
||||||
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))
|
rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight"))
|
||||||
|
@ -1037,7 +1037,7 @@ REMBERT_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare RemBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
REMBERT_START_DOCSTRING,
|
REMBERT_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFRemBertModel(TFRemBertPreTrainedModel):
|
class TFRemBertModel(TFRemBertPreTrainedModel):
|
||||||
|
@ -911,7 +911,7 @@ ROFORMER_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
ROFORMER_START_DOCSTRING,
|
ROFORMER_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFRoFormerModel(TFRoFormerPreTrainedModel):
|
class TFRoFormerModel(TFRoFormerPreTrainedModel):
|
||||||
|
@ -2171,7 +2171,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
|
|||||||
config: SeamlessM4TConfig,
|
config: SeamlessM4TConfig,
|
||||||
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
||||||
):
|
):
|
||||||
# update config - used principaly for bos_token_id etc.
|
# update config - used principally for bos_token_id etc.
|
||||||
config = copy.deepcopy(config)
|
config = copy.deepcopy(config)
|
||||||
for param, val in config.to_dict().items():
|
for param, val in config.to_dict().items():
|
||||||
if param.startswith("t2u_"):
|
if param.startswith("t2u_"):
|
||||||
|
@ -184,7 +184,7 @@ SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
[What are input IDs?](../glossary#input-ids)
|
[What are input IDs?](../glossary#input-ids)
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ M4T_TEXT_INPUTS_DOCSTRING = r"""
|
|||||||
M4T_SPEECH_INPUTS_DOCSTRING = r"""
|
M4T_SPEECH_INPUTS_DOCSTRING = r"""
|
||||||
Args:
|
Args:
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -2461,7 +2461,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod
|
|||||||
config: SeamlessM4Tv2Config,
|
config: SeamlessM4Tv2Config,
|
||||||
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
embed_tokens_decoder: Optional[nn.Embedding] = None,
|
||||||
):
|
):
|
||||||
# update config - used principaly for bos_token_id etc.
|
# update config - used principally for bos_token_id etc.
|
||||||
config = copy.deepcopy(config)
|
config = copy.deepcopy(config)
|
||||||
for param, val in config.to_dict().items():
|
for param, val in config.to_dict().items():
|
||||||
if param.startswith("t2u_"):
|
if param.startswith("t2u_"):
|
||||||
@ -4035,7 +4035,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
return_intermediate_token_ids (`bool`, *optional*):
|
return_intermediate_token_ids (`bool`, *optional*):
|
||||||
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
||||||
@ -4485,7 +4485,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
|
|||||||
|
|
||||||
[What are input IDs?](../glossary#input-ids)
|
[What are input IDs?](../glossary#input-ids)
|
||||||
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
|
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
|
||||||
Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
|
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
|
||||||
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
|
||||||
return_intermediate_token_ids (`bool`, *optional*):
|
return_intermediate_token_ids (`bool`, *optional*):
|
||||||
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
|
||||||
|
@ -114,7 +114,7 @@ def convert_seggpt_checkpoint(args):
|
|||||||
verify_logits = args.verify_logits
|
verify_logits = args.verify_logits
|
||||||
push_to_hub = args.push_to_hub
|
push_to_hub = args.push_to_hub
|
||||||
|
|
||||||
# Define default GroundingDINO configuation
|
# Define default GroundingDINO configuration
|
||||||
config = SegGptConfig()
|
config = SegGptConfig()
|
||||||
|
|
||||||
# Load original checkpoint
|
# Load original checkpoint
|
||||||
|
@ -62,7 +62,7 @@ class SegGptEncoderOutput(ModelOutput):
|
|||||||
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
|
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
|
||||||
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
|
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
|
||||||
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
|
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
|
||||||
Additionaly, each feature passes through a LayerNorm.
|
Additionally, each feature passes through a LayerNorm.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
last_hidden_state: torch.FloatTensor
|
last_hidden_state: torch.FloatTensor
|
||||||
|
@ -1979,10 +1979,10 @@ SPEECHT5_BASE_START_DOCSTRING = r"""
|
|||||||
load the weights associated with the model, only the configuration. Check out the
|
load the weights associated with the model, only the configuration. Check out the
|
||||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||||
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
|
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
|
||||||
The Transformer encoder module that applies the appropiate speech or text encoder prenet. If `None`,
|
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
|
||||||
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
|
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
|
||||||
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
|
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
|
||||||
The Transformer decoder module that applies the appropiate speech or text decoder prenet. If `None`,
|
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
|
||||||
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
|
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
|
||||||
states.
|
states.
|
||||||
"""
|
"""
|
||||||
|
@ -175,7 +175,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
|
|||||||
|
|
||||||
|
|
||||||
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
|
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only):
|
||||||
"""Replaces the params in model witht the T5X converted params."""
|
"""Replaces the params in model with the T5X converted params."""
|
||||||
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
||||||
converted = convert_t5x_to_pytorch(
|
converted = convert_t5x_to_pytorch(
|
||||||
variables,
|
variables,
|
||||||
|
@ -2344,11 +2344,11 @@ def _calculate_expected_result(
|
|||||||
if avg_approximation == AverageApproximationFunction.RATIO:
|
if avg_approximation == AverageApproximationFunction.RATIO:
|
||||||
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
|
average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
|
||||||
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
|
elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
|
||||||
# The sum of all probabilities exept that correspond to other cells
|
# The sum of all probabilities except that correspond to other cells
|
||||||
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
||||||
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
|
average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
|
||||||
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
|
elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
|
||||||
# The sum of all probabilities exept that correspond to other cells
|
# The sum of all probabilities except that correspond to other cells
|
||||||
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
|
||||||
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
|
pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
|
||||||
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
|
var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
|
||||||
|
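The first-order approximation shown above is easier to read outside the TF graph; a NumPy rendering with illustrative values:

```python
import numpy as np

scaled_probability_per_cell = np.array([[0.9, 0.1, 0.5]])
numeric_values_masked = np.array([[10.0, 20.0, 30.0]])

# "sum of all probabilities except the current cell's", plus one
ex = scaled_probability_per_cell.sum(axis=1, keepdims=True) - scaled_probability_per_cell + 1
average_result = (numeric_values_masked * scaled_probability_per_cell / ex).sum(axis=1)
print(ex, average_result)
```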
@ -2359,7 +2359,7 @@ _ORDINAL_WORDS = [
|
|||||||
"second",
|
"second",
|
||||||
"third",
|
"third",
|
||||||
"fourth",
|
"fourth",
|
||||||
"fith",
|
"fifth",
|
||||||
"sixth",
|
"sixth",
|
||||||
"seventh",
|
"seventh",
|
||||||
"eighth",
|
"eighth",
|
||||||
|
@ -1364,7 +1364,7 @@ class UdopStack(UdopPreTrainedModel):
|
|||||||
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
if self.embed_tokens is None:
|
if self.embed_tokens is None:
|
||||||
raise ValueError("You have to intialize the model with valid token embeddings")
|
raise ValueError("You have to initialize the model with valid token embeddings")
|
||||||
inputs_embeds = self.embed_tokens(input_ids)
|
inputs_embeds = self.embed_tokens(input_ids)
|
||||||
|
|
||||||
if pixel_values is not None:
|
if pixel_values is not None:
|
||||||
|
@ -200,7 +200,7 @@ def make_state_dict(converted_params, is_encoder_only: bool):
|
|||||||
|
|
||||||
|
|
||||||
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
|
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
|
||||||
"""Replaces the params in model witht the T5X converted params."""
|
"""Replaces the params in model with the T5X converted params."""
|
||||||
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
|
||||||
converted = convert_t5x_to_pytorch(
|
converted = convert_t5x_to_pytorch(
|
||||||
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention
|
variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention
|
||||||
|
@ -164,7 +164,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
eos_token_id (`int`, *optional*, defaults to 2):
|
eos_token_id (`int`, *optional*, defaults to 2):
|
||||||
The id of the "end-of-sequence" token.
|
The id of the "end-of-sequence" token.
|
||||||
replace_prob (`float`, *optional*, defaults to 0.5):
|
replace_prob (`float`, *optional*, defaults to 0.5):
|
||||||
Propability that transformer feature is replaced by quantized feature for pretraining.
|
Probability that transformer feature is replaced by quantized feature for pretraining.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = ""
|
|||||||
def get_key_mapping(config: UnivNetConfig):
|
def get_key_mapping(config: UnivNetConfig):
|
||||||
mapping = {}
|
mapping = {}
|
||||||
|
|
||||||
# NOTE: inital conv layer keys are the same
|
# NOTE: initial conv layer keys are the same
|
||||||
|
|
||||||
# LVC Residual blocks
|
# LVC Residual blocks
|
||||||
for i in range(len(config.resblock_stride_sizes)):
|
for i in range(len(config.resblock_stride_sizes)):
|
||||||
|
@ -64,7 +64,7 @@ class UnivNetFeatureExtractor(SequenceFeatureExtractor):
|
|||||||
The number of FFT components to use. If `None`, this is determined using
|
The number of FFT components to use. If `None`, this is determined using
|
||||||
`transformers.audio_utils.optimal_fft_length`.
|
`transformers.audio_utils.optimal_fft_length`.
|
||||||
max_length_s (`int`, *optional*, defaults to 10):
|
max_length_s (`int`, *optional*, defaults to 10):
|
||||||
The maximum input lenght of the model in seconds. This is used to pad the audio.
|
The maximum input length of the model in seconds. This is used to pad the audio.
|
||||||
fmin (`float`, *optional*, defaults to 0.0):
|
fmin (`float`, *optional*, defaults to 0.0):
|
||||||
Minimum mel frequency in Hz.
|
Minimum mel frequency in Hz.
|
||||||
fmax (`float`, *optional*):
|
fmax (`float`, *optional*):
|
||||||
|
@ -39,7 +39,7 @@ class ViltConfig(PretrainedConfig):
|
|||||||
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
|
The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding
|
||||||
text.
|
text.
|
||||||
modality_type_vocab_size (`int`, *optional*, defaults to 2):
|
modality_type_vocab_size (`int`, *optional*, defaults to 2):
|
||||||
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatening the
|
The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatenating the
|
||||||
embeddings of the text and image modalities.
|
embeddings of the text and image modalities.
|
||||||
max_position_embeddings (`int`, *optional*, defaults to 40):
|
max_position_embeddings (`int`, *optional*, defaults to 40):
|
||||||
The maximum sequence length that this model might ever be used with.
|
The maximum sequence length that this model might ever be used with.
|
||||||
|
@ -139,7 +139,7 @@ class ViltEmbeddings(nn.Module):
|
|||||||
x_mask = x_mask.flatten(1)
|
x_mask = x_mask.flatten(1)
|
||||||
|
|
||||||
if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
|
if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
|
||||||
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked)
|
# suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk)
|
||||||
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
|
# (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
|
||||||
# if self.patch_size = 32, 25 * 41 = 1025
|
# if self.patch_size = 32, 25 * 41 = 1025
|
||||||
# if res is 384 x 640, 12 * 20 = 240
|
# if res is 384 x 640, 12 * 20 = 240
|
||||||
|
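The patch-count arithmetic from the comment above, written out explicitly:

```python
patch_size = 32
for height, width in [(800, 1333), (384, 640)]:
    num_patches = (height // patch_size) * (width // patch_size)
    print(height, width, num_patches)  # 800 1333 1025, then 384 640 240
```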
@ -85,7 +85,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
if "encoder" not in kwargs or "decoder" not in kwargs:
|
if "encoder" not in kwargs or "decoder" not in kwargs:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"A configuraton of type {self.model_type} cannot be instantiated because "
|
f"A configuration of type {self.model_type} cannot be instantiated because "
|
||||||
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
|
f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1496,7 +1496,7 @@ WAV2VEC2_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"The bare TFWav2Vec2 Model transformer outputing raw hidden-states without any specific head on top.",
|
"The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
WAV2VEC2_START_DOCSTRING,
|
WAV2VEC2_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
|
class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
|
||||||
|
@ -101,7 +101,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||||
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
|
masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
|
||||||
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_time_length (`int`, *optional*, defaults to 10):
|
mask_time_length (`int`, *optional*, defaults to 10):
|
||||||
@ -111,7 +111,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
||||||
mask_time_min_masks''
|
mask_time_min_masks''
|
||||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||||
be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
|
be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
|
||||||
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
along the time axis. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_feature_length (`int`, *optional*, defaults to 10):
|
mask_feature_length (`int`, *optional*, defaults to 10):
|
||||||
|
@ -474,7 +474,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor):
|
|||||||
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
|
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
|
||||||
Raw outputs of the model from flipped input (averaged out in the end).
|
Raw outputs of the model from flipped input (averaged out in the end).
|
||||||
do_remove_padding (`bool`, *optional*):
|
do_remove_padding (`bool`, *optional*):
|
||||||
By default ZoeDepth addes padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
|
By default ZoeDepth adds padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the
|
||||||
boundary artifacts in the output depth map, so we need remove this padding during post_processing. The
|
boundary artifacts in the output depth map, so we need remove this padding during post_processing. The
|
||||||
parameter exists here in case the user changed the image preprocessing to not include padding.
|
parameter exists here in case the user changed the image preprocessing to not include padding.
|
||||||
|
|
||||||
|
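The padding formula quoted above, evaluated for an illustrative input size:

```python
import math

height, width = 480, 640
pad_h = int(math.sqrt(height / 2) * 3)
pad_w = int(math.sqrt(width / 2) * 3)
print(pad_h, pad_w)  # 46 53
```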
@ -37,7 +37,7 @@ class AwqQuantizer(HfQuantizer):
|
|||||||
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
|
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# AWQ requires data callibration - we support only inference
|
# AWQ requires data calibration - we support only inference
|
||||||
requires_calibration = True
|
requires_calibration = True
|
||||||
|
|
||||||
required_packages = ["awq", "accelerate"]
|
required_packages = ["awq", "accelerate"]
|
||||||
|
@ -69,7 +69,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
|
|||||||
return missing_keys
|
return missing_keys
|
||||||
|
|
||||||
# We expect some keys to be missing for
|
# We expect some keys to be missing for
|
||||||
# compresed models
|
# compressed models
|
||||||
# This is fine as the weights are reconstructed by ModelCompressor
|
# This is fine as the weights are reconstructed by ModelCompressor
|
||||||
# in _process_model_after_weight_loading
|
# in _process_model_after_weight_loading
|
||||||
|
|
||||||
|
@ -1728,14 +1728,14 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
LIBROSA_IMPORT_ERROR = """
|
LIBROSA_IMPORT_ERROR = """
|
||||||
{0} requires thes librosa library. But that was not found in your environment. You can install them with pip:
|
{0} requires the librosa library. But that was not found in your environment. You can install them with pip:
|
||||||
`pip install librosa`
|
`pip install librosa`
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
PRETTY_MIDI_IMPORT_ERROR = """
|
PRETTY_MIDI_IMPORT_ERROR = """
|
||||||
{0} requires thes pretty_midi library. But that was not found in your environment. You can install them with pip:
|
{0} requires the pretty_midi library. But that was not found in your environment. You can install them with pip:
|
||||||
`pip install pretty_midi`
|
`pip install pretty_midi`
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
@ -1120,7 +1120,7 @@ class VptqLayerConfig(QuantizationConfigMixin):
|
|||||||
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
|
group_size (`int`, *optional*, defaults to `-1`): depends on out-features
|
||||||
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
|
indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning
|
||||||
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
|
is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True
|
||||||
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centriod numbers of clusters
|
num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centroid numbers of clusters
|
||||||
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
|
num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual
|
||||||
outlier_size (`int`, *optional*, defaults to `1`): outliers
|
outlier_size (`int`, *optional*, defaults to `1`): outliers
|
||||||
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization
|
vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization
|
||||||
|
@ -146,7 +146,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_tokenization_base_hard_symbols(self):
|
def test_tokenization_base_hard_symbols(self):
|
||||||
symbols = (
|
symbols = (
|
||||||
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
|
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
|
||||||
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
|
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
|
||||||
)
|
)
|
||||||
original_tokenizer_encodings = [
|
original_tokenizer_encodings = [
|
||||||
871,
|
871,
|
||||||
|
@@ -170,7 +170,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 def test_tokenization_base_hard_symbols(self):
 symbols = (
 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
-" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
+" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
 )
 original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip
 self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
@@ -438,7 +438,7 @@ class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
 if self.has_attentions:
 self.assertIsNotNone(attentions.grad)

-# override as the `logit_scale` parameter initilization is different for BRIDGE TOWER
+# override as the `logit_scale` parameter initialization is different for BRIDGE TOWER
 def test_initialization(self):
 config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -55,7 +55,7 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 def tearDownClass(cls):
 shutil.rmtree(cls.tmpdirname, ignore_errors=True)

-# Some kwargs tests are overriden from common tests to handle shortest_edge
+# Some kwargs tests are overridden from common tests to handle shortest_edge
 # and size_divisor behaviour

 @require_torch
@@ -924,7 +924,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 def test_model_get_set_embeddings(self):
 pass

-# override as the `logit_scale` parameter initilization is different for FLAVA
+# override as the `logit_scale` parameter initialization is different for FLAVA
 def test_initialization(self):
 config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -933,7 +933,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 model = model_class(config=configs_no_init)
 for name, param in model.named_parameters():
 if param.requires_grad:
-# check if `logit_scale` is initilized as per the original implementation
+# check if `logit_scale` is initialized as per the original implementation
 if name == "logit_scale" or name == "flava.logit_scale":
 self.assertAlmostEqual(
 param.data.item(),
@@ -137,7 +137,7 @@ class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase

 @unittest.skip(
 reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
-" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
+" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
 )
 def test_multi_gpu_data_parallel_forward(self):
 pass
@@ -275,7 +275,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte

 @unittest.skip(
 reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
-" as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
+" as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
 )
 def test_multi_gpu_data_parallel_forward(self):
 pass
@@ -88,7 +88,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):

 image = self.prepare_image_inputs()

-# If text has no image tokens, iamge should be `None`
+# If text has no image tokens, image should be `None`
 with self.assertRaises(ValueError):
 _ = processor(text=text_no_image, images=image, return_tensors="np")

@@ -478,8 +478,8 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
 # the last 2 tokens are masked, and should have 0 attn_probs
 self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0))

-# in loacal attention each token can only attend to the previous window_size tokens (including itself)
-# here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5]
+# in local attention each token can only attend to the previous window_size tokens (including itself)
+# here window_size is 4, so a token at index 5 can only attend to indices [2, 3, 4, 5]
 # and the attn_probs should be 0 for token [0, 1]
 self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0))
 self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0))
@@ -769,7 +769,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
 encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device)
 encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device)
 # If we batch the text and cross attention masking is working the batched result should be equal to
-# The singe text result
+# The single text result
 encoding_batched = processor(
 images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt"
 ).to(torch_device)
@@ -658,7 +658,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
 def test_sdpa_can_dispatch_composite_models(self):
 """
 Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
-This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
+This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
 In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
 is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
 See https://github.com/huggingface/transformers/pull/32238 for more info
@@ -56,7 +56,7 @@ class LlavaConfigTest(unittest.TestCase):

 def test_arbitrary_reload(self):
 """
-Simple test for reloading arbirarily composed subconfigs
+Simple test for reloading arbitrarily composed subconfigs
 """
 default_values = LlavaConfig().to_diff_dict()
 default_values["vision_config"]["model_type"] = "pixtral"
@@ -553,8 +553,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
 # image = Image.open(requests.get(url, stream=True).raw)
 inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device)
 generate_ids = model.generate(**inputs, max_new_tokens=500)
-ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-print(ouptut)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(output)

 # fmt: off
 EXPECTED_GENERATION = """
@@ -573,7 +573,7 @@ These descriptions provide a detailed overview of the content and atmosphere of
 """
 # fmt: on
 # check that both inputs are handled correctly and generate the same output
-self.assertEqual(ouptut, EXPECTED_GENERATION)
+self.assertEqual(output, EXPECTED_GENERATION)

 @slow
 @require_bitsandbytes
Some files were not shown because too many files have changed in this diff.