diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 0084709c775..fced0e5758f 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf fft_window_size (`int`, *optional*): Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the - spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of + spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to - `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. + `(1+fft_window_size)//2`. An increase of the fft_window_size slows the computation proportionally. Example: diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index 3938deb4826..f5850a864c0 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer): beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len) ids_collect.append(beam_id) - # due to overly complex constraints or other factors, sometimes we can't gaurantee a successful + # due to overly complex constraints or other factors, sometimes we can't guarantee a successful # generation. In these cases we simply return the highest scoring outputs. if len(ids_collect) < self.num_beam_hyps_to_keep: for beam_id in range(self.num_beams): diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 100b11fc748..12f612bd26c 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin): our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information. cache_config (`CacheConfig` or `dict`, *optional*, default to `None`): Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and - it will be converted to its repsective `CacheConfig` internally. + it will be converted to its respective `CacheConfig` internally. Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`. return_legacy_cache (`bool`, *optional*, default to `True`): Whether to return the legacy or new format of the cache when `DynamicCache` is used by default. @@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. encoder_repetition_penalty (`float`, *optional*, defaults to 1.0): - The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the + The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty. length_penalty (`float`, *optional*, defaults to 1.0): Exponential penalty to the length that is used with beam-based generation.
It is applied as an exponent to @@ -385,7 +385,7 @@ class GenerationConfig(PushToHubMixin): inference. disable_compile (`bool`, *optional*): Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when - specific criteria are met, including using a compileable cache. Please open an issue if you find the + specific criteria are met, including using a compilable cache. Please open an issue if you find the need to use this flag. > Wild card @@ -710,7 +710,7 @@ class GenerationConfig(PushToHubMixin): UserWarning, ) - # 3. detect incorrect paramaterization specific to advanced beam modes + # 3. detect incorrect parameterization specific to advanced beam modes else: # constrained beam search if self.constraints is not None or self.force_words_ids is not None: diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index d106c32defa..fdb0e7a26f6 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor): class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor): r""" - [`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using + [`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the beginning of the generation. diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 352fff9e637..34c7ea532cc 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -543,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor): class MinPLogitsWarper(LogitsProcessor): """ [`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the - probability of the most likely token. As a result, the filter becomes more agressive in the presence of + probability of the most likely token. As a result, the filter becomes more aggressive in the presence of high-probability tokens, which is a sign of a confident output that we shouldn't deviate from. Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and @@ -738,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor): >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to >>> # Top P sampling, which restricts tokens based on their cumulative probability. - >>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4 + >>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4 >>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9 @@ -819,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor): >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff). 
- >>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3 + >>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 and 4e-3 >>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9 @@ -1348,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor): >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) Alice and Bob are friends - >>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix. + >>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix. >>> # For instance, we can force an entire entity to be generated when its beginning is detected. >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens >>> def prefix_allowed_tokens_fn(batch_id, input_ids): @@ -1791,7 +1791,7 @@ class LogitNormalization(LogitsProcessor): class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): r""" - [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts + [`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not generated at the beginning. Originally created for [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). @@ -2642,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): We assume that the scores are in the log space. Args: scores (`torch.FloatTensor`): Scores (batch_size, vocab_size). - g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth). + g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth). Returns: Updated scores (batch_size, vocab_size). @@ -2668,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): if self.debug_mode: scores = torch.ones_like(scores) - # Currently indices is just a arange to compute watermarking on the desnse logits. + # Currently indices is just an arange to compute watermarking on the dense logits. all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)]) if self.state is None: diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py index f70655fb7c1..8302319d981 100644 --- a/src/transformers/generation/tf_logits_process.py +++ b/src/transformers/generation/tf_logits_process.py @@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): ) def _match_found(): - # Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield + # Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield # an answer (otherwise we get indexing exceptions) compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1 return tf.cond( diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index b2da6f18baf..510186cafc0 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -962,7 +962,7 @@ class TFGenerationMixin: raise ValueError( "Beam search decoding cannot return more sequences than it has beams.
Please set num_beams >=" f" num_return_sequences, got {generation_config.num_beams} and" - f" {generation_config.num_return_sequences} (respectivelly)" + f" {generation_config.num_return_sequences} (respectively)" ) # 11. broadcast inputs to the desired number of beams @@ -994,7 +994,7 @@ class TFGenerationMixin: raise ValueError( "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >=" f" num_return_sequences, got {generation_config.num_beams} and" - f" {generation_config.num_return_sequences} (respectivelly)" + f" {generation_config.num_return_sequences} (respectively)" ) # 11. prepare logits warper @@ -1626,7 +1626,7 @@ class TFGenerationMixin: ) use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache) use_xla = not tf.executing_eagerly() - # TODO (Joao): fix cache format or find programatic way to detect cache index + # TODO (Joao): fix cache format or find programmatic way to detect cache index # GPT2 and other models has a slightly different cache structure, with a different batch axis model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self) cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0 @@ -1910,7 +1910,7 @@ class TFGenerationMixin: ) use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache) use_xla = not tf.executing_eagerly() - # TODO (Joao): fix cache format or find programatic way to detect cache index + # TODO (Joao): fix cache format or find programmatic way to detect cache index # GPT2 and other models has a slightly different cache structure, with a different batch axis model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self) cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0 @@ -2253,7 +2253,7 @@ class TFGenerationMixin: use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache) use_xla = not tf.executing_eagerly() - # TODO (Joao): fix cache format or find programatic way to detect cache index + # TODO (Joao): fix cache format or find programmatic way to detect cache index # GPT2 and other models has a slightly different cache structure, with a different batch axis model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self) cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0 @@ -2788,7 +2788,7 @@ class TFGenerationMixin: model_kwargs.pop("use_cache", None) use_xla = not tf.executing_eagerly() - # TODO (Joao): fix cache format or find programatic way to detect cache index + # TODO (Joao): fix cache format or find programmatic way to detect cache index # GPT2 and other models has a slightly different cache structure, with a different batch axis model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self) cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0 diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 995b556b7ea..f2733ce9cb1 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -362,7 +362,7 @@ class GenerationMixin: inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase; - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`. However, its `generate` does not share the same interface as `GenerationMixin.generate`. 
In this case, - `BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface. + `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface. The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* if `num_beams=1` and `do_sample=False` @@ -392,7 +392,7 @@ class GenerationMixin: - Exception 1: when passing input_embeds, input_ids may be missing entries - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. - - Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and + - Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and generate the first token for each sequence. Later use the generated Input ids for continuation. The current implementation does not rely on ``self`` and could be @@ -967,7 +967,7 @@ class GenerationMixin: assistant_model=assistant_model, assistant_prune_lm_head=True, # prune LM head of assistant model ) - # Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index + # Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index assistant_model.generation_config.repetition_penalty = None candidate_generator = UniversalSpeculativeDecodingGenerator( input_ids=input_ids, diff --git a/src/transformers/integrations/accelerate.py b/src/transformers/integrations/accelerate.py index 83efac9661a..5dbff5e9b04 100644 --- a/src/transformers/integrations/accelerate.py +++ b/src/transformers/integrations/accelerate.py @@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs): ``` """ - # get ALL model parameters and thier names + # get ALL model parameters and their names all_named_parameters = dict(model.named_parameters(remove_duplicate=False)) # get ONLY unique named parameters, @@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs): for tied_param_name in tied_param_names: tied_param = all_named_parameters[tied_param_name] for param_name, param in no_duplicate_named_parameters.items(): - # compare if parameters are the same, if so, group thier names together + # compare if parameters are the same, if so, group their names together if param is tied_param: if param_name not in tied_param_groups: tied_param_groups[param_name] = [] diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 7a5f1fd7976..a5bae1fe9b7 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -329,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module): This util function is designed to test exported models by simulating the generation process. It processes the input prompt tokens sequentially (no parallel prefill). This generate function is not intended to replace the original `generate` method, and the support - for leveraging the original `generate` is potentially planed! + for leveraging the original `generate` is potentially planned! Args: exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`. 
diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py index 4ff154ee205..fb0f604bfef 100755 --- a/src/transformers/integrations/hqq.py +++ b/src/transformers/integrations/hqq.py @@ -28,7 +28,7 @@ def autoname_modules(model): module.name = name -# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj +# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj def name_to_linear_tag(name): return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))]) @@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve """ Prepares nn.Linear layers for HQQ quantization. Since each layer type can have separate quantization parameters, we need to do the following: - 1- tag each module with its neme via autoname_modules() + 1- tag each module with its name via autoname_modules() 2- Extract linear_tags (e.g. ['self_attn.q_proj', ...]) - 3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params + 3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params """ modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py index f73fcdea5a6..00414afadb9 100644 --- a/src/transformers/integrations/tensor_parallel.py +++ b/src/transformers/integrations/tensor_parallel.py @@ -160,7 +160,7 @@ def distribute_module( output_fn=None, ) -> nn.Module: """ - Copy pasted from torch's function but we remove the communications (partitionning) + Copy pasted from torch's function but we remove the communications (partitioning) as well as buffer registering that is similarly not efficient. """ if len(module._forward_pre_hooks) == 0: @@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer): @staticmethod def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): - # this op cannot be asynch, otherwise it completely breaks the outputs of models + # this op cannot be async, otherwise it completely breaks the outputs of models torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False) return outputs diff --git a/src/transformers/loss/loss_for_object_detection.py b/src/transformers/loss/loss_for_object_detection.py index b820f6daed1..0481312af8a 100644 --- a/src/transformers/loss/loss_for_object_detection.py +++ b/src/transformers/loss/loss_for_object_detection.py @@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module): # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. + # The 1 is a constant that doesn't change the matching, it can be omitted. class_cost = -out_prob[:, target_ids] # Compute the L1 cost between boxes diff --git a/src/transformers/loss/loss_rt_detr.py b/src/transformers/loss/loss_rt_detr.py index a65d06de108..228a2aa3c76 100644 --- a/src/transformers/loss/loss_rt_detr.py +++ b/src/transformers/loss/loss_rt_detr.py @@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module): target_bbox = torch.cat([v["boxes"] for v in targets]) # Compute the classification cost. 
Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. + # The 1 is a constant that doesn't change the matching, it can be omitted. if self.use_focal_loss: out_prob = F.sigmoid(outputs["logits"].flatten(0, 1)) out_prob = out_prob[:, target_ids] diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index a007b7a7c6d..b761615ac2e 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -593,7 +593,7 @@ class AlignVisionBlock(nn.Module): class AlignVisionEncoder(nn.Module): r""" - Forward propogates the embeddings through each vision encoder (EfficientNet) block. + Forward propagates the embeddings through each vision encoder (EfficientNet) block. Args: config ([`AlignVisionConfig`]): diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 628f05b2e01..ff0db00d766 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -36,7 +36,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): class AlignProcessor(ProcessorMixin): r""" Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and - [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and + [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. The preferred way of passing kwargs is as a dictionary per modality, see usage example below. 
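The Hungarian matcher comment above notes that the constant 1 in `1 - proba[target class]` does not change the matching and can be omitted. A minimal standalone sketch of why, using a made-up 5x5 probability matrix and scipy's Hungarian solver (illustrative only, not code from the patch):

```python
# Adding a constant to every entry of the cost matrix shifts every complete assignment's
# total cost by the same amount, so the optimal Hungarian matching is unchanged.
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
proba = rng.random((5, 5))         # proba[i, j]: predicted probability of target class j for query i

cost_with_constant = 1.0 - proba   # the "1 - proba[target class]" form
cost_without_constant = -proba     # the constant 1 omitted, as the modeling code does

_, cols_a = linear_sum_assignment(cost_with_constant)
_, cols_b = linear_sum_assignment(cost_without_constant)
assert (cols_a == cols_b).all()    # same matching either way
```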
diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index eb32013d5e7..8a4e9cd8e4d 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -1936,7 +1936,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel): params = None if future_values is not None: # outputs.last_hidden_state and trend - # loc is 4rd last and scale is 3rd last output + # loc is 4th last and scale is 3rd last output params = self.output_params(outputs[0] + outputs[1]) distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2]) diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py index f39ed47a19b..a7832a53d55 100644 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py @@ -164,7 +164,7 @@ def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pyt new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) print(new_model.eval()) - print("Model conversion was done sucessfully!") + print("Model conversion was done successfully!") if __name__ == "__main__": diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index db55e39ab73..c4ce602548d 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -235,7 +235,7 @@ class Blip2Config(PretrainedConfig): num_query_tokens (`int`, *optional*, defaults to 32): The number of query tokens passed through the Transformer. image_text_hidden_size (`int`, *optional*, defaults to 256): - Dimentionality of the hidden state of the image-text fusion layer. + Dimensionality of the hidden state of the image-text fusion layer. image_token_index (`int`, *optional*): Token index of special image token. diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index a7293bf5415..650ef15fb72 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -899,7 +899,7 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin): use_cache=True, **kwargs, ): - # Overwriten because of the fixed-shape attention mask creation + # Overwritten because of the fixed-shape attention mask creation # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index cbe51cab729..7f9c3f60015 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -49,17 +49,17 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. 
hop_length (`int`,*optional*, defaults to 480): - Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split + Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. max_length_s (`int`, *optional*, defaults to 10): The maximum input length of the model in seconds. This is used to pad the audio. fft_window_size (`int`, *optional*, defaults to 1024): Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency - resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. + resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. return_attention_mask (`bool`, *optional*, defaults to `False`): - Whether or not the model should return the attention masks coresponding to the input. + Whether or not the model should return the attention masks corresponding to the input. frequency_min (`float`, *optional*, defaults to 0): The lowest frequency of interest. The STFT will not be computed for values below this. frequency_max (`float`, *optional*, defaults to 14000): @@ -141,7 +141,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the mel filter banks, which do not need to be saved or printed as they are too long. """ output = copy.deepcopy(self.__dict__) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 4f89deed499..1988bb95c6d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1067,7 +1067,7 @@ CLAP_TEXT_INPUTS_DOCSTRING = r""" CLAP_AUDIO_INPUTS_DOCSTRING = r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also + Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details. is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance @@ -1105,7 +1105,7 @@ CLAP_INPUTS_DOCSTRING = r""" [What are position IDs?](../glossary#position-ids) input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Input audio features. This should be returnes by the [`ClapFeatureExtractor`] class that you can also + Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details. return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
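The `hop_length` and `fft_window_size` descriptions above amount to cutting the waveform into overlapping analysis windows before applying the Fourier transform. A minimal numpy sketch of that framing, using the defaults quoted in the docstring above; everything else here is illustrative and not the actual `ClapFeatureExtractor` code:

```python
# Split a waveform into overlapping frames of `fft_window_size` samples, stepping by
# `hop_length` samples between consecutive frames, then window each frame and apply an FFT.
import numpy as np

fft_window_size = 1024               # default quoted in the docstring above
hop_length = 480                     # default quoted in the docstring above
waveform = np.random.randn(48_000)   # arbitrary-length dummy audio, purely for illustration

window = np.hanning(fft_window_size)
num_frames = 1 + (len(waveform) - fft_window_size) // hop_length
frames = np.stack(
    [waveform[i * hop_length : i * hop_length + fft_window_size] for i in range(num_frames)]
)
spectrum = np.fft.rfft(frames * window, axis=-1)
print(spectrum.shape)                # (num_frames, num_frequency_bins)
```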
diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index 6ca74ff5326..1c6226c458b 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -127,7 +127,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): if kwargs.pop("add_bos_token", False): model_id = kwargs.pop("name_or_path", "") raise ValueError( - "Currenty GPT2's fast tokenizer does NOT support adding a BOS token. " + "Currently GPT2's fast tokenizer does NOT support adding a BOS token. " "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n" f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n" f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n" diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py index d39777680b1..85a1d4b64ac 100644 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py @@ -277,7 +277,7 @@ def final(): def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder): """ - Fucntion to convert the microsoft cvt checkpoint to huggingface checkpoint + Function to convert the microsoft cvt checkpoint to huggingface checkpoint """ img_labels_file = "imagenet-1k-id2label.json" num_labels = 1000 diff --git a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py index ae3a6771015..32746a38dd0 100644 --- a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py @@ -58,7 +58,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { # activation function weight r"transformer\.encoder\.layers\.(\d+)\.activation\.weight": r"encoder.layers.\1.activation_fn.weight", ######################################################################################################################################### - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activiation function weight + # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms + activation function weight r"transformer\.decoder\.layers\.(\d+)\.self_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.self_attn.self_attn.output_proj.\2", r"transformer\.decoder\.layers\.(\d+)\.cross_attn\.out_proj\.(bias|weight)": r"decoder.layers.\1.cross_attn.cross_attn.output_proj.\2", # FFNs @@ -144,7 +144,7 @@ def write_model(model_name, pretrained_model_weights_path, pytorch_dump_folder_p config.label2id = {v: k for k, v in id2label.items()} # load original model from local path loaded = torch.load(pretrained_model_weights_path, map_location=torch.device("cpu"), weights_only=True)["model"] - # Renaming the original model state dictionary to HF compatibile + # Renaming the original model state dictionary to HF compatible all_keys = list(loaded.keys()) new_keys = convert_old_keys_to_new_keys(all_keys) state_dict = {} diff --git a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py index 
3bff1d83baf..f1a6ef7093b 100755 --- a/src/transformers/models/deprecated/jukebox/modeling_jukebox.py +++ b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py @@ -1297,7 +1297,7 @@ class JukeboxConditionalAutoregressive(nn.Module): ): """ Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly - set fro each configuration. + set for each configuration. Args: config (`JukeboxPriorConfig`): diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d1aa64d2f60..dbbc43502dd 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -142,7 +142,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch return patches if n_patches_per_batch < 4: - # for each batch, atleast 4 small patches are required to + # for each batch, at least 4 small patches are required to # recreate a large square patch from merging them and later padding is applied # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding) # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later) diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index b78050a01ae..380543c092a 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -118,7 +118,7 @@ class Embeddings(nn.Module): # Setting the position-ids to the registered buffer in constructor, it helps # when tracing the model without passing position-ids, solves - # isues similar to issue #5664 + # issues similar to issue #5664 if hasattr(self, "position_ids"): position_ids = self.position_ids[:, :seq_length] else: diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 72d051859a7..667c7ab3f6c 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -72,7 +72,7 @@ class DonutImageProcessor(BaseImageProcessor): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. do_pad (`bool`, *optional*, defaults to `True`): Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a - random amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are + random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are padded to the largest image size in the batch. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in @@ -349,7 +349,7 @@ class DonutImageProcessor(BaseImageProcessor): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. do_pad (`bool`, *optional*, defaults to `self.do_pad`): Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random - amont of padding on each size, up to the largest image size in the batch. Otherwise, all images are + amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are padded to the largest image size in the batch. 
random_padding (`bool`, *optional*, defaults to `self.random_padding`): Whether to use random padding when padding the image. If `True`, each image in the batch with be padded diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index e9a061ffd37..2c89feea43e 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -142,7 +142,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig): speaker id embedding layer. num_languages (`int`, *optional*): Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the - languge id embedding layer. + language id embedding layer. speaker_embed_dim (`int`, *optional*): Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input. is_encoder_decoder (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index 590786b195d..a9faf1d03e7 100644 --- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -391,7 +391,7 @@ class FastSpeech2ConformerVariancePredictor(nn.Module): dropout_rate=0.5, ): """ - Initilize variance predictor module. + Initialize variance predictor module. Args: input_dim (`int`): Input dimension. diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index b5a5ea793ae..33a312bdee0 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -948,7 +948,7 @@ class FlaubertModel(FlaubertPreTrainedModel): # Setting the position-ids to the registered buffer in constructor, it helps # when tracing the model without passing position-ids, solves - # isues similar to issue #5664 + # issues similar to issue #5664 if position_ids is None: if hasattr(self, "position_ids"): position_ids = self.position_ids[:, :slen] diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 41336c1b53a..3b9ec55dc86 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -360,7 +360,7 @@ class FocalNetModulation(nn.Module): x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous() q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1) - # context aggreation + # context aggregation ctx_all = 0 for level in range(self.focal_level): ctx = self.focal_layers[level](ctx) @@ -379,7 +379,7 @@ class FocalNetModulation(nn.Module): if self.use_post_layernorm_in_modulation: x_out = self.layernorm(x_out) - # post linear porjection + # post linear projection x_out = self.projection_out(x_out) x_out = self.projection_dropout(x_out) return x_out @@ -415,7 +415,7 @@ class FocalNetLayer(nn.Module): dim (`int`): Number of input channels. input_resolution (`Tuple[int]`): - Input resulotion. + Input resolution. drop_path (`float`, *optional*, defaults to 0.0): Stochastic depth rate. 
""" diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index a87f769a255..960b69ed31b 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -244,7 +244,7 @@ def _tokenize_prompts_with_image_and_batch( - pad all the sequences to this length so we can convert them into a 3D tensor. """ - # If not tool use, tranform the coordinates while tokenizing + # If not tool use, transform the coordinates while tokenizing if scale_factors is not None: transformed_prompt_tokens = [] for prompt_seq, scale_factor_seq in zip(prompts, scale_factors): diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 6c0e4b9d809..a1680b7f5aa 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -96,7 +96,7 @@ class Gemma3TextConfig(PretrainedConfig): Scaling factor when applying tanh softcapping on the attention scores. cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type + Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index f9156ab1b60..a138acef382 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -140,7 +140,7 @@ class Gemma3ImageProcessor(BaseImageProcessor): ): """ Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds - minumum allowed ratio. + minimum allowed ratio. Args: image (`np.ndarray`): diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py index f86dbd3c756..7a18d456f35 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py +++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py @@ -108,7 +108,7 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast): ): """ Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds - minumum allowed ratio. + minimum allowed ratio. 
Args: image (`torch.Tensor`): diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 0e009d8887b..892b8898b62 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -1270,7 +1270,7 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin): is_training = token_type_ids is not None and labels is not None - # Replace image id woth PAD if the image token if OOV, to avoid index-errors + # Replace image id with PAD if the image token is OOV, to avoid index-errors if input_ids is not None and self.config.image_token_id >= self.vocab_size: special_image_mask = input_ids == self.config.image_token_id llm_input_ids = input_ids.clone() diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index b4e7301964f..ecac4921d2e 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -128,7 +128,7 @@ class Gemma3TextConfig(Gemma2Config): Scaling factor when applying tanh softcapping on the attention scores. cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type + Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: @@ -926,7 +926,7 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration): is_training = token_type_ids is not None and labels is not None - # Replace image id woth PAD if the image token if OOV, to avoid index-errors + # Replace image id with PAD if the image token is OOV, to avoid index-errors if input_ids is not None and self.config.image_token_id >= self.vocab_size: special_image_mask = input_ids == self.config.image_token_id llm_input_ids = input_ids.clone() diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 047b3a1fc43..bf5934c644a 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -1495,7 +1495,7 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin): >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex") >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex") - >>> # set seed for reproducability + >>> # set seed for reproducibility >>> np.random.seed(45) diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index cea854eabb9..c7855c07ea3 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -199,7 +199,7 @@ class GPTNeoXConfig(PretrainedConfig): if self.hidden_size % self.num_attention_heads != 0: raise ValueError( - "The hidden size is not divisble by the number of attention heads! Make sure to update them!" + "The hidden size is not divisible by the number of attention heads! Make sure to update them!"
) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py index ac8e82bfd82..84998cfdefa 100644 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -402,7 +402,7 @@ def convert_grounding_dino_checkpoint(args): "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", } - # Define default GroundingDino configuation + # Define default GroundingDino configuration config = get_grounding_dino_config(model_name) # Load original checkpoint diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 6d2a2a8df23..755cfaf5d99 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1850,7 +1850,7 @@ class GroundingDinoDecoder(GroundingDinoPreTrainedModel): # In original implementation they apply layer norm before outputting intermediate hidden states # Though that's not through between layers so the layers use as input the output of the previous layer - # withtout layer norm + # without layer norm if output_hidden_states: all_hidden_states += (self.layer_norm(hidden_states),) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 3550f639586..8664db8a42f 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1425,7 +1425,7 @@ HUBERT_INPUTS_DOCSTRING = r""" @add_start_docstrings( - "The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.", + "The bare TFHubert Model transformer outputting raw hidden-states without any specific head on top.", HUBERT_START_DOCSTRING, ) class TFHubertModel(TFHubertPreTrainedModel): diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index 6ddc344b9e1..f56d13efc91 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -74,8 +74,8 @@ class IBertConfig(PretrainedConfig): quant_mode (`bool`, *optional*, defaults to `False`): Whether to quantize the model or not. force_dequant (`str`, *optional*, defaults to `"none"`): - Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision. - `"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As deafult, it is set as + Force dequantize specific nonlinear layer. Dequantized layers are then executed with full precision. + `"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As default, it is set as `"none"`, which does not dequantize any layers. Please specify `"gelu"`, `"softmax"`, or `"layernorm"` to dequantize GELU, Softmax, or LayerNorm, respectively. `"nonlinear"` will dequantize all nonlinear layers, i.e., GELU, Softmax, and LayerNorm. 
diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py index 2bb6ed2d97d..ca8b304be63 100644 --- a/src/transformers/models/internvl/processing_internvl.py +++ b/src/transformers/models/internvl/processing_internvl.py @@ -276,7 +276,7 @@ class InternVLProcessor(ProcessorMixin): Args: metadata (`VideoMetadata`): - `VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps". + `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps". num_frames (`int`, *optional*): Number of frames to sample uniformly. If None, all frames are sampled. initial_shift (`bool`, `float` or `int`, defaults to `0`): diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index 9924ac25dea..ff09660d626 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -246,7 +246,7 @@ class LevitAttentionSubsample(nn.Module): self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads self.out_dim_projection = attention_ratio * key_dim * num_attention_heads self.resolution_out = resolution_out - # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling + # resolution_in is the initial resolution, resolution_out is final resolution after downsampling self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values) self.queries_subsample = LevitSubsample(stride, resolution_in) self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads) @@ -370,7 +370,7 @@ class LevitStage(nn.Module): self.layers = [] self.config = config self.resolution_in = resolution_in - # resolution_in is the intial resolution, resolution_out is final resolution after downsampling + # resolution_in is the initial resolution, resolution_out is final resolution after downsampling for _ in range(depths): self.layers.append( LevitResidualLayer( diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 6935ba798f7..fd9d68fb7b5 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -55,7 +55,7 @@ if is_torchvision_available(): def get_factors(dividend: int) -> Set[int]: """ - Calculate all factors of a given number, i.e. a dividor that leaves + Calculate all factors of a given number, i.e. a divisor that leaves no remainder. For example, if dividend=12, it will return {1, 2, 3, 4, 6, 12}. Args: diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 139852324a5..bedba000c7f 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -60,7 +60,7 @@ class LlavaNextVideoImageProcessor(BaseImageProcessor): image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`): A list of possible resolutions to use for processing high resolution images. The best resolution is selected based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. Not used for processinf videos. + method. 
Not used for processing videos. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. do_center_crop (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index bdd48bc8f36..107267bb516 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -405,7 +405,7 @@ class Mask2FormerHungarianMatcher(nn.Module): """ super().__init__() if cost_class == 0 and cost_mask == 0 and cost_dice == 0: - raise ValueError("All costs cant be 0") + raise ValueError("All costs can't be 0") self.num_points = num_points self.cost_class = cost_class diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index e354c21c401..60016172d25 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -829,7 +829,7 @@ class MaskFormerHungarianMatcher(nn.Module): """ super().__init__() if cost_class == 0 and cost_mask == 0 and cost_dice == 0: - raise ValueError("All costs cant be 0") + raise ValueError("All costs can't be 0") self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index 68e2404475e..2e50a3b6b64 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -98,7 +98,7 @@ def add_megatron_checkpoint_args(parser): default=128, help=( "Pad the vocab size to be divisible by this value. " - "This is added for computational efficieny reasons. " + "This is added for computational efficiency reasons. " "Only used when converting a Transformers checkpoint to a Megatron checkpoint." ), ) @@ -235,7 +235,7 @@ def transformers_to_megatron_fix_query_key_value_ordering( param, checkpoint_version, num_splits, num_heads, hidden_size ): """ - Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input + Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM checkpoint versions. Input is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version 1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the self-attention block, the param needs to be already transposed before calling this function. @@ -348,7 +348,7 @@ def convert_checkpoint_from_megatron_to_transformers(args): raise ValueError( "Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints" " containing all the megatron arguments. This is because it loads all config related to model" - " architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to" + " architecture, the tensor and pipeline model parallel size from the checkpoint instead of user having to" " manually specify all the details. 
Please save Megatron-LM checkpoint along with all the megatron" " arguments to use this utility." ) diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 151c4e89f3b..bea8d9c637b 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1601,7 +1601,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin): # 7. determine generation mode generation_mode = generation_config.get_generation_mode() - # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) + # 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)) generation_config.guidance_scale = None @@ -2617,7 +2617,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): # 7. determine generation mode generation_mode = generation_config.get_generation_mode() - # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) + # 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)) generation_config.guidance_scale = None diff --git a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py index d823adf649d..f0a7e4dee03 100644 --- a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py @@ -54,7 +54,7 @@ class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor): sampling_rate (`int`, *optional*, defaults to 32000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). hop_length (`int`, *optional*, defaults to 4096): - Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. + Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. chunk_length (`int`, *optional*, defaults to 30): The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio sequences. diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 9170f947fed..a3cc95690d1 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -92,7 +92,7 @@ class MusicgenMelodyOutputWithPast(ModelOutput): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): - Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output. + Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output. Used as a conditional signal. 
""" @@ -757,8 +757,8 @@ MUSICGEN_MELODY_INPUTS_DOCSTRING = r""" don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): - Sequence of conditional hidden-states representing the concatenation of the projeted text encoder output and the projeted audio encoder output. - Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`. + Sequence of conditional hidden-states representing the concatenation of the projected text encoder output and the projected audio encoder output. + Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors @@ -818,7 +818,7 @@ MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r""" [What are attention masks?](../glossary#attention-mask) encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states representing the concatenation of the text encoder output and the processed audio encoder output. - Used as a conditional signal and will thus be concatenated to the projeted `decoder_input_ids`. + Used as a conditional signal and will thus be concatenated to the projected `decoder_input_ids`. encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): Mask to avoid performing attention on conditional hidden states. Mask values selected in `[0, 1]`: @@ -1522,7 +1522,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin): # 7. determine generation mode generation_mode = generation_config.get_generation_mode() - # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) + # 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)) generation_config.guidance_scale = None @@ -2478,7 +2478,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): # 7. determine generation mode generation_mode = generation_config.get_generation_mode() - # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) + # 8. 
prepare batched CFG externally (to enable coexistence with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)) generation_config.guidance_scale = None diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 8a2e735f9fe..e8b80221c44 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -425,7 +425,7 @@ class NllbMoeSparseMLP(nn.Module): r""" The goal of this forward pass is to have the same number of operation as the equivalent `NllbMoeDenseActDense` (mlp) layer. This means that all of the hidden states should be processed at most twice ( since we are using a - top_2 gating mecanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim) + top_2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim) instead of O(num_experts x batch_size x sequence_length x hidden_dim). 1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length, diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat_fast.py index e5dc6ed1645..a4e94ab3f02 100644 --- a/src/transformers/models/nougat/tokenization_nougat_fast.py +++ b/src/transformers/models/nougat/tokenization_nougat_fast.py @@ -376,7 +376,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast): contains everything needed to load the tokenizer. clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`): - Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra + Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra spaces. 
unk_token (`str`, *optional*, defaults to `""`): diff --git a/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py b/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py index 2e515e98340..6ee52db488e 100644 --- a/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py +++ b/src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py @@ -268,7 +268,7 @@ def convert_omdet_turbo_checkpoint(args): "https://huggingface.co/omlab/OmDet-Turbo_tiny_SWIN_T/resolve/main/ViT-B-16.pt", ], } - # Define default OmDetTurbo configuation + # Define default OmDetTurbo configuration config = get_omdet_turbo_config(model_name, use_timm_backbone) # Load original checkpoint diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 39b63c4c406..924e3b1dcad 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -471,7 +471,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi is_training = token_type_ids is not None and labels is not None - # Replace image id woth PAD if the image token if OOV, to avoid index-errors + # Replace image id with PAD if the image token is OOV, to avoid index-errors if input_ids is not None and self.config.image_token_id >= self.vocab_size: special_image_mask = input_ids == self.config.image_token_id llm_input_ids = input_ids.clone() diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index ca88a84a399..2238426cd08 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -1807,7 +1807,7 @@ class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput): Args: prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`): - Prediction output from the classfication head. + Prediction output from the classification head. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`): Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*): diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py index bd450698937..b7c7a8fd552 100644 --- a/src/transformers/models/pegasus/modeling_flax_pegasus.py +++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py @@ -1487,7 +1487,7 @@ FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """ Summarization example: - ```pyton + ```python >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large') diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 074a8d1076a..8579cf08afa 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -127,7 +127,7 @@ def get_resize_output_image_size( ratio = max(height / max_height, width / max_width) if ratio > 1: - # Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results + # Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results # Here we use floor to ensure the image is always smaller than the given "longest_edge" height = int(math.floor(height / ratio)) width = int(math.floor(width / ratio)) diff --git a/src/transformers/models/pvt/convert_pvt_to_pytorch.py b/src/transformers/models/pvt/convert_pvt_to_pytorch.py index 633d759123f..184a7c2a37a 100644 --- a/src/transformers/models/pvt/convert_pvt_to_pytorch.py +++ b/src/transformers/models/pvt/convert_pvt_to_pytorch.py @@ -35,7 +35,7 @@ logger = logging.get_logger(__name__) def create_rename_keys(config): rename_keys = [] for i in range(config.num_encoder_blocks): - # Remane embedings' paramters + # Rename embeddings' parameters rename_keys.append((f"pos_embed{i + 1}", f"pvt.encoder.patch_embeddings.{i}.position_embeddings")) rename_keys.append((f"patch_embed{i + 1}.proj.weight", f"pvt.encoder.patch_embeddings.{i}.projection.weight")) diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 4a21ee48d39..4ec2368ab2a 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -1037,7 +1037,7 @@ REMBERT_INPUTS_DOCSTRING = r""" @add_start_docstrings( - "The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.", + "The bare RemBERT Model transformer outputting raw hidden-states without any specific head on top.", REMBERT_START_DOCSTRING, ) class TFRemBertModel(TFRemBertPreTrainedModel): diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 738f8e67e9b..6f697ad14e0 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -911,7 +911,7 @@ ROFORMER_INPUTS_DOCSTRING = r""" @add_start_docstrings( - "The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.", + "The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top.", ROFORMER_START_DOCSTRING, ) class TFRoFormerModel(TFRoFormerPreTrainedModel): diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py 
b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 8e179463d10..d1eda956e07 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -2171,7 +2171,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel, config: SeamlessM4TConfig, embed_tokens_decoder: Optional[nn.Embedding] = None, ): - # update config - used principaly for bos_token_id etc. + # update config - used principally for bos_token_id etc. config = copy.deepcopy(config) for param, val in config.to_dict().items(): if param.startswith("t2u_"): diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 9f7482b8a22..5895a85ace5 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -184,7 +184,7 @@ SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r""" [What are input IDs?](../glossary#input-ids) input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): - Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. """ @@ -202,7 +202,7 @@ M4T_TEXT_INPUTS_DOCSTRING = r""" M4T_SPEECH_INPUTS_DOCSTRING = r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): - Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. """ @@ -2461,7 +2461,7 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod config: SeamlessM4Tv2Config, embed_tokens_decoder: Optional[nn.Embedding] = None, ): - # update config - used principaly for bos_token_id etc. + # update config - used principally for bos_token_id etc. config = copy.deepcopy(config) for param, val in config.to_dict().items(): if param.startswith("t2u_"): @@ -4035,7 +4035,7 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix Args: input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): - Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. return_intermediate_token_ids (`bool`, *optional*): If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want @@ -4485,7 +4485,7 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin): [What are input IDs?](../glossary#input-ids) input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*): - Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
return_intermediate_token_ids (`bool`, *optional*): If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want diff --git a/src/transformers/models/seggpt/convert_seggpt_to_hf.py b/src/transformers/models/seggpt/convert_seggpt_to_hf.py index d67daeab93d..79aa6f59832 100644 --- a/src/transformers/models/seggpt/convert_seggpt_to_hf.py +++ b/src/transformers/models/seggpt/convert_seggpt_to_hf.py @@ -114,7 +114,7 @@ def convert_seggpt_checkpoint(args): verify_logits = args.verify_logits push_to_hub = args.push_to_hub - # Define default GroundingDINO configuation + # Define default GroundingDINO configuration config = SegGptConfig() # Load original checkpoint diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index e4058b33467..0150f2d7a93 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -62,7 +62,7 @@ class SegGptEncoderOutput(ModelOutput): intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set): Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`. Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`. - Additionaly, each feature passes through a LayerNorm. + Additionally, each feature passes through a LayerNorm. """ last_hidden_state: torch.FloatTensor diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index a46ab875d42..62e136b3575 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1979,10 +1979,10 @@ SPEECHT5_BASE_START_DOCSTRING = r""" load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`): - The Transformer encoder module that applies the appropiate speech or text encoder prenet. If `None`, + The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`, [`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states. decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`): - The Transformer decoder module that applies the appropiate speech or text decoder prenet. If `None`, + The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`, [`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden states. 
""" diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py index 5e7d9ef33d3..df70b4576d7 100755 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py @@ -175,7 +175,7 @@ def make_state_dict(converted_params, is_encoder_only: bool): def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only): - """Replaces the params in model witht the T5X converted params.""" + """Replaces the params in model with the T5X converted params.""" variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path) converted = convert_t5x_to_pytorch( variables, diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index 7e5abdd7fab..4672260c169 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -2344,11 +2344,11 @@ def _calculate_expected_result( if avg_approximation == AverageApproximationFunction.RATIO: average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION) elif avg_approximation == AverageApproximationFunction.FIRST_ORDER: - # The sum of all probabilities exept that correspond to other cells + # The sum of all probabilities except that correspond to other cells ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1 average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1) elif avg_approximation == AverageApproximationFunction.SECOND_ORDER: - # The sum of all probabilities exept that correspond to other cells + # The sum of all probabilities except that correspond to other cells ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1 pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell) var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index c6a9fac40ca..a23868c6dde 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -2359,7 +2359,7 @@ _ORDINAL_WORDS = [ "second", "third", "fourth", - "fith", + "fifth", "sixth", "seventh", "eighth", diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 665eedc8106..b2c43815f59 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1364,7 +1364,7 @@ class UdopStack(UdopPreTrainedModel): if inputs_embeds is None: if self.embed_tokens is None: - raise ValueError("You have to intialize the model with valid token embeddings") + raise ValueError("You have to initialize the model with valid token embeddings") inputs_embeds = self.embed_tokens(input_ids) if pixel_values is not None: diff --git a/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py b/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py index 848ca3c5660..aaa8c94e2fd 100644 --- a/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py +++ b/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py @@ -200,7 +200,7 @@ def make_state_dict(converted_params, is_encoder_only: bool): def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, 
is_encoder_only, scalable_attention): - """Replaces the params in model witht the T5X converted params.""" + """Replaces the params in model with the T5X converted params.""" variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path) converted = convert_t5x_to_pytorch( variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index 6a5d2ea4013..8ddf93edbe0 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -164,7 +164,7 @@ class UniSpeechConfig(PretrainedConfig): eos_token_id (`int`, *optional*, defaults to 2): The id of the "end-of-sequence" token. replace_prob (`float`, *optional*, defaults to 0.5): - Propability that transformer feature is replaced by quantized feature for pretraining. + Probability that transformer feature is replaced by quantized feature for pretraining. Example: diff --git a/src/transformers/models/univnet/convert_univnet.py b/src/transformers/models/univnet/convert_univnet.py index f790efab22f..d58ff6bd650 100644 --- a/src/transformers/models/univnet/convert_univnet.py +++ b/src/transformers/models/univnet/convert_univnet.py @@ -56,7 +56,7 @@ def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = "" def get_key_mapping(config: UnivNetConfig): mapping = {} - # NOTE: inital conv layer keys are the same + # NOTE: initial conv layer keys are the same # LVC Residual blocks for i in range(len(config.resblock_stride_sizes)): diff --git a/src/transformers/models/univnet/feature_extraction_univnet.py b/src/transformers/models/univnet/feature_extraction_univnet.py index 5f43532d9df..5c657636239 100644 --- a/src/transformers/models/univnet/feature_extraction_univnet.py +++ b/src/transformers/models/univnet/feature_extraction_univnet.py @@ -64,7 +64,7 @@ class UnivNetFeatureExtractor(SequenceFeatureExtractor): The number of FFT components to use. If `None`, this is determined using `transformers.audio_utils.optimal_fft_length`. max_length_s (`int`, *optional*, defaults to 10): - The maximum input lenght of the model in seconds. This is used to pad the audio. + The maximum input length of the model in seconds. This is used to pad the audio. fmin (`float`, *optional*, defaults to 0.0): Minimum mel frequency in Hz. fmax (`float`, *optional*): diff --git a/src/transformers/models/vilt/configuration_vilt.py b/src/transformers/models/vilt/configuration_vilt.py index cc6d727f598..baa30704f78 100644 --- a/src/transformers/models/vilt/configuration_vilt.py +++ b/src/transformers/models/vilt/configuration_vilt.py @@ -39,7 +39,7 @@ class ViltConfig(PretrainedConfig): The vocabulary size of the `token_type_ids` passed when calling [`ViltModel`]. This is used when encoding text. modality_type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatening the + The vocabulary size of the modalities passed when calling [`ViltModel`]. This is used after concatenating the embeddings of the text and image modalities. max_position_embeddings (`int`, *optional*, defaults to 40): The maximum sequence length that this model might ever be used with. 
diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 1aa529c26dc..5aa9e1e1cf1 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -139,7 +139,7 @@ class ViltEmbeddings(nn.Module): x_mask = x_mask.flatten(1) if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int): - # suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked) + # suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk) # (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get. # if self.patch_size = 32, 25 * 41 = 1025 # if res is 384 x 640, 12 * 20 = 240 diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index 09b324af247..941dfee11da 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -85,7 +85,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): super().__init__(**kwargs) if "encoder" not in kwargs or "decoder" not in kwargs: raise ValueError( - f"A configuraton of type {self.model_type} cannot be instantiated because " + f"A configuration of type {self.model_type} cannot be instantiated because " f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}" ) diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index c385c192a98..1e71728de5b 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1496,7 +1496,7 @@ WAV2VEC2_INPUTS_DOCSTRING = r""" @add_start_docstrings( - "The bare TFWav2Vec2 Model transformer outputing raw hidden-states without any specific head on top.", + "The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.", WAV2VEC2_START_DOCSTRING, ) class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel): diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index 63cd44e12b0..11eb1f081af 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -101,7 +101,7 @@ class WavLMConfig(PretrainedConfig): [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be + Probability of each feature vector along the time axis to be chosen as the start of the vector span to be masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked along the time axis. This is only relevant if `apply_spec_augment is True`. mask_time_length (`int`, *optional*, defaults to 10): @@ -111,7 +111,7 @@ class WavLMConfig(PretrainedConfig): irrespectively of `mask_feature_prob`. 
Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks'' mask_feature_prob (`float`, *optional*, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to + Probability of each feature vector along the feature axis to be chosen as the start of the vector span to be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked along the time axis. This is only relevant if `apply_spec_augment is True`. mask_feature_length (`int`, *optional*, defaults to 10): diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index 9d87898235b..f839548320f 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -474,7 +474,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor): outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*): Raw outputs of the model from flipped input (averaged out in the end). do_remove_padding (`bool`, *optional*): - By default ZoeDepth addes padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the + By default ZoeDepth adds padding equal to `int(√(height / 2) * 3)` (and similarly for width) to fix the boundary artifacts in the output depth map, so we need remove this padding during post_processing. The parameter exists here in case the user changed the image preprocessing to not include padding. diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 8e63e2f5bf6..49837d53128 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -37,7 +37,7 @@ class AwqQuantizer(HfQuantizer): 4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978) """ - # AWQ requires data callibration - we support only inference + # AWQ requires data calibration - we support only inference requires_calibration = True required_packages = ["awq", "accelerate"] diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 4e45abf9534..7aa120f00c6 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -69,7 +69,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer): return missing_keys # We expect some keys to be missing for - # compresed models + # compressed models # This is fine as the weights are reconstructed by ModelCompressor # in _process_model_after_weight_loading diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 905781fa81a..f420a3e12d7 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -1728,14 +1728,14 @@ Please note that you may need to restart your runtime after installation. # docstyle-ignore LIBROSA_IMPORT_ERROR = """ -{0} requires thes librosa library. But that was not found in your environment. You can install them with pip: +{0} requires the librosa library. But that was not found in your environment. You can install them with pip: `pip install librosa` Please note that you may need to restart your runtime after installation. """ # docstyle-ignore PRETTY_MIDI_IMPORT_ERROR = """ -{0} requires thes pretty_midi library. 
But that was not found in your environment. You can install them with pip: +{0} requires the pretty_midi library. But that was not found in your environment. You can install them with pip: `pip install pretty_midi` Please note that you may need to restart your runtime after installation. """ diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 155fcd76819..bc560f68171 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1120,7 +1120,7 @@ class VptqLayerConfig(QuantizationConfigMixin): group_size (`int`, *optional*, defaults to `-1`): depends on out-features indices_as_float (`bool`, *optional*, defaults to `False`): for Finetuning is_indice_packed (`bool`, *optional*, defaults to `True`): should always be True - num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centriod numbers of clusters + num_centroids (`list`, *optional*, defaults to `[-1, -1]`): centroid numbers of clusters num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): ditto for residual outlier_size (`int`, *optional*, defaults to `1`): outliers vector_lens (`list`, *optional*, defaults to `[-1, -1]`): centroid vector length in quantization diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py index d1657bb7d45..948ca66e3f8 100644 --- a/tests/models/bert_generation/test_tokenization_bert_generation.py +++ b/tests/models/bert_generation/test_tokenization_bert_generation.py @@ -146,7 +146,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_base_hard_symbols(self): symbols = ( 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will' - " add words that should not exsist and be tokenized to , such as saoneuhaoesuth" + " add words that should not exist and be tokenized to , such as saoneuhaoesuth" ) original_tokenizer_encodings = [ 871, diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index e1e170df117..ef53889fa1d 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -170,7 +170,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_base_hard_symbols(self): symbols = ( 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . 
Also we will' - " add words that should not exsist and be tokenized to , such as saoneuhaoesuth" + " add words that should not exist and be tokenized to , such as saoneuhaoesuth" ) original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index f74b6472782..c59e7f4e148 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -438,7 +438,7 @@ class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC if self.has_attentions: self.assertIsNotNone(attentions.grad) - # override as the `logit_scale` parameter initilization is different for BRIDGE TOWER + # override as the `logit_scale` parameter initialization is different for BRIDGE TOWER def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/bridgetower/test_processor_bridgetower.py b/tests/models/bridgetower/test_processor_bridgetower.py index 4b3bb0716fa..60989eb91d9 100644 --- a/tests/models/bridgetower/test_processor_bridgetower.py +++ b/tests/models/bridgetower/test_processor_bridgetower.py @@ -55,7 +55,7 @@ class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase): def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) - # Some kwargs tests are overriden from common tests to handle shortest_edge + # Some kwargs tests are overridden from common tests to handle shortest_edge # and size_divisor behaviour @require_torch diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index acdac4dd870..322fa9d68f4 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -924,7 +924,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initilization is different for FLAVA + # override as the `logit_scale` parameter initialization is different for FLAVA def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -933,7 +933,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale" or name == "flava.logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index 39b7abeaea8..9638ff4a87f 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -137,7 +137,7 @@ class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase @unittest.skip( reason="HybridCache can't be gathered because it is not iterable. 
Adding a simple iter and dumping `distributed_iterator`" - " as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" + " as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" ) def test_multi_gpu_data_parallel_forward(self): pass @@ -275,7 +275,7 @@ class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitte @unittest.skip( reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`" - " as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" + " as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" ) def test_multi_gpu_data_parallel_forward(self): pass diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py index 968a852d64e..30587a8f55e 100644 --- a/tests/models/gemma3/test_processing_gemma3.py +++ b/tests/models/gemma3/test_processing_gemma3.py @@ -88,7 +88,7 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): image = self.prepare_image_inputs() - # If text has no image tokens, iamge should be `None` + # If text has no image tokens, image should be `None` with self.assertRaises(ValueError): _ = processor(text=text_no_image, images=image, return_tensors="np") diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index 9384ab2f094..fc552ab4afe 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -478,8 +478,8 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # the last 2 tokens are masked, and should have 0 attn_probs self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0)) - # in loacal attention each token can only attend to the previous window_size tokens (including itself) - # here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5] + # in local attention each token can only attend to the previous window_size tokens (including itself) + # here window_size is 4, so a token at index 5 can only attend to indices [2, 3, 4, 5] # and the attn_probs should be 0 for token [0, 1] self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0)) self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0)) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 19a5054c981..b55c590efb6 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -769,7 +769,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase): encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device) encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device) # If we batch the text and cross attention masking is working the batched result should be equal to - # The singe text result + # The single text result encoding_batched = processor( images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt" ).to(torch_device) diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index c795de7a0b5..f47330ec673 100644 
--- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -658,7 +658,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. - This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention". In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model is loaded, because we manually replicate requested attn implementation on each sub-config when loading. See https://github.com/huggingface/transformers/pull/32238 for more info diff --git a/tests/models/llava/test_configuration_llava.py b/tests/models/llava/test_configuration_llava.py index 3b28adc1ee6..9680b6e487d 100644 --- a/tests/models/llava/test_configuration_llava.py +++ b/tests/models/llava/test_configuration_llava.py @@ -56,7 +56,7 @@ class LlavaConfigTest(unittest.TestCase): def test_arbitrary_reload(self): """ - Simple test for reloading arbirarily composed subconfigs + Simple test for reloading arbitrarily composed subconfigs """ default_values = LlavaConfig().to_diff_dict() default_values["vision_config"]["model_type"] = "pixtral" diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 1072d9043ea..a692566340c 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -553,8 +553,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): # image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) generate_ids = model.generate(**inputs, max_new_tokens=500) - ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - print(ouptut) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(output) # fmt: off EXPECTED_GENERATION = """ @@ -573,7 +573,7 @@ These descriptions provide a detailed overview of the content and atmosphere of """ # fmt: on # check that both inputs are handled correctly and generate the same output - self.assertEqual(ouptut, EXPECTED_GENERATION) + self.assertEqual(output, EXPECTED_GENERATION) @slow @require_bitsandbytes diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py index da66a109acf..14a55879424 100644 --- a/tests/models/longformer/test_tokenization_longformer.py +++ b/tests/models/longformer/test_tokenization_longformer.py @@ -200,7 +200,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - # Rust correctly handles the space before the mask while python doesnt + # Rust correctly handles the space before the mask while python doesn't self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index 
f5d823e3bce..0d98e7919e7 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -157,7 +157,7 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - # Rust correctly handles the space before the mask while python doesnt + # Rust correctly handles the space before the mask while python doesn't self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) self.assertSequenceEqual( diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 5e12d5ddc5e..faf074902dc 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -142,7 +142,7 @@ class MaskFormerModelTester: output = model(pixel_values=pixel_values, pixel_mask=pixel_mask) output = model(pixel_values, output_hidden_states=True) - # the correct shape of output.transformer_decoder_hidden_states ensure the correcteness of the + # the correct shape of output.transformer_decoder_hidden_states ensure the correctness of the # encoder and pixel decoder self.parent.assertEqual( output.transformer_decoder_last_hidden_state.shape, diff --git a/tests/models/mllama/test_processor_mllama.py b/tests/models/mllama/test_processor_mllama.py index 864c948d10d..e87a7ae7518 100644 --- a/tests/models/mllama/test_processor_mllama.py +++ b/tests/models/mllama/test_processor_mllama.py @@ -333,7 +333,7 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor(text=text, images=None, padding=True) def test_unstructured_kwargs_batched(self): - # Overriden because Mllama expects images in nested format. For 2 images it can't infer + # Overridden because Mllama expects images in nested format. 
For 2 images it can't infer # the correct nesting, so we better throw an error if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index c9a6a44e297..9aa3ae19d00 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -120,7 +120,7 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - # Rust correctly handles the space before the mask while python doesnt + # Rust correctly handles the space before the mask while python doesn't self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) self.assertSequenceEqual( diff --git a/tests/models/mt5/test_modeling_flax_mt5.py b/tests/models/mt5/test_modeling_flax_mt5.py index 34a5731fd05..7b98d740b31 100644 --- a/tests/models/mt5/test_modeling_flax_mt5.py +++ b/tests/models/mt5/test_modeling_flax_mt5.py @@ -34,7 +34,7 @@ class MT5IntegrationTest(unittest.TestCase): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 7da319e1963..09c78b047ab 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -853,7 +853,7 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") def test_model_fp16_forward(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) @@ -1063,7 +1063,7 @@ class MT5EncoderOnlyModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") def test_model_fp16_forward(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) @@ -1100,7 +1100,7 @@ class MT5IntegrationTest(unittest.TestCase): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary diff --git a/tests/models/mt5/test_modeling_tf_mt5.py b/tests/models/mt5/test_modeling_tf_mt5.py index 87249dc2a61..f7b77014d40 100644 --- a/tests/models/mt5/test_modeling_tf_mt5.py +++ b/tests/models/mt5/test_modeling_tf_mt5.py @@ -33,7 +33,7 @@ class TFMT5ModelIntegrationTest(unittest.TestCase): @slow def test_small_integration_test(self): """ - For comparision run: + For comparison run: >>> import t5 # pip install t5==0.7.1 >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary diff --git 
a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index af44cc961c5..e725cffb53f 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -182,7 +182,7 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - # Rust correctly handles the space before the mask while python doesnt + # Rust correctly handles the space before the mask while python doesn't self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index e79a96d8169..2eb3f2fbfc7 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -491,11 +491,11 @@ class NllbMoeRouterTest(unittest.TestCase): mask = mask.reshape(-1) set_seed(0) hidden_states = torch.rand((self.batch_size, self.sequence_length, self.config.hidden_size)) - classfier = torch.nn.Linear(self.config.hidden_size, self.config.num_experts) + classifier = torch.nn.Linear(self.config.hidden_size, self.config.num_experts) hf_router = NllbMoeTop2Router(self.config) _, _, hidden_dim = hidden_states.shape - logits = classfier(hidden_states.reshape((self.batch_size * self.sequence_length), hidden_dim)) + logits = classifier(hidden_states.reshape((self.batch_size * self.sequence_length), hidden_dim)) top_1_mask, router_probs = hf_router.route_tokens(logits, padding_mask=mask) torch.argmax(top_1_mask, dim=-1) router_mask = router_probs.bool() diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index dee84a53f36..ea00be138db 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -329,7 +329,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes pass @unittest.skip( - reason="VLMs doen't accept inputs embeds and pixel values at the same time. So if the test passed for bacbone LM, it passes for VLM also" + reason="VLMs don't accept inputs embeds and pixel values at the same time. So if the test passed for backbone LM, it passes for VLM also" ) def test_generate_from_inputs_embeds_with_static_cache(self): pass diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index 6938e50a266..4dad8b75fbb 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -326,7 +326,7 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe pass @unittest.skip( - reason="VLMs doen't accept inputs embeds and pixel values at the same time. So if the test passed for bacbone LM, it passes for VLM also" + reason="VLMs don't accept inputs embeds and pixel values at the same time.
So if the test passed for backbone LM, it passes for VLM also" ) def test_generate_from_inputs_embeds_with_static_cache(self): pass diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 1c1fda13f3b..8d18a163e8a 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -526,7 +526,7 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste loss = model(**inputs).loss loss.backward() - # override as the `logit_scale` parameter initilization is different for Pix2Struct + # override as the `logit_scale` parameter initialization is different for Pix2Struct def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -535,7 +535,7 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation + # check if `logit_scale` is initialized as per the original implementation if name == "logit_scale": self.assertAlmostEqual( param.data.item(), diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py index ad44f162c0f..caaeb78a2e9 100644 --- a/tests/models/pixtral/test_image_processing_pixtral.py +++ b/tests/models/pixtral/test_image_processing_pixtral.py @@ -160,7 +160,7 @@ class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - # The following tests are overriden as PixtralImageProcessor can return images of different sizes + # The following tests are overridden as PixtralImageProcessor can return images of different sizes # and thus doesn't support returning batched tensors def test_call_pil(self): diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 5061c73ec7a..0a4a773faac 100644 --- a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -575,7 +575,7 @@ class Pop2PianoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") def test_model_fp16_forward(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index d6eead97cbb..b9632b21bbe 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -978,7 +978,7 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_causal_lm_from_pretrained(*config_and_inputs) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") def test_fp16_forward(self): 
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
index ecea6a34977..5a1e7615fff 100644
--- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
+++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
@@ -368,7 +368,7 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         padded_result = model(padded_input_ids, attention_mask=padded_attention_mask)
         torch.testing.assert_close(result.aux_loss.cpu(), padded_result.aux_loss.cpu(), rtol=1e-4, atol=1e-4)

-        # We make sure that the loss of includding padding tokens != the loss without padding tokens
+        # We make sure that the loss when including padding tokens != the loss without padding tokens
         # if attention_mask=None --> we don't exclude padding tokens
         include_padding_result = model(padded_input_ids, attention_mask=None)
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 31cf74e7c97..e213ccd819b 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -226,7 +226,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
         config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config).to(torch_device)
-            _ = model(**input_dict)  # successfull forward with no modifications
+            _ = model(**input_dict)  # successful forward with no modifications

             # remove one image but leave the image token in text
             patch_size = config.vision_config.patch_size
diff --git a/tests/models/qwen3_moe/test_modeling_qwen3_moe.py b/tests/models/qwen3_moe/test_modeling_qwen3_moe.py
index c14f71407d9..af3cf160322 100644
--- a/tests/models/qwen3_moe/test_modeling_qwen3_moe.py
+++ b/tests/models/qwen3_moe/test_modeling_qwen3_moe.py
@@ -362,7 +362,7 @@ class Qwen3MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         padded_result = model(padded_input_ids, attention_mask=padded_attention_mask)
         torch.testing.assert_close(result.aux_loss.cpu(), padded_result.aux_loss.cpu(), rtol=1e-4, atol=1e-4)

-        # We make sure that the loss of includding padding tokens != the loss without padding tokens
+        # We make sure that the loss when including padding tokens != the loss without padding tokens
         # if attention_mask=None --> we don't exclude padding tokens
         include_padding_result = model(padded_input_ids, attention_mask=None)
diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py
index 7fce4ca3924..0189129e8c3 100644
--- a/tests/models/rag/test_modeling_rag.py
+++ b/tests/models/rag/test_modeling_rag.py
@@ -67,7 +67,7 @@ if is_torch_available() and is_datasets_available() and is_faiss_available():

 def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
-    """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
+    """If tensors not close, or a and b aren't both tensors, raise a nice AssertionError."""
     if a is None and b is None:
         return True
     try:
diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py
index 22b93e3ffb3..0fe0290a384 100644
--- a/tests/models/roberta/test_tokenization_roberta.py
+++ b/tests/models/roberta/test_tokenization_roberta.py
@@ -198,7 +198,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
         tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])

-        # Rust correctly handles the space before the mask while python doesnt
+        # Rust correctly handles the space before the mask while python doesn't
         self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
         self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py
index 38a25297302..cce221a2192 100644
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -236,13 +236,13 @@ class SamVisionModelTest(ModelTesterMixin, unittest.TestCase):
         pass

     @unittest.skip(
-        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
     )
     def test_training_gradient_checkpointing_use_reentrant(self):
         pass

     @unittest.skip(
-        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
     )
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
@@ -668,7 +668,7 @@ class SamModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_sdpa_can_dispatch_composite_models(self):
         """
         Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
-        This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
+        This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
         In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
         is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
         See https://github.com/huggingface/transformers/pull/32238 for more info
diff --git a/tests/models/smolvlm/test_processor_smolvlm.py b/tests/models/smolvlm/test_processor_smolvlm.py
index 00644e399cf..a62ef3b743b 100644
--- a/tests/models/smolvlm/test_processor_smolvlm.py
+++ b/tests/models/smolvlm/test_processor_smolvlm.py
@@ -375,7 +375,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):

     @require_av
     def test_apply_chat_template_video_frame_sampling(self):
-        # overriden because SmolVLM has special preprocessing for videos
+        # overridden because SmolVLM has special preprocessing for videos
         processor = self.get_processor()
         if processor.chat_template is None:
             self.skipTest("Processor has no chat template")
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index c608a2dbdd5..7858cd4b1b1 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -858,7 +858,7 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs)

-    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
     def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
@@ -1066,7 +1066,7 @@ class T5EncoderOnlyModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)

-    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
     def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index 3e86c169bff..63dd76eef3e 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -319,7 +319,7 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs)

-    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
     def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index 46a263cf799..c5ba7484aa6 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -484,7 +484,7 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs)

-    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
     def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
@@ -701,7 +701,7 @@ class UMT5EncoderOnlyModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)

-    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
     def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
@@ -741,7 +741,7 @@ class Umt5IntegrationTest(unittest.TestCase):
     )
     def test_small_integration_test(self):
         """
-        For comparison run the kaggle notbook available here : https://www.kaggle.com/arthurzucker/umt5-inference
+        For comparison, run the Kaggle notebook available here: https://www.kaggle.com/arthurzucker/umt5-inference
         """
         model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small", return_dict=True).to(torch_device)
diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py
index 71c1f4dc224..91efea72f5f 100644
--- a/tests/models/whisper/test_modeling_tf_whisper.py
+++ b/tests/models/whisper/test_modeling_tf_whisper.py
@@ -200,7 +200,7 @@ class TFWhisperModelTester:
     def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
         model = TFWhisperModel(config=config).get_decoder()

-        # take a slice so we're shorter than the seqeuence length and can append later
+        # take a slice so we're shorter than the sequence length and can append later
         input_ids = inputs_dict["decoder_input_ids"][:, :-10]
         attention_mask = inputs_dict["decoder_attention_mask"][:, :-10]
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 7d2a5e54bd5..9ec71d635ea 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -104,7 +104,7 @@ if is_torch_available():
             self.begin_index = begin_index

         def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-            # we don't want to randomely sample timestamp tokens
+            # we don't want to randomly sample timestamp tokens
             if input_ids.shape[-1] != self.begin_index:
                 scores[:, self.timestamp_begin :] = -float("inf")
@@ -562,7 +562,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     @parameterized.expand([("offloaded",)])
     @pytest.mark.generate
-    @unittest.skip(reason="Whisper doesnt work with offloaded cache implementation yet")
+    @unittest.skip(reason="Whisper doesn't work with offloaded cache implementation yet")
     def test_offloaded_cache_implementation(self, cache_implementation):
         pass
diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py
index f7131d7e030..3444a58b32f 100644
--- a/tests/models/xglm/test_tokenization_xglm.py
+++ b/tests/models/xglm/test_tokenization_xglm.py
@@ -181,7 +181,7 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_tokenization_base_hard_symbols(self):
         symbols = (
             'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
-            " add words that should not exsist and be tokenized to unk, such as saoneuhaoesuth"
+            " add words that should not exist and be tokenized to unk, such as saoneuhaoesuth"
         )
         original_tokenizer_encodings = [2, 1018, 67, 11, 1988, 2617, 5631, 278, 11, 3407, 48, 71630, 28085, 4, 3234, 157, 13, 6, 5, 6, 4, 3526, 768, 15, 659, 57, 298, 3983, 864, 129, 21, 6, 5, 13675, 377, 652, 7580, 10341, 155, 2817, 422, 1666, 7, 1674, 53, 113, 202277, 17892, 33, 60, 87, 4, 3234, 157, 61, 2667, 52376, 19, 88, 23, 735]  # fmt: skip
diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
index 0d5d8a43b6b..41f37f84686 100644
--- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -258,7 +258,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_tokenization_base_hard_symbols(self):
         symbols = (
             'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
-            " add words that should not exsist and be tokenized to , such as saoneuhaoesuth"
+            " add words that should not exist and be tokenized to , such as saoneuhaoesuth"
         )
         original_tokenizer_encodings = [
             0,
diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py
index 78079293f3a..a11ac6106cd 100644
--- a/tests/models/zamba2/test_modeling_zamba2.py
+++ b/tests/models/zamba2/test_modeling_zamba2.py
@@ -324,7 +324,7 @@ class Zamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
     def test_past_key_values_format(self):
         """
-        Overwritting to pass the expected cache shapes (Zamba2 has cache shape = [batch_size, 0] for mamba layers)
+        Overwriting to pass the expected cache shapes (Zamba2 has cache shape = [batch_size, 0] for mamba layers)
         """
         config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
         batch_size, seq_length = inputs["input_ids"].shape
diff --git a/tests/quantization/autoround/test_auto_round.py b/tests/quantization/autoround/test_auto_round.py
index fd77597a77d..692c0df43b2 100644
--- a/tests/quantization/autoround/test_auto_round.py
+++ b/tests/quantization/autoround/test_auto_round.py
@@ -152,7 +152,7 @@ class AutoRoundTest(unittest.TestCase):
     def test_convert_from_gptq(self):
         """
-        Simple test that checks if auto-round work properly wth gptq format
+        Simple test that checks if auto-round works properly with gptq format
         """
         model_name = "ybelkada/opt-125m-gptq-4bit"
@@ -170,7 +170,7 @@ class AutoRoundTest(unittest.TestCase):
     @require_intel_extension_for_pytorch
     def test_convert_from_awq_cpu(self):
         """
-        Simple test that checks if auto-round work properly wth awq format
+        Simple test that checks if auto-round works properly with awq format
         """
         model_name = "casperhansen/opt-125m-awq"
@@ -187,7 +187,7 @@ class AutoRoundTest(unittest.TestCase):
     def test_mixed_bits(self):
         """
-        Simple test that checks if auto-round work properly wth mixed bits
+        Simple test that checks if auto-round works properly with mixed bits
         """
         model_name = "facebook/opt-125m"
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
diff --git a/tests/quantization/bnb/README.md b/tests/quantization/bnb/README.md
index 9b26fb60afa..de6e0083163 100644
--- a/tests/quantization/bnb/README.md
+++ b/tests/quantization/bnb/README.md
@@ -67,7 +67,7 @@ Same solution as above.

 ### `RuntimeError: CUDA error: an illegal memory access was encountered ... consider passing CUDA_LAUNCH_BLOCKING=1`

-Run your script by pre-pending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.
+Run your script by prepending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.

 ### `CUDA illegal memory error: an illegal memory access at line...`:
diff --git a/tests/quantization/quark_integration/test_quark.py b/tests/quantization/quark_integration/test_quark.py
index 22d0eb5293a..cfd489b2bed 100644
--- a/tests/quantization/quark_integration/test_quark.py
+++ b/tests/quantization/quark_integration/test_quark.py
@@ -97,7 +97,7 @@ class QuarkTest(unittest.TestCase):
     def test_original_dtype(self):
         r"""
-        A simple test to check if the model succesfully stores the original dtype
+        A simple test to check if the model successfully stores the original dtype
         """
         self.assertTrue(hasattr(self.quantized_model.config, "_pre_quantization_dtype"))
         self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index fca89147f42..f82dea2f4f8 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -3429,7 +3429,7 @@ class ModelTesterMixin:
     def test_sdpa_can_dispatch_non_composite_models(self):
         """
         Tests if non-composite models dispatch correctly on SDPA/eager when requested so when loading the model.
-        This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
+        This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
         """
         if not self.has_attentions:
             self.skipTest(reason="Model architecture does not support attentions")
@@ -3467,7 +3467,7 @@ class ModelTesterMixin:
     def test_sdpa_can_dispatch_composite_models(self):
         """
         Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
-        This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
+        This tests only by looking at layer names, as usually SDPA layers are called "SDPAAttention".
         In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
         is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
         See https://github.com/huggingface/transformers/pull/32238 for more info
diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py
index 0ac880dc25b..f4ffc3cb117 100644
--- a/tests/utils/test_configuration_utils.py
+++ b/tests/utils/test_configuration_utils.py
@@ -185,7 +185,7 @@ class ConfigTestUtils(unittest.TestCase):
     def test_config_common_kwargs_is_complete(self):
         base_config = PretrainedConfig()
         missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs]
-        # If this part of the test fails, you have arguments to addin config_common_kwargs above.
+        # If this part of the test fails, you have arguments to add in config_common_kwargs above.
         self.assertListEqual(
             missing_keys,
             [