diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py
index c9470eeeae8..b24beedcd4f 100755
--- a/scripts/benchmark/trainer-benchmark.py
+++ b/scripts/benchmark/trainer-benchmark.py
@@ -18,7 +18,7 @@
 #
 # --variations allows you to compare variations in multiple dimensions.
 #
-# as the first dimention has 2 options and the second 3 in our example, this will run the trainer 6
+# as the first dimension has 2 options and the second 3 in our example, this will run the trainer 6
 # times adding one of:
 #
 # 1. --tf32 0 --fp16 0
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 003e635a108..85736a9422a 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -405,7 +405,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         self.assertFalse(torch.allclose(good_deepspeed_sin_cos, bad_deepspeed_sin_cos))
         torch.testing.assert_close(good_torch_sin_cos, good_deepspeed_sin_cos.cpu())

-        # Finally, we can see that the incorrect pattern is okay on vanilla torch, demostrating that this issue is
+        # Finally, we can see that the incorrect pattern is okay on vanilla torch, demonstrating that this issue is
         # exclusive to DeepSpeed
         bad_torch_sin_cos = bad_deepspeed_create_sinusoidal_positions(
             model.config.max_position_embeddings, model.config.rotary_dim
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index ef30599581f..ef8010074b4 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -193,7 +193,7 @@ class GenerationConfigTest(unittest.TestCase):
             generation_config_bad_temperature.update(temperature=None)
         self.assertEqual(len(captured_warnings), 0)

-        # Impossible sets of contraints/parameters will raise an exception
+        # Impossible sets of constraints/parameters will raise an exception
         with self.assertRaises(ValueError):
             GenerationConfig(do_sample=False, num_beams=1, num_return_sequences=2)
         with self.assertRaises(ValueError):
diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py
index a922a71c22c..7ba1502d429 100644
--- a/tests/generation/test_logits_process.py
+++ b/tests/generation/test_logits_process.py
@@ -751,7 +751,7 @@ class LogitsProcessorTest(unittest.TestCase):
         scores = self._get_uniform_logits(batch_size, vocab_size)
         processed_scores = logits_processor(input_ids, scores)
         self.assertTrue(torch.isneginf(processed_scores[:, bos_token_id + 1 :]).all())
-        # score for bos_token_id shold be zero
+        # score for bos_token_id should be zero
         self.assertListEqual(processed_scores[:, bos_token_id].tolist(), 4 * [0])

         # processor should not change logits in-place
@@ -972,7 +972,7 @@ class LogitsProcessorTest(unittest.TestCase):

         watermark = WatermarkLogitsProcessor(vocab_size=vocab_size, device=input_ids.device)

-        # use fixed id for last token, needed for reprodicibility and tests
+        # use fixed id for last token, needed for reproducibility and tests
         input_ids[:, -1] = 10
         scores_wo_bias = scores[:, -1].clone()
         out = watermark(input_ids=input_ids, scores=scores)
diff --git a/tests/generation/test_stopping_criteria.py b/tests/generation/test_stopping_criteria.py
index ace7d496dab..375c10c6724 100644
--- a/tests/generation/test_stopping_criteria.py
+++ b/tests/generation/test_stopping_criteria.py
@@ -256,7 +256,7 @@ class StoppingCriteriaTestCase(unittest.TestCase):
             ]
         )

-        # trigger stopping when at leat one criteria is satisfied, one value per batch
+        # trigger stopping when at least one criteria is satisfied, one value per batch
         self.assertTrue(criteria(inputs["input_ids"], scores))

         # return False when neither is satisfied
@@ -283,7 +283,7 @@ class StoppingCriteriaTestCase(unittest.TestCase):
             ]
         )

-        # trigger stopping when at leat one criteria is satisfied
+        # trigger stopping when at least one criteria is satisfied
         self.assertListEqual(criteria(inputs["input_ids"], scores).tolist(), [True, False, False])

         # False when neither is satisfied
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index e6cbe5267a7..daa9e2f70ca 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -173,7 +173,7 @@ class GenerationTesterMixin:
     def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5):
         """
         Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in
-        the following siturations:
+        the following situations:
         1. The sequences are the same
         2. The sequences are different, but the scores up to (and including) the first mismatch are nearly identical
         """
@@ -1617,7 +1617,7 @@
         embed_dim = getattr(text_config, "d_model", text_config.hidden_size)
         per_head_embed_dim = embed_dim // num_attention_heads

-        # some models have diffent num-head for query vs key/value so we need to assign correct value
+        # some models have different num-head for query vs key/value so we need to assign correct value
         # BUT only after `per_head_embed_dim` is set
         num_attention_heads = (
             text_config.num_key_value_heads
@@ -2316,7 +2316,7 @@ class GenerationTesterMixin:
     def _test_attention_implementation(self, attn_implementation):
         """
         Compares the output of generate with the eager attention implementation against other implementations.
-        NOTE: despite the test logic being the same, different implementations actually need diferent decorators, hence
+        NOTE: despite the test logic being the same, different implementations actually need different decorators, hence
         this separate function.
         """
         max_new_tokens = 30
@@ -4619,7 +4619,7 @@ class GenerationIntegrationTests(unittest.TestCase):
         self.assertTrue(diff < 1e-4)

     def test_generate_input_ids_as_kwarg(self):
-        """Test that `input_ids` work equaly as a positional and keyword argument in decoder-only models"""
+        """Test that `input_ids` work equally as a positional and keyword argument in decoder-only models"""
         article = "I need input_ids to generate"
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=15)
@@ -4636,7 +4636,7 @@
         self.assertEqual(output_sequences.shape, (1, 15))

     def test_generate_input_ids_as_encoder_kwarg(self):
-        """Test that `input_ids` work equaly as a positional and keyword argument in encoder-decoder models"""
+        """Test that `input_ids` work equally as a positional and keyword argument in encoder-decoder models"""
         article = "Justin Timberlake and Jessica Biel, welcome to parenthood."
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart")
         model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart")
diff --git a/tests/tensor_parallel/test_tensor_parallel.py b/tests/tensor_parallel/test_tensor_parallel.py
index 6a564e55242..b4e58fd7a0b 100644
--- a/tests/tensor_parallel/test_tensor_parallel.py
+++ b/tests/tensor_parallel/test_tensor_parallel.py
@@ -35,7 +35,7 @@ if is_torch_available():

 class TestTensorParallel(TestCasePlus):
     def torchrun(self, script: str):
-        """Run the `script` using `torchrun` command for multi-processing in a subprocess. Captures errors as necesary."""
+        """Run the `script` using `torchrun` command for multi-processing in a subprocess. Captures errors as necessary."""
         with tempfile.NamedTemporaryFile(mode="w+", suffix=".py") as tmp:
             tmp.write(script)
             tmp.flush()
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index fb6860bcc31..248b43c2f8f 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -599,7 +599,7 @@ class TFModelTesterMixin:
         if model.config.is_encoder_decoder:
             signature = inspect.signature(model.call)
             arg_names = [*signature.parameters.keys()]
-            if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
+            if "decoder_head_mask" in arg_names:  # necessary differentiation because of T5 model
                 inputs["decoder_head_mask"] = head_mask
             if "cross_attn_head_mask" in arg_names:
                 inputs["cross_attn_head_mask"] = head_mask
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 6bbf43e8b92..4b8d774e669 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -241,7 +241,7 @@ def bytes2megabytes(x):
     return int(x / 2**20)


-# Copied from acclerate: https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/src/accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py#L40C1-L73C68
+# Copied from accelerate: https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/src/accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py#L40C1-L73C68
 class TorchTracemalloc:
     def __enter__(self):
         gc.collect()
@@ -4086,7 +4086,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         # Functional check
         self.assertAlmostEqual(loss, orig_loss)

-        # AOT Autograd recomputaion and nvfuser recomputation optimization
+        # AOT Autograd recomputation and nvfuser recomputation optimization
         # aggressively fuses the operations and reduce the memory footprint.
         self.assertGreater(orig_peak_mem, peak_mem * 2)

diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py
index 30dd2ed460c..793225f5ae8 100644
--- a/tests/trainer/test_trainer_seq2seq.py
+++ b/tests/trainer/test_trainer_seq2seq.py
@@ -186,7 +186,7 @@ class Seq2seqTrainerTester(TestCasePlus):

     @require_torch
     def test_bad_generation_config_fail_early(self):
-        # Tests that a bad geneartion config causes the trainer to fail early
+        # Tests that a bad generation config causes the trainer to fail early
         model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
         tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py
index 414a0940ce4..4e755f1d4a5 100644
--- a/tests/utils/test_add_new_model_like.py
+++ b/tests/utils/test_add_new_model_like.py
@@ -436,7 +436,7 @@ NEW_BERT_CONSTANT = "value"

         self.init_file(file_name, bert_test)
         duplicate_module(file_name, bert_model_patterns, new_bert_model_patterns)
-        # There should not be a new Copied from statement, the old one should be adapated.
+        # There should not be a new Copied from statement, the old one should be adapted.
         self.check_result(dest_file_name, bert_expected)

         self.init_file(file_name, bert_test)
diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py
index 1d2682a85b6..b245f279a8e 100644
--- a/tests/utils/test_image_utils.py
+++ b/tests/utils/test_image_utils.py
@@ -996,7 +996,7 @@ class UtilFunctionTester(unittest.TestCase):
         image = np.random.randint(0, 256, (3, 32, 64))
         self.assertEqual(get_image_size(image), (32, 64))

-        # Test the channel dimension can be overriden
+        # Test the channel dimension can be overridden
         image = np.random.randint(0, 256, (3, 32, 64))
         self.assertEqual(get_image_size(image, channel_dim=ChannelDimension.LAST), (3, 32))

diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py
index 9fe7d21b226..233fbcde2ea 100644
--- a/tests/utils/test_modeling_rope_utils.py
+++ b/tests/utils/test_modeling_rope_utils.py
@@ -411,7 +411,7 @@ class RopeTest(unittest.TestCase):
         self.assertEqual(attention_scale, 1.0)

         # Check 2: based on `low_freq_factor` and `high_freq_factor`, the frequencies will be scaled between 1 and
-        # `factor` (similar to yarn). Low frequencies get scaled by `factor`, high frequences see no change, medium
+        # `factor` (similar to yarn). Low frequencies get scaled by `factor`, high frequencies see no change, medium
         # frequencies are scaled by a value in between. Changing `low_freq_factor` and `high_freq_factor` changes what
         # is considered low, medium, and high frequencies.
         factor = 10.0
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index c51ca2c4384..71a400579f6 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1686,7 +1686,7 @@ class ModelUtilsTest(TestCasePlus):
     def test_isin_mps_friendly(self):
         """tests that our custom `isin_mps_friendly` matches `torch.isin`"""
         random_ids = torch.randint(0, 100, (100,))
-        # We can match against an interger
+        # We can match against an integer
         random_test_integer = torch.randint(0, 100, (1,)).item()
         self.assertTrue(
             torch.equal(
@@ -1911,7 +1911,7 @@ class ModelUtilsTest(TestCasePlus):
     @require_torch_gpu
     def test_loading_is_fast_on_gpu(self, model_id: str, max_loading_time: float):
         """
-        This test is used to avoid regresion on https://github.com/huggingface/transformers/pull/36380.
+        This test is used to avoid regression on https://github.com/huggingface/transformers/pull/36380.
         10s should be more than enough for both models, and allows for some margin as loading time are quite unstable.
         Before #36380, it used to take more than 40s, so 10s is still reasonable.
         Note that we run this test in a subprocess, to ensure that cuda is not already initialized/warmed-up.