From 95113d136508dfef192a29d23344e941735d1a1d Mon Sep 17 00:00:00 2001
From: Yulv-git <34329208+Yulv-git@users.noreply.github.com>
Date: Mon, 11 Jul 2022 17:00:13 +0800
Subject: [PATCH] Fix some typos. (#17560)

* Fix some typos.

Signed-off-by: Yulv-git <yulvchi@qq.com>

* Fix typo.

Signed-off-by: Yulv-git <yulvchi@qq.com>

* make fixup.
---
 docs/source/en/custom_models.mdx                          | 2 +-
 docs/source/en/model_doc/speech_to_text.mdx               | 2 +-
 docs/source/en/testing.mdx                                | 2 +-
 examples/legacy/token-classification/utils_ner.py         | 2 +-
 examples/pytorch/speech-pretraining/README.md             | 2 +-
 examples/research_projects/codeparrot/README.md           | 2 +-
 .../distillation/scripts/token_counts.py                  | 2 +-
 examples/research_projects/jax-projects/README.md         | 2 +-
 .../research_projects/jax-projects/big_bird/evaluate.py   | 2 +-
 .../jax-projects/model_parallel/README.md                 | 2 +-
 src/transformers/generation_flax_utils.py                 | 2 +-
 src/transformers/keras_callbacks.py                       | 2 +-
 src/transformers/models/bart/modeling_tf_bart.py          | 2 +-
 .../models/blenderbot/modeling_tf_blenderbot.py           | 2 +-
 .../blenderbot_small/modeling_tf_blenderbot_small.py      | 2 +-
 .../models/data2vec/modeling_data2vec_audio.py            | 2 +-
 src/transformers/models/detr/modeling_detr.py             | 2 +-
 src/transformers/models/electra/modeling_tf_electra.py    | 2 +-
 src/transformers/models/flava/configuration_flava.py      | 6 +++---
 src/transformers/models/hubert/configuration_hubert.py    | 4 ++--
 src/transformers/models/hubert/modeling_hubert.py         | 2 +-
 src/transformers/models/hubert/modeling_tf_hubert.py      | 2 +-
 src/transformers/models/led/modeling_tf_led.py            | 2 +-
 src/transformers/models/lxmert/modeling_lxmert.py         | 4 ++--
 src/transformers/models/marian/modeling_tf_marian.py      | 2 +-
 src/transformers/models/maskformer/modeling_maskformer.py | 2 +-
 src/transformers/models/mbart/modeling_tf_mbart.py        | 2 +-
 src/transformers/models/pegasus/modeling_tf_pegasus.py    | 2 +-
 src/transformers/models/rag/configuration_rag.py          | 2 +-
 src/transformers/models/sew/configuration_sew.py          | 4 ++--
 src/transformers/models/sew/modeling_sew.py               | 2 +-
 src/transformers/models/sew_d/configuration_sew_d.py      | 4 ++--
 src/transformers/models/sew_d/modeling_sew_d.py           | 2 +-
 .../models/unispeech/configuration_unispeech.py           | 4 ++--
 src/transformers/models/unispeech/modeling_unispeech.py   | 2 +-
 .../models/unispeech_sat/configuration_unispeech_sat.py   | 4 ++--
 .../models/unispeech_sat/modeling_unispeech_sat.py        | 2 +-
 .../models/wav2vec2/modeling_flax_wav2vec2.py             | 2 +-
 src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py  | 2 +-
 src/transformers/models/wav2vec2/modeling_wav2vec2.py     | 2 +-
 .../wav2vec2_conformer/modeling_wav2vec2_conformer.py     | 2 +-
 src/transformers/models/wavlm/configuration_wavlm.py      | 4 ++--
 src/transformers/models/wavlm/modeling_wavlm.py           | 2 +-
 src/transformers/models/yolos/modeling_yolos.py           | 2 +-
 src/transformers/onnx/config.py                           | 2 +-
 src/transformers/training_args.py                         | 6 +++---
 .../adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md | 2 +-
 .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py   | 2 +-
 tests/models/layoutlmv2/test_tokenization_layoutlmv2.py   | 8 ++++++--
 tests/models/layoutlmv3/test_tokenization_layoutlmv3.py   | 8 ++++++--
 tests/models/vit_mae/test_modeling_tf_vit_mae.py          | 2 +-
 tests/pipelines/test_pipelines_token_classification.py    | 2 +-
 tests/test_tokenization_common.py                         | 8 ++++++--
 utils/check_table.py                                      | 2 +-
 54 files changed, 80 insertions(+), 68 deletions(-)

diff --git a/docs/source/en/custom_models.mdx b/docs/source/en/custom_models.mdx
index 05d0a6c2acf..50770be8445 100644
--- a/docs/source/en/custom_models.mdx
+++ b/docs/source/en/custom_models.mdx
@@ -289,7 +289,7 @@ from huggingface_hub import notebook_login
 notebook_login()
 ```
 
-You can then push to to your own namespace (or an organization you are a member of) like this:
+You can then push to your own namespace (or an organization you are a member of) like this:
 
 ```py
 resnet50d.push_to_hub("custom-resnet50d")
diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx
index 0a3b00b1d5d..e11d95442d2 100644
--- a/docs/source/en/model_doc/speech_to_text.mdx
+++ b/docs/source/en/model_doc/speech_to_text.mdx
@@ -37,7 +37,7 @@ predicted token ids.
 
 The feature extractor depends on `torchaudio` and the tokenizer depends on `sentencepiece` so be sure to
 install those packages before running the examples. You could either install those as extra speech dependencies with
-`pip install transformers"[speech, sentencepiece]"` or install the packages seperately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
+`pip install transformers"[speech, sentencepiece]"` or install the packages separately with `pip install torchaudio sentencepiece`. Also `torchaudio` requires the development version of the [libsndfile](http://www.mega-nerd.com/libsndfile/) package which can be installed via a system package manager. On Ubuntu it can
 be installed as follows: `apt install libsndfile1-dev`
 
 
diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx
index a5e2268092e..23c0be7f1ab 100644
--- a/docs/source/en/testing.mdx
+++ b/docs/source/en/testing.mdx
@@ -1226,7 +1226,7 @@ This whole process would have been much easier if we only could set something li
 experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and
 Github Actions don't support it at the moment.
 
-You can vote for this feature and see where it is at at these CI-specific threads:
+You can vote for this feature and see where it is at these CI-specific threads:
 
 - [Github Actions:](https://github.com/actions/toolkit/issues/399)
 - [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344)
diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py
index e1fb4d18c70..35fcb5ef5b7 100644
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -140,7 +140,7 @@ class TokenClassificationTask:
             # it easier for the model to learn the concept of sequences.
             #
             # For classification tasks, the first vector (corresponding to [CLS]) is
-            # used as as the "sentence vector". Note that this only makes sense because
+            # used as the "sentence vector". Note that this only makes sense because
             # the entire model is fine-tuned.
             tokens += [sep_token]
             label_ids += [pad_token_label_id]
diff --git a/examples/pytorch/speech-pretraining/README.md b/examples/pytorch/speech-pretraining/README.md
index fc8e16623aa..1d57fc8e72d 100644
--- a/examples/pytorch/speech-pretraining/README.md
+++ b/examples/pytorch/speech-pretraining/README.md
@@ -43,7 +43,7 @@ A good metric to observe during training is the gradient norm which should ideal
 
 When training a model on large datasets it is recommended to run the data preprocessing 
 in a first run in a **non-distributed** mode via `--preprocessing_only` so that 
-when running the  model in **distributed** mode in a second step the preprocessed data
+when running the model in **distributed** mode in a second step the preprocessed data
 can easily be loaded on each distributed device.
 
 ---
diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index 1eaf063d8ec..50369ab4eb4 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -91,7 +91,7 @@ python scripts/initialize_model.py \
 --model_name codeparrot \
 --push_to_hub True
 ```
-This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the the hub.
+This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.
 
 We can either pass the name of a text dataset or a pretokenized dataset which speeds up training a bit.
 Now that the tokenizer and model are also ready we can start training the model. The main training script is built with `accelerate` to scale across a wide range of platforms and infrastructure scales. We train two models with [110M](https://huggingface.co/lvwerra/codeparrot-small/) and [1.5B](https://huggingface.co/lvwerra/codeparrot/) parameters for 25-30B tokens on a 16xA100 (40GB) machine which takes 1 day and 1 week, respectively.
diff --git a/examples/research_projects/distillation/scripts/token_counts.py b/examples/research_projects/distillation/scripts/token_counts.py
index aa223fda703..736b564ee76 100644
--- a/examples/research_projects/distillation/scripts/token_counts.py
+++ b/examples/research_projects/distillation/scripts/token_counts.py
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     with open(args.data_file, "rb") as fp:
         data = pickle.load(fp)
 
-    logger.info("Counting occurences for MLM.")
+    logger.info("Counting occurrences for MLM.")
     counter = Counter()
     for tk_ids in data:
         counter.update(tk_ids)
diff --git a/examples/research_projects/jax-projects/README.md b/examples/research_projects/jax-projects/README.md
index 56316ef940a..0b3f0dc5d24 100644
--- a/examples/research_projects/jax-projects/README.md
+++ b/examples/research_projects/jax-projects/README.md
@@ -49,7 +49,7 @@ At the end of the community week, each team should submit a demo of their projec
 
 - **23.06.** Official announcement of the community week. Make sure to sign-up in [this google form](https://forms.gle/tVGPhjKXyEsSgUcs8).
 - **23.06. - 30.06.** Participants will be added to an internal Slack channel. Project ideas can be proposed here and groups of 3-5 are formed. Read this document for more information. 
-- **30.06.** Release of all relevant training scripts in JAX/Flax as well as other documents on how to set up a TPU, how to use the training scripts, how to submit a demo, tips & tricks for JAX/Flax, tips & tricks for efficient use of the   hub. 
+- **30.06.** Release of all relevant training scripts in JAX/Flax as well as other documents on how to set up a TPU, how to use the training scripts, how to submit a demo, tips & tricks for JAX/Flax, tips & tricks for efficient use of the hub.
 - **30.06. - 2.07.** Talks about JAX/Flax, TPU, Transformers, Computer Vision & NLP will be held. 
 - **7.07.** Start of the community week! Access to TPUv3-8 will be given to each team.
 - **7.07. - 14.07.** The Hugging Face & JAX/Flax & Cloud team will be available for any questions, problems the teams might run into.
diff --git a/examples/research_projects/jax-projects/big_bird/evaluate.py b/examples/research_projects/jax-projects/big_bird/evaluate.py
index de01e8fc81a..e3309f494e3 100644
--- a/examples/research_projects/jax-projects/big_bird/evaluate.py
+++ b/examples/research_projects/jax-projects/big_bird/evaluate.py
@@ -106,7 +106,7 @@ def main():
         return start_logits, end_logits, jnp.argmax(pooled_logits, axis=-1)
 
     def evaluate(example):
-        # encode question and context so that they are seperated by a tokenizer.sep_token and cut at max_length
+        # encode question and context so that they are separated by a tokenizer.sep_token and cut at max_length
         inputs = tokenizer(
             example["question"],
             example["context"],
diff --git a/examples/research_projects/jax-projects/model_parallel/README.md b/examples/research_projects/jax-projects/model_parallel/README.md
index 6b6998b56ad..b63b93862db 100644
--- a/examples/research_projects/jax-projects/model_parallel/README.md
+++ b/examples/research_projects/jax-projects/model_parallel/README.md
@@ -22,7 +22,7 @@ the JAX/Flax backend and the [`pjit`](https://jax.readthedocs.io/en/latest/jax.e
 > Note: The example is experimental and might have bugs. Also currently it only supports single V3-8.
 
 The `partition.py` file defines the `PyTree` of `ParitionSpec` for the GPTNeo model which describes how the model will be sharded.
-The actual sharding is auto-matically handled by `pjit`. The weights are sharded accross all local devices.
+The actual sharding is auto-matically handled by `pjit`. The weights are sharded across all local devices.
 To adapt the script for other models, we need to also change the `ParitionSpec` accordingly.
 
 TODO: Add more explantion.
diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py
index 97c0ce0cc0f..59d66a0fe2b 100644
--- a/src/transformers/generation_flax_utils.py
+++ b/src/transformers/generation_flax_utils.py
@@ -78,7 +78,7 @@ class FlaxBeamSearchOutput(ModelOutput):
         sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
             The generated sequences.
         scores (`jnp.ndarray` of shape `(batch_size,)`):
-            The scores (log probabilites) of the generated sequences.
+            The scores (log probabilities) of the generated sequences.
     """
 
     sequences: jnp.ndarray = None
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index 812e932f53b..90042c5e35e 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -277,7 +277,7 @@ class PushToHubCallback(Callback):
             for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
             `"organization_name/model"`.
 
-            Will default to to the name of `output_dir`.
+            Will default to the name of `output_dir`.
         hub_token (`str`, *optional*):
             The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
             `huggingface-cli login`.
diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py
index 1e211ee0fcf..e483bafcc58 100644
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -1267,7 +1267,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
         super().__init__(config, *inputs, **kwargs)
         self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
index 2bede02ab25..cdd68daafea 100644
--- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
@@ -1253,7 +1253,7 @@ class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausal
         super().__init__(config, *inputs, **kwargs)
         self.model = TFBlenderbotMainLayer(config, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
index 501b3e9df10..1be80e46f38 100644
--- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
@@ -1240,7 +1240,7 @@ class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel
         super().__init__(config, *inputs, **kwargs)
         self.model = TFBlenderbotSmallMainLayer(config, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py
index 63712863bfc..70d802a8015 100755
--- a/src/transformers/models/data2vec/modeling_data2vec_audio.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -184,7 +184,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 63ea5698f1b..e5d2a04c1f4 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -2054,7 +2054,7 @@ class DetrLoss(nn.Module):
         # Retrieve the matching between the outputs of the last layer and the targets
         indices = self.matcher(outputs_without_aux, targets)
 
-        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        # Compute the average number of target boxes across all nodes, for normalization purposes
         num_boxes = sum(len(t["class_labels"]) for t in targets)
         num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
         # (Niels): comment out function below, distributed training to be added
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 57f17c8a974..2ac72c2371d 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -212,7 +212,7 @@ class TFElectraSelfOutput(tf.keras.layers.Layer):
         return hidden_states
 
 
-# Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra
 class TFElectraAttention(tf.keras.layers.Layer):
     def __init__(self, config: ElectraConfig, **kwargs):
         super().__init__(**kwargs)
diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py
index c42c9008640..6bd782a9eac 100644
--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -83,7 +83,7 @@ class FlavaImageConfig(PretrainedConfig):
     >>> # Initializing a FlavaImageModel with  style configuration
     >>> configuration = FlavaImageConfig()
 
-    >>> # Initializing a FlavaImageModel model from the  style configuration
+    >>> # Initializing a FlavaImageModel model from the style configuration
     >>> model = FlavaImageModel(configuration)
 
     >>> # Accessing the model configuration
@@ -212,7 +212,7 @@ class FlavaTextConfig(PretrainedConfig):
     >>> # Initializing a FlavaTextModel with  style configuration
     >>> configuration = FlavaTextConfig()
 
-    >>> # Initializing a FlavaTextConfig from the  style configuration
+    >>> # Initializing a FlavaTextConfig from the style configuration
     >>> model = FlavaTextModel(configuration)
 
     >>> # Accessing the model configuration
@@ -321,7 +321,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
     >>> # Initializing a FlavaMultimodalModel with  style configuration
     >>> configuration = FlavaMultimodalConfig()
 
-    >>> # Initializing a FlavaMultimodalModel model from the  style configuration
+    >>> # Initializing a FlavaMultimodalModel model from the style configuration
     >>> model = FlavaMultimodalModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py
index 621537f493b..be2e6bbf4c7 100644
--- a/src/transformers/models/hubert/configuration_hubert.py
+++ b/src/transformers/models/hubert/configuration_hubert.py
@@ -82,10 +82,10 @@ class HubertConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index c2b745f6d5a..d6cb6b8e059 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -174,7 +174,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index bc442ad4ae6..3bc8fdc3c4f 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -203,7 +203,7 @@ def _compute_mask_indices(
     Computes random mask spans for a given shape
 
     Args:
-        shape: the the shape for which to compute masks.
+        shape: the shape for which to compute masks.
             should be of size 2 where first element is batch size and 2nd is timesteps
         attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
         mask_prob:
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 846ba06e0e6..ca8dc26de9d 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -2330,7 +2330,7 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
         super().__init__(config, *inputs, **kwargs)
         self.led = TFLEDMainLayer(config, name="led")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py
index 749fd9ea0c4..6ba852afcb1 100644
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -1110,7 +1110,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
 
     def get_qa_logit_layer(self) -> nn.Module:
         """
-        Returns the the linear layer that produces question answering logits.
+        Returns the linear layer that produces question answering logits.
 
         Returns:
             `nn.Module`: A torch module mapping the question answering prediction hidden states or `None` if LXMERT
@@ -1341,7 +1341,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
 
     def get_qa_logit_layer(self) -> nn.Module:
         """
-        Returns the the linear layer that produces question answering logits
+        Returns the linear layer that produces question answering logits
 
         Returns:
             `nn.Module`: A torch module mapping the question answering prediction hidden states. `None`: A NoneType
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index d356b4f8424..d5e4dfce1c3 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -1283,7 +1283,7 @@ class TFMarianMTModel(TFMarianPreTrainedModel, TFCausalLanguageModelingLoss):
         super().__init__(config, *inputs, **kwargs)
         self.model = TFMarianMainLayer(config, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py
index 2932ee6f73d..1266dbfdad8 100644
--- a/src/transformers/models/maskformer/modeling_maskformer.py
+++ b/src/transformers/models/maskformer/modeling_maskformer.py
@@ -1912,7 +1912,7 @@ class MaskFormerLoss(nn.Module):
 
     def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
         """
-        Computes the average number of target masks accross the batch, for normalization purposes.
+        Computes the average number of target masks across the batch, for normalization purposes.
         """
         num_masks = sum([len(classes) for classes in class_labels])
         num_masks_pt = torch.as_tensor([num_masks], dtype=torch.float, device=device)
diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py
index b33de11113a..84ce5d7a6c1 100644
--- a/src/transformers/models/mbart/modeling_tf_mbart.py
+++ b/src/transformers/models/mbart/modeling_tf_mbart.py
@@ -1280,7 +1280,7 @@ class TFMBartForConditionalGeneration(TFMBartPreTrainedModel, TFCausalLanguageMo
         super().__init__(config, *inputs, **kwargs)
         self.model = TFMBartMainLayer(config, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py
index 578369e774e..04941a24b90 100644
--- a/src/transformers/models/pegasus/modeling_tf_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py
@@ -1292,7 +1292,7 @@ class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLangua
         super().__init__(config, *inputs, **kwargs)
         self.model = TFPegasusMainLayer(config, name="model")
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index 2897642a754..6046b934cd6 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -28,7 +28,7 @@ RAG_CONFIG_DOC = r"""
         title_sep (`str`, *optional*, defaults to  `" / "`):
             Separator inserted between the title and the text of the retrieved document when calling [`RagRetriever`].
         doc_sep (`str`, *optional*, defaults to  `" // "`):
-            Separator inserted between the the text of the retrieved document and the original input when calling
+            Separator inserted between the text of the retrieved document and the original input when calling
             [`RagRetriever`].
         n_docs (`int`, *optional*, defaults to 5):
             Number of documents to retrieve.
diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py
index e9665baeede..c955c0e48fe 100644
--- a/src/transformers/models/sew/configuration_sew.py
+++ b/src/transformers/models/sew/configuration_sew.py
@@ -81,10 +81,10 @@ class SEWConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index 16b09bd2af7..632f7d4880f 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -174,7 +174,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py
index b078623cfda..8461dfef451 100644
--- a/src/transformers/models/sew_d/configuration_sew_d.py
+++ b/src/transformers/models/sew_d/configuration_sew_d.py
@@ -99,10 +99,10 @@ class SEWDConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index 8974bcd6f37..8dc210d06ca 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -175,7 +175,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py
index 733e68e627b..0c687356de0 100644
--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -85,10 +85,10 @@ class UniSpeechConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index 744fbe73121..dc194318e99 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -210,7 +210,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
index bc8663587d9..3205bbc2cca 100644
--- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
@@ -86,10 +86,10 @@ class UniSpeechSatConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 2ed4f43b515..926464d3bf8 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -224,7 +224,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index 7a3c6dfc5d3..68cce7d7d40 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -120,7 +120,7 @@ def _compute_mask_indices(
     CPU as part of the preprocessing during training.
 
     Args:
-        shape: the the shape for which to compute masks.
+        shape: the shape for which to compute masks.
             should be of size 2 where first element is batch size and 2nd is timesteps
         mask_prob:
             probability for each token to be chosen as start of the span to be masked. this will be multiplied by
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index 7ca249eac7c..999aae995ae 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -244,7 +244,7 @@ def _compute_mask_indices(
     Computes random mask spans for a given shape
 
     Args:
-        shape: the the shape for which to compute masks.
+        shape: the shape for which to compute masks.
             should be of size 2 where first element is batch size and 2nd is timesteps
         attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
         mask_prob:
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index 69ac9005332..9f678080039 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -234,7 +234,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
index 1cdab6f9041..a8ff9b5b20f 100644
--- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
@@ -231,7 +231,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py
index 3257d1e986c..7c908d3d730 100644
--- a/src/transformers/models/wavlm/configuration_wavlm.py
+++ b/src/transformers/models/wavlm/configuration_wavlm.py
@@ -82,10 +82,10 @@ class WavLMConfig(PretrainedConfig):
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
-            of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
-            length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py
index d945545af48..c792a368cb4 100755
--- a/src/transformers/models/wavlm/modeling_wavlm.py
+++ b/src/transformers/models/wavlm/modeling_wavlm.py
@@ -183,7 +183,7 @@ def _compute_mask_indices(
     )
     spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
 
-    # add offset to the starting indexes so that that indexes now create a span
+    # add offset to the starting indexes so that indexes now create a span
     offsets = np.arange(mask_length)[None, None, :]
     offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
         batch_size, max_num_masked_span * mask_length
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index 0d640212c21..447cec23de9 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -1069,7 +1069,7 @@ class YolosLoss(nn.Module):
         # Retrieve the matching between the outputs of the last layer and the targets
         indices = self.matcher(outputs_without_aux, targets)
 
-        # Compute the average number of target boxes accross all nodes, for normalization purposes
+        # Compute the average number of target boxes across all nodes, for normalization purposes
         num_boxes = sum(len(t["class_labels"]) for t in targets)
         num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
         # (Niels): comment out function below, distributed training to be added
diff --git a/src/transformers/onnx/config.py b/src/transformers/onnx/config.py
index 6097ebf49aa..fdcc12bdcd1 100644
--- a/src/transformers/onnx/config.py
+++ b/src/transformers/onnx/config.py
@@ -487,7 +487,7 @@ class OnnxConfigWithPast(OnnxConfig, ABC):
 
     def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
         """
-        Fill the input_or_ouputs mapping with past_key_values dynamic axes considering.
+        Fill the input_or_outputs mapping with past_key_values dynamic axes considering.
 
         Args:
             inputs_or_outputs: The mapping to fill.
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index f65125e348c..603015bf989 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -412,8 +412,8 @@ class TrainingArguments:
             down the training and evaluation speed.
         push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether or not to push the model to the Hub every time the model is saved. If this is activated,
-            `output_dir` will begin a git directory synced with the the repo (determined by `hub_model_id`) and the
-            content will be pushed each time a save is triggered (depending on your `save_strategy`). Calling
+            `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content
+            will be pushed each time a save is triggered (depending on your `save_strategy`). Calling
             [`~Trainer.save_model`] will also trigger a push.
 
             <Tip warning={true}>
@@ -434,7 +434,7 @@ class TrainingArguments:
             `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the
             name of `output_dir`.
 
-            Will default to to the name of `output_dir`.
+            Will default to the name of `output_dir`.
         hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`):
             Defines the scope of what is pushed to the Hub and when. Possible values are:
 
diff --git a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
index 3b2de6f3c09..2066356470f 100644
--- a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
+++ b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
@@ -990,7 +990,7 @@ tokenizer.
 For [camelcase name of model], the tokenizer files can be found here:
 - [To be filled out by mentor]
 
-and having implemented the  🤗Transformers' version of the tokenizer can be loaded as follows:
+and having implemented the 🤗 Transformers' version of the tokenizer can be loaded as follows:
 
 [To be filled out by mentor]
 
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
index c5224bfccb3..487b7c4461b 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -2821,7 +2821,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiec
         self.model = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="model")
         self.model._set_save_spec(inputs=self.serving.input_signature)
         self.use_cache = config.use_cache
-        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency.
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
         self.final_logits_bias = self.add_weight(
             name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
         )
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index 78f78c33e7f..049caae6419 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -2183,7 +2183,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
                 total_length = len(sequence["input_ids"])
 
-                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+                self.assertGreater(
+                    total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+                )
 
                 # Test with max model input length
                 model_max_length = tokenizer.model_max_length
@@ -2193,7 +2195,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
                 self.assertGreater(
-                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                    total_length1,
+                    model_max_length,
+                    "Issue with the testing sequence, please update it, it's too short",
                 )
 
                 # Simple
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index ae12129e787..239939ca269 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -2097,7 +2097,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)
                 total_length = len(sequence["input_ids"])
 
-                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+                self.assertGreater(
+                    total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+                )
 
                 # Test with max model input length
                 model_max_length = tokenizer.model_max_length
@@ -2107,7 +2109,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
                 self.assertGreater(
-                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                    total_length1,
+                    model_max_length,
+                    "Issue with the testing sequence, please update it, it's too short",
                 )
 
                 # Simple
diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py
index 465b30c5cde..906c79e766f 100644
--- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py
@@ -281,7 +281,7 @@ class TFViTMAEModelTest(TFModelTesterMixin, unittest.TestCase):
         super().check_pt_tf_models(tf_model, pt_model, tf_inputs_dict)
 
     # overwrite from common since TFViTMAEForPretraining outputs loss along with
-    # logits and mask indices. loss and mask indicies are not suitable for integration
+    # logits and mask indices. loss and mask indices are not suitable for integration
     # with other keras modules.
     def test_compile_tf_model(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py
index 26cfa0d3be3..45916ec31df 100644
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -278,7 +278,7 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
         NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"
         model = AutoModelForTokenClassification.from_pretrained(NER_MODEL)
         tokenizer = AutoTokenizer.from_pretrained(NER_MODEL, use_fast=True)
-        sentence = """Enzo works at the the UN"""
+        sentence = """Enzo works at the UN"""
         token_classifier = pipeline("ner", model=model, tokenizer=tokenizer)
         output = token_classifier(sentence)
         self.assertEqual(
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 2abff6bda98..7e1a6c7945d 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -990,7 +990,9 @@ class TokenizerTesterMixin:
                 sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                 total_length = len(sequence)
 
-                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")
+                self.assertGreater(
+                    total_length, 4, "Issue with the testing sequence, please update it, it's too short"
+                )
 
                 # Test with max model input length
                 model_max_length = tokenizer.model_max_length
@@ -1000,7 +1002,9 @@ class TokenizerTesterMixin:
                 sequence1 = tokenizer(seq_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
                 self.assertGreater(
-                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                    total_length1,
+                    model_max_length,
+                    "Issue with the testing sequence, please update it, it's too short",
                 )
 
                 # Simple
diff --git a/utils/check_table.py b/utils/check_table.py
index d59f3e7b1e5..96d0cf23d26 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -53,7 +53,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt):
     return "".join(lines[start_index:end_index]), start_index, end_index, lines
 
 
-# Add here suffixes that are used to identify models, seperated by |
+# Add here suffixes that are used to identify models, separated by |
 ALLOWED_MODEL_SUFFIXES = "Model|Encoder|Decoder|ForConditionalGeneration"
 # Regexes that match TF/Flax/PT model names.
 _re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)")