diff --git a/.circleci/config.yml b/.circleci/config.yml index 7fd676c761a..4e6718005ab 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -209,6 +209,7 @@ jobs: - run: make deps_table_check_updated - run: python utils/update_metadata.py --check-only - run: python utils/check_task_guides.py + - run: python utils/check_docstrings.py workflows: version: 2 diff --git a/Makefile b/Makefile index 2c2f3786f7c..0c51598594c 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,7 @@ repo-consistency: python utils/check_doctest_list.py python utils/update_metadata.py --check-only python utils/check_task_guides.py + python utils/check_docstrings.py # this target runs checks on all files @@ -82,6 +83,7 @@ fix-copies: python utils/check_dummies.py --fix_and_overwrite python utils/check_doctest_list.py --fix_and_overwrite python utils/check_task_guides.py --fix_and_overwrite + python utils/check_docstrings.py --fix_and_overwrite # Run tests for the library diff --git a/docs/source/en/pr_checks.md b/docs/source/en/pr_checks.md index c5a2e539c02..f50cede3264 100644 --- a/docs/source/en/pr_checks.md +++ b/docs/source/en/pr_checks.md @@ -124,6 +124,7 @@ This checks that: - The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`) - The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`) - The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`) +- All docstrings properly document the arguments in the signature of the object (performed by `utils/check_docstrings.py`) Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 74086ca2d7f..c718fc53231 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -47,6 +47,7 @@ _re_configuration_file = re.compile(r"config\.(.*)\.json") class PretrainedConfig(PushToHubMixin): + # no-format r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index a68ee3b83d9..57879ec3e2d 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -90,7 +90,7 @@ class DefaultDataCollator(DataCollatorMixin): helpful if you need to set a return_tensors value at initialization. Args: - return_tensors (`str`): + return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ @@ -235,7 +235,7 @@ class DataCollatorWithPadding: This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - return_tensors (`str`): + return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ @@ -288,7 +288,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin): 7.5 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). 
- return_tensors (`str`): + return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ @@ -521,7 +521,7 @@ class DataCollatorForSeq2Seq: Args: tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]): The tokenizer used for encoding the data. - model ([`PreTrainedModel`]): + model ([`PreTrainedModel`], *optional*): The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to prepare the *decoder_input_ids* @@ -544,7 +544,7 @@ class DataCollatorForSeq2Seq: 7.5 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions). - return_tensors (`str`): + return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index 838827f8c5c..77123fc3ec9 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -65,7 +65,7 @@ class BatchFeature(UserDict): This class is derived from a python dictionary and can be used as a dictionary. Args: - data (`dict`): + data (`dict`, *optional*): Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): diff --git a/src/transformers/generation/beam_constraints.py b/src/transformers/generation/beam_constraints.py index 2563ac23cd0..b53c4512427 100644 --- a/src/transformers/generation/beam_constraints.py +++ b/src/transformers/generation/beam_constraints.py @@ -263,8 +263,9 @@ class DisjunctiveConstraint(Constraint): A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints. Args: - nested_token_ids (`List[List[int]]`): a list of words, where each word is a list of ids. This constraint - is fulfilled by generating just one from the list of words. + nested_token_ids (`List[List[int]]`): + A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from + the list of words. """ def __init__(self, nested_token_ids: List[List[int]]): diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index cf729bb45af..03334b6b614 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -152,7 +152,7 @@ class BeamSearchScorer(BeamScorer): num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): The number of beam hypotheses that shall be returned upon calling [`~transformer.BeamSearchScorer.finalize`]. - num_beam_groups (`int`): + num_beam_groups (`int`, *optional*, defaults to 1): Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. max_length (`int`, *optional*): @@ -437,7 +437,7 @@ class ConstrainedBeamSearchScorer(BeamScorer): num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): The number of beam hypotheses that shall be returned upon calling [`~transformer.BeamSearchScorer.finalize`]. - num_beam_groups (`int`): + num_beam_groups (`int`, *optional*, defaults to 1): Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. 
See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. max_length (`int`, *optional*): diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 94d0f823ed8..18ccdb2835b 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -38,6 +38,7 @@ METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash" class GenerationConfig(PushToHubMixin): + # no-format r""" Class that holds a configuration for a generation task. A `generate` call supports the following generation methods for text-decoder, text-to-text, speech-to-text, and vision-to-text models: diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index e6b45ded804..5c30b92755a 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -120,7 +120,7 @@ class FlaxTopPLogitsWarper(FlaxLogitsWarper): top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -163,7 +163,7 @@ class FlaxTopKLogitsWarper(FlaxLogitsWarper): Args: top_k (`int`): The number of highest probability vocabulary tokens to keep for top-k-filtering. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 8f482ad8af1..14f772ab6c9 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -357,7 +357,7 @@ class TopPLogitsWarper(LogitsWarper): top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -419,7 +419,7 @@ class TopKLogitsWarper(LogitsWarper): Args: top_k (`int`): The number of highest probability vocabulary tokens to keep for top-k-filtering. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -447,9 +447,9 @@ class TypicalLogitsWarper(LogitsWarper): Generation](https://arxiv.org/abs/2202.00666) for more information. Args: - mass (`float`): + mass (`float`, *optional*, defaults to 0.9): Value of typical_p between 0 and 1 inclusive, defaults to 0.9. 
- filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -499,7 +499,7 @@ class EpsilonLogitsWarper(LogitsWarper): Args: epsilon (`float`): If set to > 0, only the most tokens with probabilities `epsilon` or higher are kept for generation. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -572,7 +572,7 @@ class EtaLogitsWarper(LogitsWarper): epsilon (`float`): A float value in the range (0, 1). Hyperparameter used to calculate the dynamic cutoff value, `eta`. The suggested values from the paper ranges from 3e-4 to 4e-3 depending on the size of the model. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All values that are found to be below the dynamic cutoff value, `eta`, are set to this float value. This parameter is useful when logits need to be modified for very low probability tokens that should be excluded from generation entirely. @@ -1600,18 +1600,15 @@ class UnbatchedClassifierFreeGuidanceLogitsProcessor(LogitsProcessor): Higher guidance scale encourages the model to generate samples that are more closely linked to the input prompt, usually at the expense of poorer quality. A value smaller than 1 has the opposite effect, while making the negative prompt provided with negative_prompt_ids (if any) act as a positive prompt. - unconditional_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary for the unconditional branch. If unset, will default to - the last token of the prompt. - unconditional_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, **optional**): - Attention mask for unconditional_ids. model (`PreTrainedModel`): The model computing the unconditional scores. Supposedly the same as the one computing the conditional scores. Both models must use the same tokenizer. - smooth_factor (`float`, **optional**): - The interpolation weight for CFG Rescale. 1 means no rescaling, 0 reduces to the conditional scores without - CFG. Turn it lower if the output degenerates. - use_cache (`bool`, **optional**): + unconditional_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary for the unconditional branch. If unset, will default to + the last token of the prompt. + unconditional_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Attention mask for unconditional_ids. + use_cache (`bool`, *optional*, defaults to `True`): Whether to cache key/values during the negative prompt forward pass. diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py index 4e0a294e7c3..8929bacd84a 100644 --- a/src/transformers/generation/stopping_criteria.py +++ b/src/transformers/generation/stopping_criteria.py @@ -49,7 +49,7 @@ class MaxLengthCriteria(StoppingCriteria): Args: max_length (`int`): The maximum length that the output sequence can have in number of tokens. 
- max_position_embeddings (`int`, `optional`): + max_position_embeddings (`int`, *optional*): The maximum model length, as defined by the model's `config.max_position_embeddings` attribute. """ diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py index 02e33caf79a..fc9799b7ab3 100644 --- a/src/transformers/generation/tf_logits_process.py +++ b/src/transformers/generation/tf_logits_process.py @@ -122,7 +122,7 @@ class TFTopKLogitsWarper(TFLogitsWarper): Args: top_k (`int`): The number of highest probability vocabulary tokens to keep for top-k-filtering. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. @@ -151,7 +151,7 @@ class TFTopPLogitsWarper(TFLogitsWarper): top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. - filter_value (`float`, *optional*, defaults to `-float("Inf")`): + filter_value (`float`, *optional*, defaults to -inf): All filtered values will be set to this float value. min_tokens_to_keep (`int`, *optional*, defaults to 1): Minimum number of tokens that cannot be filtered. diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 818a4f7c4a2..74cfbfbe338 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -71,6 +71,8 @@ class AlignTextConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to @@ -80,8 +82,6 @@ class AlignTextConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0) - Padding token id. Example: diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index c944345e866..431c61565ba 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -259,7 +259,7 @@ class AltCLIPConfig(PretrainedConfig): Dictionary of configuration options used to initialize [`AltCLIPTextConfig`]. vision_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`AltCLIPVisionConfig`]. - projection_dim (`int`, *optional*, defaults to 512): + projection_dim (`int`, *optional*, defaults to 768): Dimentionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. 
diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 4cbf7fe192f..102535bc5b0 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -30,9 +30,9 @@ class AltCLIPProcessor(ProcessorMixin): the [`~AltCLIPProcessor.__call__`] and [`~AltCLIPProcessor.decode`] for more information. Args: - image_processor ([`CLIPImageProcessor`]): + image_processor ([`CLIPImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`XLMRobertaTokenizerFast`]): + tokenizer ([`XLMRobertaTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 22b0ca70ac8..23a2d83e78a 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -51,15 +51,15 @@ class ASTConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - patch_size (`int`, *optional*, defaults to `16`): + patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 7084c8b5a93..2b381327592 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -38,7 +38,7 @@ class BarkProcessor(ProcessorMixin): Args: tokenizer ([`PreTrainedTokenizer`]): An instance of [`PreTrainedTokenizer`]. - speaker_embeddings (`Dict[Dict[str]]`, *optional*, defaults to `None`): + speaker_embeddings (`Dict[Dict[str]]`, *optional*): Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"` embeddings. The values correspond to the path of the corresponding `np.ndarray`. 
See diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 5fd851b379c..586801eed86 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -97,8 +97,6 @@ class BarthezTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index 74e6ad8f9e2..6b9dc266b29 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -92,8 +92,6 @@ class BartphoTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index ef7bf22b918..e554f45f791 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -41,7 +41,7 @@ class BeitConfig(PretrainedConfig): [microsoft/beit-base-patch16-224-pt22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k) architecture. Args: - vocab_size (`int`, *optional*, defaults to 8092): + vocab_size (`int`, *optional*, defaults to 8192): Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during pre-training. hidden_size (`int`, *optional*, defaults to 768): diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 930934bbefd..6f8ce403e0a 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -57,7 +57,7 @@ class BeitImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. 
do_center_crop (`bool`, *optional*, defaults to `True`): @@ -67,12 +67,12 @@ class BeitImageProcessor(BaseImageProcessor): crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. Can be overridden by the `crop_size` parameter in the `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 13846a5089a..75975680dde 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -77,7 +77,7 @@ class BertweetTokenizer(PreTrainedTokenizer): Path to the vocabulary file. merges_file (`str`): Path to the merges file. - normalization (`bool`, *optional*, defaults to `False`) + normalization (`bool`, *optional*, defaults to `False`): Whether or not to apply a normalization preprocess. bos_token (`str`, *optional*, defaults to `""`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 8e720a54257..12041a4ce11 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -60,25 +60,25 @@ class BigBirdTokenizer(PreTrainedTokenizer): vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - bos_token (`str`, *optional*, defaults to `""`): - The begin of sequence token. unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. pad_token (`str`, *optional*, defaults to `""`): The token used for padding, for example when batching sequences of different lengths. sep_token (`str`, *optional*, defaults to `"[SEP]"`): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. 
- cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/biogpt/configuration_biogpt.py b/src/transformers/models/biogpt/configuration_biogpt.py index 2fe46354d29..b6911e2ef90 100644 --- a/src/transformers/models/biogpt/configuration_biogpt.py +++ b/src/transformers/models/biogpt/configuration_biogpt.py @@ -72,13 +72,14 @@ class BioGptConfig(PretrainedConfig): Please refer to the paper about LayerDrop: https://arxiv.org/abs/1909.11556 for further details activation_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for activations inside the fully connected layer. - pad_token_id (`int`, *optional*, defaults to 1) + pad_token_id (`int`, *optional*, defaults to 1): Padding token id. - bos_token_id (`int`, *optional*, defaults to 0) + bos_token_id (`int`, *optional*, defaults to 0): Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2) + eos_token_id (`int`, *optional*, defaults to 2): End of stream token id. - Example: + + Example: ```python >>> from transformers import BioGptModel, BioGptConfig diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index bfac3ab03f0..1e5ded1e191 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -52,7 +52,7 @@ class BitConfig(BackboneConfigMixin, PretrainedConfig): are supported. global_padding (`str`, *optional*): Padding strategy to use for the convolutional layers. Can be either `"valid"`, `"same"`, or `None`. - num_groups (`int`, *optional*, defaults to `32`): + num_groups (`int`, *optional*, defaults to 32): Number of groups used for the `BitGroupNormActivation` layers. drop_path_rate (`float`, *optional*, defaults to 0.0): The drop path rate for the stochastic depth. diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 61c56738ac4..fb8086e981a 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -85,9 +85,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): unk_token (`str`, *optional*, defaults to `"__unk__"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. 
- pad_token (`str`, *optional*, defaults to `"__pad__"`): + pad_token (`str`, *optional*, defaults to `"__null__"`): The token used for padding, for example when batching sequences of different lengths. - **kwargs + kwargs (*optional*): Additional keyword arguments passed along to [`PreTrainedTokenizer`] """ diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index a0f2e397b22..39760a7e22a 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -295,7 +295,7 @@ class BlipConfig(PretrainedConfig): Dimentionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original BLIP implementation. - image_text_hidden_size (`int`, *optional*, defaults to 768): + image_text_hidden_size (`int`, *optional*, defaults to 256): Dimentionality of the hidden state of the image-text fusion layer. kwargs (*optional*): Dictionary of keyword arguments. diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py index 9cee3faee32..fd8873cb7a4 100644 --- a/src/transformers/models/blip/image_processing_blip.py +++ b/src/transformers/models/blip/image_processing_blip.py @@ -53,7 +53,7 @@ class BlipImageProcessor(BaseImageProcessor): size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 0a2a289b741..1e2b8ea40b0 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -128,14 +128,14 @@ class BridgeTowerImageProcessor(BaseImageProcessor): do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `288`): + size (`Dict[str, int]` *optional*, defaults to 288): Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. size_divisor (`int`, *optional*, defaults to 32): The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. 
Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index 1b2b12d5b19..77b73e48b90 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -31,7 +31,7 @@ class BrosProcessor(ProcessorMixin): [`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information. Args: - tokenizer (`BertTokenizerFast`): + tokenizer (`BertTokenizerFast`, *optional*): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. """ attributes = ["tokenizer"] diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index c0c3aa56e99..68c70db0d18 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -48,7 +48,7 @@ class ByT5Tokenizer(PreTrainedTokenizer): token instead. pad_token (`str`, *optional*, defaults to `""`): The token used for padding, for example when batching sequences of different lengths. - extra_ids (`int`, *optional*, defaults to 100): + extra_ids (`int`, *optional*, defaults to 125): Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are accessible as "" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed from the end of the vocabulary up to beginning ("" is the last token in the vocabulary diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index f75a397755e..5a23d9b73b9 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -89,7 +89,7 @@ class CamembertTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + additional_special_tokens (`List[str]`, *optional*, defaults to `['NOTUSED', 'NOTUSED']`): Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 42986e2c347..fbd4d579df9 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -31,9 +31,9 @@ class ChineseCLIPProcessor(ProcessorMixin): See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information. Args: - image_processor ([`ChineseCLIPImageProcessor`]): + image_processor ([`ChineseCLIPImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`BertTokenizerFast`]): + tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 7a198b40160..fca9b0087c8 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -227,7 +227,7 @@ class ClapAudioConfig(PretrainedConfig): projection_hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. - layer_norm_eps (`[type]`, *optional*, defaults to `1e-5`): + layer_norm_eps (`[type]`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization @@ -345,10 +345,10 @@ class ClapConfig(PretrainedConfig): Dictionary of configuration options used to initialize [`ClapTextConfig`]. audio_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`ClapAudioConfig`]. + logit_scale_init_value (`float`, *optional*, defaults to 14.29): + The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and audio projection layers. - logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original CLAP implementation. projection_hidden_act (`str`, *optional*, defaults to `"relu"`): Activation function for the projection layers. initializer_factor (`float`, *optional*, defaults to 1.0): diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index b6141e00b76..1b7c2844400 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -41,32 +41,32 @@ class ClapFeatureExtractor(SequenceFeatureExtractor): Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent. Args: - feature_size (`int`, defaults to 64): + feature_size (`int`, *optional*, defaults to 64): The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters (`n_mels`). - sampling_rate (`int`, defaults to 48_000): + sampling_rate (`int`, *optional*, defaults to 48000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves to warn users if the audio fed to the feature extractor does not have the same sampling rate. - hop_length (`int`, defaults to 480): + hop_length (`int`,*optional*, defaults to 480): Length of the overlaping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split in smaller `frames` with a step of `hop_length` between each frame. - max_length_s (`int`, defaults to 10): + max_length_s (`int`, *optional*, defaults to 10): The maximum input length of the model in seconds. This is used to pad the audio. - fft_window_size (`int`, defaults to 1024): + fft_window_size (`int`, *optional*, defaults to 1024): Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency resolution of the spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. 
padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. return_attention_mask (`bool`, *optional*, defaults to `False`): Whether or not the model should return the attention masks coresponding to the input. - frequency_min (`float`, *optional*, default to 0): + frequency_min (`float`, *optional*, defaults to 0): The lowest frequency of interest. The STFT will not be computed for values below this. - frequency_max (`float`, *optional*, default to 14_000): + frequency_max (`float`, *optional*, defaults to 14000): The highest frequency of interest. The STFT will not be computed for values above this. top_db (`float`, *optional*): The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the `audio_utils.power_to_db` function - truncation (`str`, *optional*, default to `"fusions"`): + truncation (`str`, *optional*, defaults to `"fusion"`): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a downsampled version of the entire mel spectrogram. diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 291fd55674a..f083380e6ad 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -30,9 +30,9 @@ class CLIPProcessor(ProcessorMixin): [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. Args: - image_processor ([`CLIPImageProcessor`]): + image_processor ([`CLIPImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`CLIPTokenizerFast`]): + tokenizer ([`CLIPTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index e53229840b6..86686002685 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -255,7 +255,7 @@ class CLIPSegConfig(PretrainedConfig): Dimensionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation. - extract_layers (`List[int]`, *optional*, defaults to [3, 6, 9]): + extract_layers (`List[int]`, *optional*, defaults to `[3, 6, 9]`): Layers to extract when forwarding the query image through the frozen visual backbone of CLIP. reduce_dim (`int`, *optional*, defaults to 64): Dimensionality to reduce the CLIP vision embedding. diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index e42e18d0e66..bc1d36a1c66 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -30,9 +30,9 @@ class CLIPSegProcessor(ProcessorMixin): [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. Args: - image_processor ([`ViTImageProcessor`]): + image_processor ([`ViTImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`CLIPTokenizerFast`]): + tokenizer ([`CLIPTokenizerFast`], *optional*): The tokenizer is a required input. 
""" attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index 62fb1bc1e72..09944527bbb 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -64,7 +64,7 @@ class ConvNextImageProcessor(BaseImageProcessor): crop_pct (`float` *optional*, defaults to 224 / 256): Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be overriden by `crop_pct` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overriden by `resample` in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in diff --git a/src/transformers/models/cpmant/configuration_cpmant.py b/src/transformers/models/cpmant/configuration_cpmant.py index 56ba9ab31cf..bd85244c81f 100644 --- a/src/transformers/models/cpmant/configuration_cpmant.py +++ b/src/transformers/models/cpmant/configuration_cpmant.py @@ -50,15 +50,17 @@ class CpmAntConfig(PretrainedConfig): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. num_hidden_layers (`int`, *optional*, defaults to 48): Number of layers of the Transformer encoder. - dropout_p (`float`, *optional*, defaults to 0.1): + dropout_p (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder. position_bias_num_buckets (`int`, *optional*, defaults to 512): The number of position_bias buckets. position_bias_max_distance (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - eps (`float`, *optional*, defaults to 1e-6): + eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. + init_std (`float`, *optional*, defaults to 1.0): + Initialize parameters with std = init_std. prompt_types (`int`, *optional*, defaults to 32): The type of prompt. prompt_length (`int`, *optional*, defaults to 32): @@ -67,8 +69,6 @@ class CpmAntConfig(PretrainedConfig): The type of segment. use_cache (`bool`, *optional*, defaults to `True`): Whether to use cache. - init_std (`float`, *optional*, defaults to 1.0): - Initialize parameters with std = init_std. Example: diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index 7351c21e58a..553e919b4a7 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -54,7 +54,7 @@ class CTRLConfig(PretrainedConfig): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (`int`, *optional*, defaults to 0.1): The dropout ratio for the embeddings. 
- layer_norm_epsilon (`float`, *optional*, defaults to 1e-6): + layer_norm_epsilon (`float`, *optional*, defaults to 1e-06): The epsilon to use in the layer normalization layers initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index d77f0b39b98..6d157fdf3c7 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -99,9 +99,9 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast): refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): + vocab_file (`str`, *optional*): Path to the vocabulary file. - merges_file (`str`): + merges_file (`str`, *optional*): Path to the merges file. tokenizer_file (`str`, *optional*): The path to a tokenizer file to use instead of the vocab file. diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py index b395afdbef5..905473c13eb 100644 --- a/src/transformers/models/deit/configuration_deit.py +++ b/src/transformers/models/deit/configuration_deit.py @@ -58,23 +58,23 @@ class DeiTConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to `224`): + image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `16`): + patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, `optional`, defaults to 16): + encoder_stride (`int`, *optional*, defaults to 16): Factor to increase the spatial resolution by in the decoder head for masked image modeling. Example: diff --git a/src/transformers/models/deit/image_processing_deit.py b/src/transformers/models/deit/image_processing_deit.py index c10c44ba91e..96425278adb 100644 --- a/src/transformers/models/deit/image_processing_deit.py +++ b/src/transformers/models/deit/image_processing_deit.py @@ -52,19 +52,19 @@ class DeiTImageProcessor(BaseImageProcessor): `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): Size of the image after `resize`. Can be overridden by `size` in `preprocess`. 
- resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling` filter, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. do_center_crop (`bool`, *optional*, defaults to `True`): Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`. crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. diff --git a/src/transformers/models/deprecated/mctct/configuration_mctct.py b/src/transformers/models/deprecated/mctct/configuration_mctct.py index 4797a77d29e..e91104112b6 100644 --- a/src/transformers/models/deprecated/mctct/configuration_mctct.py +++ b/src/transformers/models/deprecated/mctct/configuration_mctct.py @@ -53,7 +53,7 @@ class MCTCTConfig(PretrainedConfig): Dimensions of each attention head for each attention layer in the Transformer encoder. max_position_embeddings (`int`, *optional*, defaults to 920): The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction). - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. layerdrop (`float`, *optional*, defaults to 0.3): The probability of dropping an encoder layer during training. The default 0.3 value is used in the original @@ -63,9 +63,9 @@ class MCTCTConfig(PretrainedConfig): `"relu"`, `"selu"` and `"gelu_new"` are supported. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.3): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3): The dropout ratio for the attention probabilities. pad_token_id (`int`, *optional*, defaults to 1): The tokenizer index of the pad token. @@ -80,17 +80,17 @@ class MCTCTConfig(PretrainedConfig): The probability of randomly dropping the `Conv1dSubsampler` layer during training. num_conv_layers (`int`, *optional*, defaults to 1): Number of convolution layers before applying transformer encoder layers. 
- conv_kernel (`List[int]`, *optional*, defaults to `[7]`): + conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`): The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal to `num_conv_layers`. - conv_stride (`List[int]`, *optional*, defaults to `[3]`): + conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`): The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal to `num_conv_layers`. input_feat_per_channel (`int`, *optional*, defaults to 80): Feature dimensions of the channels of the input to the Conv1D layer. input_channels (`int`, *optional*, defaults to 1): Number of input channels of the input to the Conv1D layer. - conv_channels (`List[int]`, *optional*, defaults to None): + conv_channels (`List[int]`, *optional*): Channel sizes of intermediate Conv1D layers. ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an diff --git a/src/transformers/models/deprecated/van/configuration_van.py b/src/transformers/models/deprecated/van/configuration_van.py index 798c8c19444..70942ad645b 100644 --- a/src/transformers/models/deprecated/van/configuration_van.py +++ b/src/transformers/models/deprecated/van/configuration_van.py @@ -57,9 +57,9 @@ class VanConfig(PretrainedConfig): `"selu"` and `"gelu_new"` are supported. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - layer_scale_init_value (`float`, *optional*, defaults to 1e-2): + layer_scale_init_value (`float`, *optional*, defaults to 0.01): The initial value for layer scaling. drop_path_rate (`float`, *optional*, defaults to 0.0): The dropout probability for stochastic depth. diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 963c72f29bd..b70797b55c3 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -44,9 +44,9 @@ class DinatConfig(BackboneConfigMixin, PretrainedConfig): The number of input channels. embed_dim (`int`, *optional*, defaults to 64): Dimensionality of patch embedding. - depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`): + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`): Number of layers in each level of the encoder. - num_heads (`List[int]`, *optional*, defaults to `[3, 6, 12, 24]`): + num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`): Number of attention heads in each layer of the Transformer encoder. kernel_size (`int`, *optional*, defaults to 7): Neighborhood Attention kernel size. @@ -67,7 +67,7 @@ class DinatConfig(BackboneConfigMixin, PretrainedConfig): `"selu"` and `"gelu_new"` are supported. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. layer_scale_init_value (`float`, *optional*, defaults to 0.0): The initial value for the layer scale. Disabled if <=0. 
diff --git a/src/transformers/models/dinov2/configuration_dinov2.py b/src/transformers/models/dinov2/configuration_dinov2.py index 82cfcaaa19d..4c3c26623a3 100644 --- a/src/transformers/models/dinov2/configuration_dinov2.py +++ b/src/transformers/models/dinov2/configuration_dinov2.py @@ -60,7 +60,7 @@ class Dinov2Config(BackboneConfigMixin, PretrainedConfig): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index 059016dafef..c7d6792467f 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -45,15 +45,15 @@ class DonutSwinConfig(PretrainedConfig): The number of input channels. embed_dim (`int`, *optional*, defaults to 96): Dimensionality of patch embedding. - depths (`list(int)`, *optional*, defaults to [2, 2, 6, 2]): + depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): Depth of each layer in the Transformer encoder. - num_heads (`list(int)`, *optional*, defaults to [3, 6, 12, 24]): + num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`): Number of attention heads in each layer of the Transformer encoder. window_size (`int`, *optional*, defaults to 7): Size of windows. mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of MLP hidden dimensionality to embedding dimensionality. - qkv_bias (`bool`, *optional*, defaults to True): + qkv_bias (`bool`, *optional*, defaults to `True`): Whether or not a learnable bias should be added to the queries, keys and values. hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings and encoder. @@ -64,11 +64,11 @@ class DonutSwinConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - use_absolute_embeddings (`bool`, *optional*, defaults to False): + use_absolute_embeddings (`bool`, *optional*, defaults to `False`): Whether or not to add absolute position embeddings to the patch embeddings. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. Example: diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index c975b95c05f..f797aec18ed 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -32,9 +32,9 @@ class DonutProcessor(ProcessorMixin): [`~DonutProcessor.decode`] for more information. Args: - image_processor ([`DonutImageProcessor`]): + image_processor ([`DonutImageProcessor`], *optional*): An instance of [`DonutImageProcessor`]. 
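Two different things happen in the epsilon hunks: genuinely wrong defaults are corrected outright (DonutSwin's `layer_norm_eps` is 1e-05, not 1e-12), and already-correct ones such as Dinov2's `1e-6` are merely reprinted the way Python's `repr` renders them, which is presumably how the checker formats signature defaults:

```python
print(repr(1e-5))  # '1e-05'
print(repr(1e-6))  # '1e-06'
print(repr(1e-2))  # '0.01' -> why VanConfig's layer_scale_init_value now reads 0.01
```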
The image processor is a required input. - tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]): + tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*): An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 7f2dd2e807b..45acd5902f5 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -52,9 +52,9 @@ class DPTConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -66,6 +66,8 @@ class DPTConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. + is_hybrid (`bool`, *optional*, defaults to `False`): + Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. backbone_out_indices (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`): @@ -79,11 +81,9 @@ class DPTConfig(PretrainedConfig): - "project" passes information to the other tokens by concatenating the readout to all other tokens before projecting the representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. - is_hybrid (`bool`, *optional*, defaults to `False`): - Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models. reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): The up/downsampling factors of the reassemble layers. - neck_hidden_sizes (`List[str]`, *optional*, defaults to [96, 192, 384, 768]): + neck_hidden_sizes (`List[str]`, *optional*, defaults to `[96, 192, 384, 768]`): The hidden sizes to project to for the feature maps of the backbone. fusion_hidden_size (`int`, *optional*, defaults to 256): The number of channels before fusion. diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 3c48cdaf781..93374dbd925 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -100,14 +100,14 @@ class DPTImageProcessor(BaseImageProcessor): Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`): Size of the image after resizing. Can be overidden by `size` in `preprocess`. 
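Moving `is_hybrid` up in the DPTConfig hunk above is again about matching signature order. That order can be read straight off the signature; a sketch of the kind of comparison involved (not the actual implementation of `utils/check_docstrings.py`):

```python
import inspect

from transformers import DPTConfig

# Parameters in declaration order, minus `self`; the docstring's Args section
# is expected to list them in this same order.
params = [p for p in inspect.signature(DPTConfig.__init__).parameters if p != "self"]
print(params[:10])
```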
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. keep_aspect_ratio (`bool`, *optional*, defaults to `False`): If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can be overidden by `keep_aspect_ratio` in `preprocess`. ensure_multiple_of (`int`, *optional*, defaults to 1): If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden by `ensure_multiple_of` in `preprocess`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): - Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in `preprocess`. diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index 4661618ed52..5f75d1692e8 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -52,22 +52,22 @@ class EfficientNetImageProcessor(BaseImageProcessor): `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 346, "width": 346}`): Size of the image after `resize`. Can be overridden by `size` in `preprocess`. - resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.NEAREST`): + resample (`PILImageResampling` filter, *optional*, defaults to 0): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. do_center_crop (`bool`, *optional*, defaults to `False`): Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`. crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`): Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. rescale_offset (`bool`, *optional*, defaults to `False`): Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. 
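The resampling defaults above are enum members, which explains the two renderings seen in this patch: by name (`Resampling.BICUBIC`, `Resampling.BILINEAR`) or, where the raw value leaks through, as a plain integer (`defaults to 0` for EfficientNet's NEAREST). Assuming Pillow >= 9.1, the mapping is:

```python
from PIL import Image

print(Image.Resampling.NEAREST.value)   # 0 -> EfficientNetImageProcessor's "defaults to 0"
print(Image.Resampling.BILINEAR.value)  # 2
print(Image.Resampling.BICUBIC.value)   # 3
```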
diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index eccec82bf8b..fce21b146cf 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -46,13 +46,13 @@ class FalconConfig(PretrainedConfig): Number of hidden layers in the Transformer decoder. num_attention_heads (`int`, *optional*, defaults to 71): Number of attention heads for each attention layer in the Transformer encoder. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. use_cache (`bool`, *optional*, defaults to `True`): Whether the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. hidden_dropout (`float`, *optional*, defaults to 0.0): The dropout probability for MLP layers. attention_dropout (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 010515e9d02..b1b34cc0f78 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -207,7 +207,7 @@ class FlaubertTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["","","","","","","","","",""]`): + additional_special_tokens (`List[str]`, *optional*, defaults to `['', '', '', '', '', '', '', '', '', '']`): List of additional special tokens. lang2id (`Dict[str, int]`, *optional*): Dictionary mapping languages string identifiers to their IDs. diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index 2dd00618c5a..4125d912622 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -52,9 +52,9 @@ class FlavaImageConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -291,7 +291,7 @@ class FlavaMultimodalConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
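The quote flip in FlaubertTokenizer's `additional_special_tokens` above (and in GPT-Neo's `attention_types` further down) follows from the same rendering rule: list defaults are stringified from the Python object, and stringifying a list quotes its elements with single quotes. Illustrated with placeholder tokens:

```python
tokens = ["<special0>", "<special1>"]
print(str(tokens))  # ['<special0>', '<special1>'] -- single quotes, as in the updated docstrings
```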
- num_hidden_layers (`int`, *optional*, defaults to 12): + num_hidden_layers (`int`, *optional*, defaults to 6): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. @@ -300,9 +300,9 @@ class FlavaMultimodalConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 13145419356..1736257a355 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -33,8 +33,8 @@ class FlavaProcessor(ProcessorMixin): [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. Args: - image_processor ([`FlavaImageProcessor`]): The image processor is a required input. - tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input. + image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. + tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] image_processor_class = "FlavaImageProcessor" diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index f4bcd0ddce3..83540c0f349 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -67,7 +67,7 @@ class FocalNetConfig(BackboneConfigMixin, PretrainedConfig): Stochastic depth rate. use_layerscale (`bool`, *optional*, defaults to `False`): Whether to use layer scale in the encoder. - layerscale_value (`float`, *optional*, defaults to 1e-4): + layerscale_value (`float`, *optional*, defaults to 0.0001): The initial value of the layer scale. use_post_layernorm (`bool`, *optional*, defaults to `False`): Whether to use post layer normalization in the encoder. @@ -77,9 +77,9 @@ class FocalNetConfig(BackboneConfigMixin, PretrainedConfig): Whether to normalize the modulator. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. - encoder_stride (`int`, `optional`, defaults to 32): + encoder_stride (`int`, *optional*, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. out_features (`List[str]`, *optional*): If used as backbone, list of features to output. 
Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 168aa14ead7..a631f074764 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -146,13 +146,13 @@ class FSMTTokenizer(PreTrainedTokenizer): this superclass for more information regarding those methods. Args: - langs (`List[str]`): + langs (`List[str]`, *optional*): A list of two languages to translate from and to, for instance `["en", "ru"]`. - src_vocab_file (`str`): + src_vocab_file (`str`, *optional*): File containing the vocabulary for the source language. - tgt_vocab_file (`st`): + tgt_vocab_file (`st`, *optional*): File containing the vocabulary for the target language. - merges_file (`str`): + merges_file (`str`, *optional*): File containing the merges. do_lower_case (`bool`, *optional*, defaults to `False`): Whether or not to lowercase the input when tokenizing. diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index 8d87ae23a77..d049b15911b 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -81,7 +81,7 @@ class FunnelConfig(PretrainedConfig): The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for linear layers. - layer_norm_eps (`float`, *optional*, defaults to 1e-9): + layer_norm_eps (`float`, *optional*, defaults to 1e-09): The epsilon used by the layer normalization layers. pooling_type (`str`, *optional*, defaults to `"mean"`): Possible values are `"mean"` or `"max"`. The way pooling is performed at the beginning of each block. @@ -90,10 +90,10 @@ class FunnelConfig(PretrainedConfig): is faster on TPU. separate_cls (`bool`, *optional*, defaults to `True`): Whether or not to separate the cls token when applying pooling. - truncate_seq (`bool`, *optional*, defaults to `False`): + truncate_seq (`bool`, *optional*, defaults to `True`): When using `separate_cls`, whether or not to truncate the last token when pooling, to avoid getting a sequence length that is not a multiple of 2. - pool_q_only (`bool`, *optional*, defaults to `False`): + pool_q_only (`bool`, *optional*, defaults to `True`): Whether or not to apply the pooling only to the query or to query, key and values for the attention layers. """ model_type = "funnel" diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index a0f9ced1b74..9b0d3c1b6c5 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -120,9 +120,9 @@ class FunnelTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - bos_token (`str`, `optional`, defaults to `""`): + bos_token (`str`, *optional*, defaults to `""`): The beginning of sentence token. - eos_token (`str`, `optional`, defaults to `""`): + eos_token (`str`, *optional*, defaults to `""`): The end of sentence token. 
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): Whether or not to tokenize Chinese characters. diff --git a/src/transformers/models/glpn/configuration_glpn.py b/src/transformers/models/glpn/configuration_glpn.py index 9951d1615cc..63056c4c04f 100644 --- a/src/transformers/models/glpn/configuration_glpn.py +++ b/src/transformers/models/glpn/configuration_glpn.py @@ -51,7 +51,7 @@ class GLPNConfig(PretrainedConfig): Patch size before each encoder block. strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`): Stride before each encoder block. - num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 4, 8]`): + num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`): Number of attention heads for each attention layer in each block of the Transformer encoder. mlp_ratios (`List[int]`, *optional*, defaults to `[4, 4, 4, 4]`): Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the @@ -67,9 +67,9 @@ class GLPNConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. drop_path_rate (`float`, *optional*, defaults to 0.1): The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - decoder_hidden_size (`int`, *optional*, defaults to 32): + decoder_hidden_size (`int`, *optional*, defaults to 64): The dimension of the decoder. max_depth (`int`, *optional*, defaults to 10): The maximum depth of the decoder. diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 15e30dcbce4..afed9188f7a 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -48,7 +48,7 @@ class GLPNImageProcessor(BaseImageProcessor): size_divisor (`int`, *optional*, defaults to 32): When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest multiple of `size_divisor`. Can be overridden by `size_divisor` in `preprocess`. - resample (`PIL.Image` resampling filter, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. do_rescale (`bool`, *optional*, defaults to `True`): Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py index ea1c37af219..9b84b18e26c 100644 --- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py +++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py @@ -54,7 +54,7 @@ class GPTNeoConfig(PretrainedConfig): Dimensionality of the encoder layers and the pooler layer. num_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`): + attention_types (`List`, *optional*, defaults to `[[['global', 'local'], 12]]`): The type of attention for each layer in a `List` of the following format `[[["attention_type"], num_layerss]]` e.g. 
for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the value of `attention_type` from `["global", "local"]` @@ -76,7 +76,7 @@ class GPTNeoConfig(PretrainedConfig): classifier_dropout (`float`, *optional*, defaults to 0.1): Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`]. The dropout ratio for the hidden layer. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index a1a5c71e966..857656fa07c 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -64,17 +64,17 @@ class GPTSw3Tokenizer(PreTrainedTokenizer): Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (`bool`, *optional*, defaults to `False`): Whether or not to keep accents when tokenizing. - bos_token (`str`, *optional*): - The beginning of sequence token that can be used for downstream task, was not seen during pretraining. If - not provided, will default to '' or '<|endoftext|>', depending on model size. - eos_token (`str`, *optional*): - The end of sequence token seen during pretraining. If not provided, will default to '<|endoftext|>' - unk_token (`str`, *optional*): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. If not provided, will default to ''. pad_token (`str`, *optional*): The token used for padding, for example when batching sequences of different lengths. If not provided, will default to '' or '' depending on model size. + unk_token (`str`, *optional*): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. If not provided, will default to ''. + eos_token (`str`, *optional*): + The end of sequence token seen during pretraining. If not provided, will default to '<|endoftext|>' + bos_token (`str`, *optional*): + The beginning of sequence token that can be used for downstream task, was not seen during pretraining. If + not provided, will default to '' or '<|endoftext|>', depending on model size. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py index cd05ccde9ff..e805acf3c74 100644 --- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py @@ -139,7 +139,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): The token used for unknown charactor pad_token (`str`, *optional*, defaults to `"<|separator|>"`): The token used for padding - bos_token (`str`, *optional*, defaults to `"<|startoftext|>""`): + bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): The beginning of sequence token. 
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The end of sequence token. diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index f870147f160..ee8dfbb4077 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -53,10 +53,8 @@ class IdeficsImageProcessor(BaseImageProcessor): Constructs a Idefics image processor. Args: - image_size (`int`, *optional*, defaults to `224`): + image_size (`int`, *optional*, defaults to 224): Resize to image size - image_num_channels (`int`, *optional*, defaults to `3`): - Number of image channels. image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be @@ -65,6 +63,8 @@ class IdeficsImageProcessor(BaseImageProcessor): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. + image_num_channels (`int`, *optional*, defaults to 3): + Number of image channels. """ model_input_names = ["pixel_values"] diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 2d317226b75..ad421c91053 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -70,7 +70,7 @@ class ImageGPTImageProcessor(BaseImageProcessor): `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): Size of the image after resizing. Can be overridden by `size` in `preprocess`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 0500e1cbeb1..78c7e4e8b65 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -57,7 +57,7 @@ class InstructBlipVisionConfig(PretrainedConfig): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. to 1e-5): The epsilon used by the layer normalization layers. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py index 92883b124c9..0ca51e6d579 100644 --- a/src/transformers/models/layoutlm/configuration_layoutlm.py +++ b/src/transformers/models/layoutlm/configuration_layoutlm.py @@ -83,8 +83,6 @@ class LayoutLMConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. max_2d_position_embeddings (`int`, *optional*, defaults to 1024): The maximum value that the 2D position embedding might ever used. Typically set this to something large just in case (e.g., 1024). diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index a5f8d7c2ce4..b1e6c0731d2 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -100,7 +100,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor): overridden by `do_resize` in `preprocess`. size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): Size of the image after resizing. Can be overridden by `size` in `preprocess`. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. apply_ocr (`bool`, *optional*, defaults to `True`): @@ -109,7 +109,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor): ocr_lang (`str`, *optional*): The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is used. Can be overridden by `ocr_lang` in `preprocess`. - tesseract_config (`str`, *optional*): + tesseract_config (`str`, *optional*, defaults to `""`): Any additional custom configuration flags that are forwarded to the `config` parameter when calling Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`. """ diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index f9990128ff0..fe52c16fd25 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -38,9 +38,9 @@ class LayoutLMv2Processor(ProcessorMixin): into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`): + image_processor (`LayoutLMv2ImageProcessor`, *optional*): An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. - tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`): + tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`, *optional*): An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input. 
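This and the other processor hunks (Donut and Flava above, LayoutLMv3, LayoutXLM and MGP-STR below) mark `image_processor` and `tokenizer` as `*optional*` because the signatures default them to `None`, even though the prose still calls them required: in practice both are supplied by `from_pretrained`. A typical usage sketch; the checkpoint name is illustrative:

```python
from transformers import LayoutLMv2Processor

# Loads both the image processor and the tokenizer, so neither has to be
# passed to the constructor by hand.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
```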
""" attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py index 04e50562f83..31d0c5e60a5 100644 --- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -38,9 +38,9 @@ class LayoutLMv3Processor(ProcessorMixin): into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv3ImageProcessor`): + image_processor (`LayoutLMv3ImageProcessor`, *optional*): An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. - tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`): + tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*): An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 199b906eedc..4d3d1078db6 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -253,7 +253,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - add_prefix_space (`bool`, *optional*, defaults to `False`): + add_prefix_space (`bool`, *optional*, defaults to `True`): Whether or not to add an initial space to the input. This allows to treat the leading word just as any other word. (RoBERTa tokenizer detect beginning of words by the preceding space). cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`): diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index 1fd621f8883..b1d885255b7 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -37,9 +37,9 @@ class LayoutXLMProcessor(ProcessorMixin): into token-level `labels` for token classification tasks (such as FUNSD, CORD). Args: - image_processor (`LayoutLMv2ImageProcessor`): + image_processor (`LayoutLMv2ImageProcessor`, *optional*): An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. - tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`): + tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*): An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input. """ diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index 230be65ee62..535ddb254ea 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -203,8 +203,6 @@ class LayoutXLMTokenizer(PreTrainedTokenizer): CrossEntropyLoss. only_label_first_subword (`bool`, *optional*, defaults to `True`): Whether or not to only label the first subword, in case word labels are provided. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. 
sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/levit/image_processing_levit.py b/src/transformers/models/levit/image_processing_levit.py index b43f24d51a9..77de1ec3336 100644 --- a/src/transformers/models/levit/image_processing_levit.py +++ b/src/transformers/models/levit/image_processing_levit.py @@ -56,7 +56,7 @@ class LevitImageProcessor(BaseImageProcessor): edge value `c` is rescaled to `int(c * (256/224))`. The smaller edge of the image will be matched to this value i.e, if height > width, then image will be rescaled to `(size["shortest_egde"] * height / width, size["shortest_egde"])`. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_center_crop (`bool`, *optional*, defaults to `True`): @@ -74,10 +74,10 @@ class LevitImageProcessor(BaseImageProcessor): do_normalize (`bool`, *optional*, defaults to `True`): Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. - image_mean (`List[int]`, defaults to `[0.229, 0.224, 0.225]`): + image_mean (`List[int]`, *optional*, defaults to `[0.485, 0.456, 0.406]`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`List[int]`, defaults to `[0.485, 0.456, 0.406]`): + image_std (`List[int]`, *optional*, defaults to `[0.229, 0.224, 0.225]`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. """ diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py index dc200c8e8f5..6ced7d2acad 100644 --- a/src/transformers/models/lxmert/configuration_lxmert.py +++ b/src/transformers/models/lxmert/configuration_lxmert.py @@ -43,14 +43,18 @@ class LxmertConfig(PretrainedConfig): `inputs_ids` passed when calling [`LxmertModel`] or [`TFLxmertModel`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. - r_layers (`int`, *optional*, defaults to 5): - Number of hidden layers in the Transformer visual encoder. - l_layers (`int`, *optional*, defaults to 9): - Number of hidden layers in the Transformer language encoder. - x_layers (`int`, *optional*, defaults to 5): - Number of hidden layers in the Transformer cross modality encoder. - num_attention_heads (`int`, *optional*, defaults to 5): + num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. + num_qa_labels (`int`, *optional*, defaults to 9500): + This represents the total number of different question answering (QA) labels there are. If using more than + one dataset with QA, the user will need to account for the total number of labels that all of the datasets + have in total. 
+ num_object_labels (`int`, *optional*, defaults to 1600): + This represents the total number of semantically unique objects that lxmert will be able to classify a + pooled-object feature as belonging too. + num_attr_labels (`int`, *optional*, defaults to 400): + This represents the total number of semantically unique attributes that lxmert will be able to classify a + pooled-object feature as possessing. intermediate_size (`int`, *optional*, defaults to 3072): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): @@ -69,25 +73,21 @@ class LxmertConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + l_layers (`int`, *optional*, defaults to 9): + Number of hidden layers in the Transformer language encoder. + x_layers (`int`, *optional*, defaults to 5): + Number of hidden layers in the Transformer cross modality encoder. + r_layers (`int`, *optional*, defaults to 5): + Number of hidden layers in the Transformer visual encoder. visual_feat_dim (`int`, *optional*, defaults to 2048): This represents the last dimension of the pooled-object features used as input for the model, representing the size of each object feature itself. visual_pos_dim (`int`, *optional*, defaults to 4): This represents the number of spacial features that are mixed into the visual features. The default is set to 4 because most commonly this will represent the location of a bounding box. i.e., (x, y, width, height) - visual_loss_normalizer (`float`, *optional*, defaults to 1/15): + visual_loss_normalizer (`float`, *optional*, defaults to 6.67): This represents the scaling factor in which each visual loss is multiplied by if during pretraining, one decided to train with multiple vision-based loss objectives. - num_qa_labels (`int`, *optional*, defaults to 9500): - This represents the total number of different question answering (QA) labels there are. If using more than - one dataset with QA, the user will need to account for the total number of labels that all of the datasets - have in total. - num_object_labels (`int`, *optional*, defaults to 1600): - This represents the total number of semantically unique objects that lxmert will be able to classify a - pooled-object feature as belonging too. - num_attr_labels (`int`, *optional*, defaults to 400): - This represents the total number of semantically unique attributes that lxmert will be able to classify a - pooled-object feature as possessing. task_matched (`bool`, *optional*, defaults to `True`): This task is used for sentence-image matching. If the sentence correctly describes the image the label will be 1. If the sentence does not correctly describe the image, the label will be 0. @@ -104,12 +104,6 @@ class LxmertConfig(PretrainedConfig): Whether or not to calculate the attribute-prediction loss objective visual_feat_loss (`bool`, *optional*, defaults to `True`): Whether or not to calculate the feature-regression loss objective - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not the model should return the attentions from the vision, language, and cross-modality layers - should be returned. 
- output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not the model should return the hidden states from the vision, language, and cross-modality - layers should be returned. """ model_type = "lxmert" diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 80264af24e5..af3591e192e 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -356,20 +356,17 @@ class Mask2FormerImageProcessor(BaseImageProcessor): sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * height / width, size)`. - max_size (`int`, *optional*, defaults to 1333): - The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is - set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + size_divisor (`int`, *optional*, defaults to 32): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + resample (`int`, *optional*, defaults to `Resampling.BILINEAR`): An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set to `True`. - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the input to a certain `scale`. - rescale_factor (`float`, *optional*, defaults to 1/ 255): + rescale_factor (`float`, *optional*, defaults to `1/ 255`): Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 30a043518ea..e071c45e0cc 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -358,20 +358,17 @@ class MaskFormerImageProcessor(BaseImageProcessor): sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * height / width, size)`. - max_size (`int`, *optional*, defaults to 1333): - The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is - set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + size_divisor (`int`, *optional*, defaults to 32): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + resample (`int`, *optional*, defaults to `Resampling.BILINEAR`): An optional resampling filter. 
This can be one of `PIL.Image.Resampling.NEAREST`, `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set to `True`. - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the input to a certain `scale`. - rescale_factor (`float`, *optional*, defaults to 1/ 255): + rescale_factor (`float`, *optional*, defaults to `1/ 255`): Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. diff --git a/src/transformers/models/mgp_str/configuration_mgp_str.py b/src/transformers/models/mgp_str/configuration_mgp_str.py index e77248cd644..b553c6a0ff6 100644 --- a/src/transformers/models/mgp_str/configuration_mgp_str.py +++ b/src/transformers/models/mgp_str/configuration_mgp_str.py @@ -62,7 +62,7 @@ class MgpstrConfig(PretrainedConfig): Whether to add a bias to the queries, keys and values. distilled (`bool`, *optional*, defaults to `False`): Model includes a distillation token and head as in DeiT models. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. drop_rate (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings, encoder. diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 1313fb21457..6e18e2dd485 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -44,9 +44,9 @@ class MgpstrProcessor(ProcessorMixin): [`~MgpstrProcessor.__call__`] and [`~MgpstrProcessor.batch_decode`] for more information. Args: - image_processor (`ViTImageProcessor`): + image_processor (`ViTImageProcessor`, *optional*): An instance of `ViTImageProcessor`. The image processor is a required input. - tokenizer ([`MgpstrTokenizer`]): + tokenizer ([`MgpstrTokenizer`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "char_tokenizer"] diff --git a/src/transformers/models/mgp_str/tokenization_mgp_str.py b/src/transformers/models/mgp_str/tokenization_mgp_str.py index e267491c861..7fe11061154 100644 --- a/src/transformers/models/mgp_str/tokenization_mgp_str.py +++ b/src/transformers/models/mgp_str/tokenization_mgp_str.py @@ -52,7 +52,7 @@ class MgpstrTokenizer(PreTrainedTokenizer): The beginning of sequence token. eos_token (`str`, *optional*, defaults to `"[s]"`): The end of sequence token. - pad_token (`str` or `tokenizers.AddedToken`, *optional*, , defaults to `"[GO]"`): + pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[GO]"`): A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. 
""" diff --git a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py index 6e367874b76..2ee20cd2baf 100644 --- a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -55,7 +55,7 @@ class MobileNetV1Config(PretrainedConfig): All layers will have at least this many channels. hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`): The non-linear activation function (function or string) in the Transformer encoder and convolution layers. - tf_padding (`bool`, `optional`, defaults to `True`): + tf_padding (`bool`, *optional*, defaults to `True`): Whether to use TensorFlow padding rules on the convolution layers. classifier_dropout_prob (`float`, *optional*, defaults to 0.999): The dropout ratio for attached classifiers. diff --git a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py index 73003c9ded9..ab4eef23cfb 100644 --- a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py @@ -64,16 +64,16 @@ class MobileNetV2Config(PretrainedConfig): the input dimensions by a factor of 32. If `output_stride` is 8 or 16, the model uses dilated convolutions on the depthwise layers instead of regular convolutions, so that the feature maps never become more than 8x or 16x smaller than the input image. - first_layer_is_expansion (`bool`, `optional`, defaults to `True`): + first_layer_is_expansion (`bool`, *optional*, defaults to `True`): True if the very first convolution layer is also the expansion layer for the first expansion block. - finegrained_output (`bool`, `optional`, defaults to `True`): + finegrained_output (`bool`, *optional*, defaults to `True`): If true, the number of output channels in the final convolution layer will stay large (1280) even if `depth_multiplier` is less than 1. hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`): The non-linear activation function (function or string) in the Transformer encoder and convolution layers. - tf_padding (`bool`, `optional`, defaults to `True`): + tf_padding (`bool`, *optional*, defaults to `True`): Whether to use TensorFlow padding rules on the convolution layers. - classifier_dropout_prob (`float`, *optional*, defaults to 0.999): + classifier_dropout_prob (`float`, *optional*, defaults to 0.8): The dropout ratio for attached classifiers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -105,7 +105,7 @@ class MobileNetV2Config(PretrainedConfig): depth_multiplier=1.0, depth_divisible_by=8, min_depth=8, - expand_ratio=6, + expand_ratio=6.0, output_stride=32, first_layer_is_expansion=True, finegrained_output=True, diff --git a/src/transformers/models/mobilevit/configuration_mobilevit.py b/src/transformers/models/mobilevit/configuration_mobilevit.py index fe782c39821..a4aafe997eb 100644 --- a/src/transformers/models/mobilevit/configuration_mobilevit.py +++ b/src/transformers/models/mobilevit/configuration_mobilevit.py @@ -74,7 +74,7 @@ class MobileViTConfig(PretrainedConfig): The non-linear activation function (function or string) in the Transformer encoder and convolution layers. 
conv_kernel_size (`int`, *optional*, defaults to 3): The size of the convolutional kernel in the MobileViT layer. - output_stride (`int`, `optional`, defaults to 32): + output_stride (`int`, *optional*, defaults to 32): The ratio of the spatial resolution of the output to the resolution of the input image. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the Transformer encoder. @@ -84,11 +84,11 @@ class MobileViTConfig(PretrainedConfig): The dropout ratio for attached classifiers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - aspp_out_channels (`int`, `optional`, defaults to 256): + aspp_out_channels (`int`, *optional*, defaults to 256): Number of output channels used in the ASPP layer for semantic segmentation. atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`): Dilation (atrous) factors used in the ASPP layer for semantic segmentation. diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 6fab1491ad3..0f3a422b30a 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -59,7 +59,7 @@ class MobileViTImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): Controls the size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py index d98d88647e2..0181d17c351 100644 --- a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py @@ -54,15 +54,15 @@ class MobileViTV2Config(PretrainedConfig): The non-linear activation function (function or string) in the Transformer encoder and convolution layers. conv_kernel_size (`int`, *optional*, defaults to 3): The size of the convolutional kernel in the MobileViTV2 layer. - output_stride (`int`, `optional`, defaults to 32): + output_stride (`int`, *optional*, defaults to 32): The ratio of the spatial resolution of the output to the resolution of the input image. classifier_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout ratio for attached classifiers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. 
- aspp_out_channels (`int`, `optional`, defaults to 512): + aspp_out_channels (`int`, *optional*, defaults to 512): Number of output channels used in the ASPP layer for semantic segmentation. atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`): Dilation (atrous) factors used in the ASPP layer for semantic segmentation. @@ -74,13 +74,13 @@ class MobileViTV2Config(PretrainedConfig): The number of attention blocks in each MobileViTV2Layer base_attn_unit_dims (`List[int]`, *optional*, defaults to `[128, 192, 256]`): The base multiplier for dimensions of attention blocks in each MobileViTV2Layer - width_multiplier (`float`, *optional*, defaults to 1.0) + width_multiplier (`float`, *optional*, defaults to 1.0): The width multiplier for MobileViTV2. - ffn_multiplier (`int`, *optional*, defaults to 2) + ffn_multiplier (`int`, *optional*, defaults to 2): The FFN multiplier for MobileViTV2. - attn_dropout (`float`, *optional*, defaults to 0.0) + attn_dropout (`float`, *optional*, defaults to 0.0): The dropout in the attention layer. - ffn_dropout (`float`, *optional*, defaults to 0.0) + ffn_dropout (`float`, *optional*, defaults to 0.0): The dropout between FFN layers. Example: diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index f1a140aacbc..cc91966b6b0 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -145,17 +145,17 @@ class MptConfig(PretrainedConfig): the `inputs_ids` passed when calling [`MptModel`]. Check [this discussion](https://huggingface.co/bigscience/mpt/discussions/120#633d28389addb8530b406c2a) on how the `vocab_size` has been defined. - resid_pdrop (`float`, *optional*, defaults to 0.1): + resid_pdrop (`float`, *optional*, defaults to 0.0): The dropout probability applied to the attention output before combining with residual. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon to use in the layer normalization layers. - emb_pdrop (`float`, *optional*, defaults to 0.1): + emb_pdrop (`float`, *optional*, defaults to 0.0): The dropout probability for the embedding layer. - learned_pos_emb (`bool`, *optional*, defaults to `False`): + learned_pos_emb (`bool`, *optional*, defaults to `True`): Whether to use learned positional embeddings. attn_config (`dict`, *optional*): A dictionary used to configure the model's attention module. - init_device (`str`, *optional*): + init_device (`str`, *optional*, defaults to `"cpu"`): The device to use for parameter initialization. Defined for backward compatibility logit_scale (`float`, *optional*): If not None, scale the logits by this value. @@ -169,7 +169,7 @@ class MptConfig(PretrainedConfig): norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`): Type of layer norm to use. All MPT models uses the same layer norm implementation. Defined for backward compatibility. - use_cache (`bool`, *optional*, defaults to `True`): + use_cache (`bool`, *optional*, defaults to `False`): Whether or not the model should return the last key/values attentions (not used by all models). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
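The `MptConfig` hunk above is representative of what `utils/check_docstrings.py` enforces: each documented default has to match the default declared in the `__init__` signature. As a hedged illustration (not part of the diff, and assuming a `transformers` install recent enough to ship `MptConfig`), the signature defaults can be read back with `inspect`:

```python
import inspect

from transformers import MptConfig

# The corrected values above (resid_pdrop=0.0, emb_pdrop=0.0, learned_pos_emb=True,
# init_device="cpu", use_cache=False) mirror the constructor signature defaults.
signature = inspect.signature(MptConfig.__init__)
for name in ("resid_pdrop", "layer_norm_epsilon", "emb_pdrop", "learned_pos_emb", "init_device", "use_cache"):
    print(f"{name} = {signature.parameters[name].default!r}")
```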
diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index 5d8bd6b3c6e..e24ad679995 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -44,9 +44,9 @@ class NatConfig(BackboneConfigMixin, PretrainedConfig): The number of input channels. embed_dim (`int`, *optional*, defaults to 64): Dimensionality of patch embedding. - depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`): + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`): Number of layers in each level of the encoder. - num_heads (`List[int]`, *optional*, defaults to `[3, 6, 12, 24]`): + num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`): Number of attention heads in each layer of the Transformer encoder. kernel_size (`int`, *optional*, defaults to 7): Neighborhood Attention kernel size. @@ -65,7 +65,7 @@ class NatConfig(BackboneConfigMixin, PretrainedConfig): `"selu"` and `"gelu_new"` are supported. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. layer_scale_init_value (`float`, *optional*, defaults to 0.0): The initial value for the layer scale. Disabled if <=0. diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index e1175119535..882614059f9 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -66,7 +66,7 @@ class NougatImageProcessor(BaseImageProcessor): `do_resize` in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"height": 896, "width": 672}`): Size of the image after resizing. Can be overridden by `size` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. do_thumbnail (`bool`, *optional*, defaults to `True`): Whether to resize the image using thumbnail method. diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat_fast.py index 977410f7a80..9d95940875e 100644 --- a/src/transformers/models/nougat/tokenization_nougat_fast.py +++ b/src/transformers/models/nougat/tokenization_nougat_fast.py @@ -383,10 +383,10 @@ class NougatTokenizerFast(PreTrainedTokenizerFast): methods for postprocessing the generated text. Args: - vocab_file (`str`): + vocab_file (`str`, *optional*): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that contains the vocabulary necessary to instantiate a tokenizer. - tokenizer_file (`str`): + tokenizer_file (`str`, *optional*): [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that contains everything needed to load the tokenizer. @@ -394,16 +394,16 @@ class NougatTokenizerFast(PreTrainedTokenizerFast): Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra spaces. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. eos_token (`str`, *optional*, defaults to `""`): The end of sequence token. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): The token used for padding, for example when batching sequences of different lengths. """ diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index c990deed234..06c75b92b1c 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -42,87 +42,87 @@ class OneFormerConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - backbone_config (`PretrainedConfig`, *optional*, defaults to `SwinConfig`) + backbone_config (`PretrainedConfig`, *optional*, defaults to `SwinConfig`): The configuration of the backbone model. - ignore_value (`int`, *optional*, defaults to 255) + ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. - num_queries (`int`, *optional*, defaults to 150) + num_queries (`int`, *optional*, defaults to 150): Number of object queries. - no_object_weight (`float`, *optional*, defaults to 0.1) + no_object_weight (`float`, *optional*, defaults to 0.1): Weight for no-object class predictions. - class_weight (`float`, *optional*, defaults to 2.0) + class_weight (`float`, *optional*, defaults to 2.0): Weight for Classification CE loss. - mask_weight (`float`, *optional*, defaults to 5.0) + mask_weight (`float`, *optional*, defaults to 5.0): Weight for binary CE loss. - dice_weight (`float`, *optional*, defaults to 5.0) + dice_weight (`float`, *optional*, defaults to 5.0): Weight for dice loss. - contrastive_weight (`float`, *optional*, defaults to 0.5) + contrastive_weight (`float`, *optional*, defaults to 0.5): Weight for contrastive loss. - contrastive_temperature (`float`, *optional*, defaults to 0.07) + contrastive_temperature (`float`, *optional*, defaults to 0.07): Initial value for scaling the contrastive logits. - train_num_points (`int`, *optional*, defaults to 12544) + train_num_points (`int`, *optional*, defaults to 12544): Number of points to sample while calculating losses on mask predictions. - oversample_ratio (`float`, *optional*, defaults to 3.0) + oversample_ratio (`float`, *optional*, defaults to 3.0): Ratio to decide how many points to oversample. - importance_sample_ratio (`float`, *optional*, defaults to 0.75) + importance_sample_ratio (`float`, *optional*, defaults to 0.75): Ratio of points that are sampled via importance sampling. - init_std (`float`, *optional*, defaults to 0.02) + init_std (`float`, *optional*, defaults to 0.02): Standard deviation for normal intialization. - init_xavier_std (`float`, *optional*, defaults to 0.02) + init_xavier_std (`float`, *optional*, defaults to 1.0): Standard deviation for xavier uniform initialization. - layer_norm_eps (`float`, *optional*, defaults to 1e-05) + layer_norm_eps (`float`, *optional*, defaults to 1e-05): Epsilon for layer normalization. 
- is_training (`bool`, *optional*, defaults to False) + is_training (`bool`, *optional*, defaults to `False`): Whether to run in training or inference mode. - use_auxiliary_loss (`bool`, *optional*, defaults to True) + use_auxiliary_loss (`bool`, *optional*, defaults to `True`): Whether to calculate loss using intermediate predictions from transformer decoder. - output_auxiliary_logits (`bool`, *optional*, defaults to True) + output_auxiliary_logits (`bool`, *optional*, defaults to `True`): Whether to return intermediate predictions from transformer decoder. - strides (`list`, *optional*, defaults to [4, 8, 16, 32]) + strides (`list`, *optional*, defaults to `[4, 8, 16, 32]`): List containing the strides for feature maps in the encoder. - task_seq_len (`int`, *optional*, defaults to 77) + task_seq_len (`int`, *optional*, defaults to 77): Sequence length for tokenizing text list input. - text_encoder_width (`int`, *optional*, defaults to 256) + text_encoder_width (`int`, *optional*, defaults to 256): Hidden size for text encoder. text_encoder_context_length (`int`, *optional*, defaults to 77): Input sequence length for text encoder. - text_encoder_num_layers (`int`, *optional*, defaults to 6) + text_encoder_num_layers (`int`, *optional*, defaults to 6): Number of layers for transformer in text encoder. - text_encoder_vocab_size (`int`, *optional*, defaults to 49408) + text_encoder_vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size for tokenizer. - text_encoder_proj_layers (`int`, *optional*, defaults to 2) + text_encoder_proj_layers (`int`, *optional*, defaults to 2): Number of layers in MLP for project text queries. - text_encoder_n_ctx (`int`, *optional*, defaults to 16) + text_encoder_n_ctx (`int`, *optional*, defaults to 16): Number of learnable text context queries. - conv_dim (`int`, *optional*, defaults to 256) + conv_dim (`int`, *optional*, defaults to 256): Feature map dimension to map outputs from the backbone. - mask_dim (`int`, *optional*, defaults to 256) + mask_dim (`int`, *optional*, defaults to 256): Dimension for feature maps in pixel decoder. - hidden_dim (`int`, *optional*, defaults to 256) + hidden_dim (`int`, *optional*, defaults to 256): Dimension for hidden states in transformer decoder. - encoder_feedforward_dim (`int`, *optional*, defaults to 1024) + encoder_feedforward_dim (`int`, *optional*, defaults to 1024): Dimension for FFN layer in pixel decoder. - norm (`str`, *optional*, defaults to `GN`) + norm (`str`, *optional*, defaults to `"GN"`): Type of normalization. - encoder_layers (`int`, *optional*, defaults to 6) + encoder_layers (`int`, *optional*, defaults to 6): Number of layers in pixel decoder. - decoder_layers (`int`, *optional*, defaults to 10) + decoder_layers (`int`, *optional*, defaults to 10): Number of layers in transformer decoder. - use_task_norm (`bool`, *optional*, defaults to `True`) + use_task_norm (`bool`, *optional*, defaults to `True`): Whether to normalize the task token. - num_attention_heads (`int`, *optional*, defaults to 8) + num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads in transformer layers in the pixel and transformer decoders. - dropout (`float`, *optional*, defaults to 0.1) + dropout (`float`, *optional*, defaults to 0.1): Dropout probability for pixel and transformer decoders. - dim_feedforward (`int`, *optional*, defaults to 2048) + dim_feedforward (`int`, *optional*, defaults to 2048): Dimension for FFN layer in transformer decoder. 
- pre_norm (`bool`, *optional*, defaults to `False`) + pre_norm (`bool`, *optional*, defaults to `False`): Whether to normalize hidden states before attention layers in transformer decoder. - enforce_input_proj (`bool`, *optional*, defaults to `False`) + enforce_input_proj (`bool`, *optional*, defaults to `False`): Whether to project hidden states in transformer decoder. - query_dec_layers (`int`, *optional*, defaults to 2) + query_dec_layers (`int`, *optional*, defaults to 2): Number of layers in query transformer. - common_stride (`int`, *optional*, defaults to 4) + common_stride (`int`, *optional*, defaults to 4): Common stride used for features in pixel decoder. Examples: diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 2e66efe61bc..16f5013f154 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -361,17 +361,14 @@ class OneFormerImageProcessor(BaseImageProcessor): sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * height / width, size)`. - max_size (`int`, *optional*, defaults to 1333): - The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is - set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + resample (`int`, *optional*, defaults to `Resampling.BILINEAR`): An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set to `True`. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the input to a certain `scale`. - rescale_factor (`float`, *optional*, defaults to 1/ 255): + rescale_factor (`float`, *optional*, defaults to `1/255`): Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. @@ -387,9 +384,9 @@ class OneFormerImageProcessor(BaseImageProcessor): Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The background label will be replaced by `ignore_index`. - repo_path (`str`, defaults to `shi-labs/oneformer_demo`): + repo_path (`str`, *optional*, defaults to `"shi-labs/oneformer_demo"`): Dataset repository on huggingface hub containing the JSON file with class information for the dataset. - class_info_file (`str`): + class_info_file (`str`, *optional*): JSON file containing class information for the dataset. It is stored inside on the `repo_path` dataset repository.
num_text (`int`, *optional*): diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index df16be211c5..dd6f349249e 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -56,7 +56,7 @@ class OpenAIGPTConfig(PretrainedConfig): The dropout ratio for the embeddings. attn_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon to use in the layer normalization layers initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -91,8 +91,6 @@ class OpenAIGPTConfig(PretrainedConfig): [`OpenAIGPTDoubleHeadsModel`]. The dropout ratio to be used after the projection and activation. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Examples: diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index a03120a46e5..d21dc77bbf6 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -171,13 +171,13 @@ class OwlViTVisionConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float``, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 584e575603b..3efbc512296 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -102,7 +102,7 @@ class OwlViTImageProcessor(BaseImageProcessor): The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized to (size, size). - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`): + resample (`int`, *optional*, defaults to `Resampling.BICUBIC`): An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`.
Only has an effect if `do_resize` is set diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 35472c367e2..088693a057f 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -33,9 +33,9 @@ class OwlViTProcessor(ProcessorMixin): [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. Args: - image_processor ([`OwlViTImageProcessor`]): + image_processor ([`OwlViTImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]): + tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py index 182e92b8a31..72b13a11e11 100644 --- a/src/transformers/models/perceiver/configuration_perceiver.py +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -65,7 +65,7 @@ class PerceiverConfig(PretrainedConfig): v_channels (`int`, *optional*): Dimension to project the values before applying attention in the cross-attention and self-attention layers of the encoder. Will default to preserving the dimension of the queries if not specified. - cross_attention_shape_for_attention (`str`, *optional*, defaults to `'kv'`): + cross_attention_shape_for_attention (`str`, *optional*, defaults to `"kv"`): Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder. self_attention_widening_factor (`int`, *optional*, defaults to 1): Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder. @@ -89,7 +89,7 @@ class PerceiverConfig(PretrainedConfig): this to something large just in case (e.g., 512 or 1024 or 2048). image_size (`int`, *optional*, defaults to 56): Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`]. - train_size (`List[int]`, *optional*, defaults to [368, 496]): + train_size (`List[int]`, *optional*, defaults to `[368, 496]`): Training size of the images for the optical flow model. num_frames (`int`, *optional*, defaults to 16): Number of video frames used for the multimodal autoencoding model. @@ -97,11 +97,11 @@ class PerceiverConfig(PretrainedConfig): Number of audio samples per frame for the multimodal autoencoding model. samples_per_patch (`int`, *optional*, defaults to 16): Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model. - output_num_channels (`int`, *optional*, defaults to 512): - Number of output channels for each modalitiy decoder. output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`): Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal autoencoding model. This excludes the channel dimension. + output_num_channels (`int`, *optional*, defaults to 512): + Number of output channels for each modalitiy decoder. 
Example: diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 23268c217b3..feb5397a2a0 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -195,7 +195,7 @@ class Pix2StructVisionConfig(PretrainedConfig): dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. dropout_rate (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. @@ -203,7 +203,7 @@ class Pix2StructVisionConfig(PretrainedConfig): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 1e-10): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float``, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). seq_len (`int`, *optional*, defaults to 4096): diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py index 550c387adce..7444de8ec2b 100644 --- a/src/transformers/models/poolformer/configuration_poolformer.py +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -71,7 +71,7 @@ class PoolFormerConfig(PretrainedConfig): The activation function for the hidden layers. use_layer_scale (`bool`, *optional*, defaults to `True`): Whether to use layer scale. - layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + layer_scale_init_value (`float`, *optional*, defaults to 1e-05): The initial value for the layer scale. initializer_range (`float`, *optional*, defaults to 0.02): The initializer range for the weights. diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index ca2997dacf1..b5773d3146f 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -68,10 +68,10 @@ class PoolFormerImageProcessor(BaseImageProcessor): whilst maintaining the aspect ratio. - size is `{"shortest_edge": c}`: the shortest edge of the image is resized to `int(floor(c/crop_pct)` whilst maintaining the aspect ratio. - crop_pct (`float`, *optional*, defaults to `0.9`): + crop_pct (`float`, *optional*, defaults to 0.9): Percentage of the image to crop from the center. Can be overridden by `crop_pct` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. do_center_crop (`bool`, *optional*, defaults to `True`): Whether to center crop the image.
If the input size is smaller than `crop_size` along any edge, the image @@ -80,12 +80,12 @@ class PoolFormerImageProcessor(BaseImageProcessor): crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): Size of the image after applying center crop. Only has an effect if `do_center_crop` is set to `True`. Can be overridden by the `crop_size` parameter in the `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index bb4fa5ff9ca..483188ca55d 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -313,9 +313,6 @@ class ProphetNetTokenizer(PreTrainedTokenizer): used to separate bullet-point like sentences in summarization, *e.g.*. pad_token (`str`, *optional*, defaults to `"[PAD]"`): The token used for padding, for example when batching sequences of different lengths. - cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. diff --git a/src/transformers/models/pvt/configuration_pvt.py b/src/transformers/models/pvt/configuration_pvt.py index c1dc5fd3b02..12fb3a5b9a9 100644 --- a/src/transformers/models/pvt/configuration_pvt.py +++ b/src/transformers/models/pvt/configuration_pvt.py @@ -49,7 +49,7 @@ class PvtConfig(PretrainedConfig): The input image size num_channels (`int`, *optional*, defaults to 3): The number of input channels. - num_encoder_blocks (`[int]`, *optional*., defaults to 4): + num_encoder_blocks (`int`, *optional*, defaults to 4): The number of encoder blocks (i.e. stages in the Mix Transformer encoder). depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`): The number of layers in each encoder block. @@ -77,11 +77,11 @@ class PvtConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. drop_path_rate (`float`, *optional*, defaults to 0.0): The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. 
qkv_bias (`bool`, *optional*, defaults to `True`): Whether or not a learnable bias should be added to the queries, keys and values. - num_labels ('int', *optional*, defaults to 1000) + num_labels (`int`, *optional*, defaults to 1000): The number of classes. Example: diff --git a/src/transformers/models/pvt/image_processing_pvt.py b/src/transformers/models/pvt/image_processing_pvt.py index d5fdbddf6be..37d65778b07 100644 --- a/src/transformers/models/pvt/image_processing_pvt.py +++ b/src/transformers/models/pvt/image_processing_pvt.py @@ -49,7 +49,7 @@ class PvtImageProcessor(BaseImageProcessor): size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): @@ -58,7 +58,7 @@ class PvtImageProcessor(BaseImageProcessor): rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True): + do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 255e153c0d7..364a2d42edf 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -69,9 +69,7 @@ class ReformerTokenizer(PreTrainedTokenizer): unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - additional_special_tokens (`List[str]`, *optional*): + additional_special_tokens (`List[str]`, *optional*, defaults to `[]`): Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for diff --git a/src/transformers/models/sam/configuration_sam.py b/src/transformers/models/sam/configuration_sam.py index b184788c4e9..2eb75e122e6 100644 --- a/src/transformers/models/sam/configuration_sam.py +++ b/src/transformers/models/sam/configuration_sam.py @@ -104,7 +104,7 @@ class SamMaskDecoderConfig(PretrainedConfig): The number of layers in the IoU head module. iou_head_hidden_dim (`int`, *optional*, defaults to 256): The dimensionality of the hidden states in the IoU head module. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. """ @@ -163,7 +163,7 @@ class SamVisionConfig(PretrainedConfig): Size of the patches to be extracted from the input image.
hidden_act (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -173,9 +173,9 @@ class SamVisionConfig(PretrainedConfig): Whether to add a bias to query, key, value projections. mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of mlp hidden dim to embedding dim. - use_abs_pos (`bool`, *optional*, defaults to True): + use_abs_pos (`bool`, *optional*, defaults to `True`): Whether to use absolute position embedding. - use_rel_pos (`bool`, *optional*, defaults to True): + use_rel_pos (`bool`, *optional*, defaults to `True`): Whether to use relative position embedding. window_size (`int`, *optional*, defaults to 14): Window size for relative position. @@ -183,7 +183,7 @@ class SamVisionConfig(PretrainedConfig): The indexes of the global attention layers. num_pos_feats (`int`, *optional*, defaults to 128): The dimensionality of the position embedding. - mlp_dim (`int`, *optional*, defaults to None): + mlp_dim (`int`, *optional*): The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio * hidden_size`. """ diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index d98e8164908..a5c5c1e5fb4 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -73,7 +73,7 @@ class SamImageProcessor(BaseImageProcessor): Size of the output image after resizing. Resizes the longest edge of the image to match `size["longest_edge"]` while maintaining the aspect ratio. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index 44a835c4b06..7f95657e197 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -51,19 +51,19 @@ class SegformerConfig(PretrainedConfig): The number of input channels. num_encoder_blocks (`int`, *optional*, defaults to 4): The number of encoder blocks (i.e. stages in the Mix Transformer encoder). - depths (`List[int]`, *optional*, defaults to [2, 2, 2, 2]): + depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`): The number of layers in each encoder block. - sr_ratios (`List[int]`, *optional*, defaults to [8, 4, 2, 1]): + sr_ratios (`List[int]`, *optional*, defaults to `[8, 4, 2, 1]`): Sequence reduction ratios in each encoder block. - hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]): + hidden_sizes (`List[int]`, *optional*, defaults to `[32, 64, 160, 256]`): Dimension of each of the encoder blocks. - patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]): + patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`): Patch size before each encoder block. 
- strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]): + strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`): Stride before each encoder block. - num_attention_heads (`List[int]`, *optional*, defaults to [1, 2, 5, 8]): + num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`): Number of attention heads for each attention layer in each block of the Transformer encoder. - mlp_ratios (`List[int]`, *optional*, defaults to [4, 4, 4, 4]): + mlp_ratios (`List[int]`, *optional*, defaults to `[4, 4, 4, 4]`): Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the encoder blocks. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): @@ -79,7 +79,7 @@ class SegformerConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. drop_path_rate (`float`, *optional*, defaults to 0.1): The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. decoder_hidden_size (`int`, *optional*, defaults to 256): The dimension of the all-MLP decode head. diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index fd48b53f2b9..27687fde03f 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -57,7 +57,7 @@ class SegformerImageProcessor(BaseImageProcessor): size (`Dict[str, int]` *optional*, defaults to `{"height": 512, "width": 512}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index 81f2ea4e99b..0d5b077c938 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -41,13 +41,13 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor): mean and variance normalization to the extracted features. Args: - feature_size (`int`, defaults to 80): + feature_size (`int`, *optional*, defaults to 80): The feature dimension of the extracted features. - sampling_rate (`int`, defaults to 16000): + sampling_rate (`int`, *optional*, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). - num_mel_bins (`int`, defaults to 80): + num_mel_bins (`int`, *optional*, defaults to 80): Number of Mel-frequency bins. - padding_value (`float`, defaults to 0.0): + padding_value (`float`, *optional*, defaults to 0.0): The value that is used to fill the padding vectors. do_ceptral_normalize (`bool`, *optional*, defaults to `True`): Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features. 
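The `SegformerConfig` edits above only change presentation: list defaults are wrapped in backticks so they render as code, and the values themselves stay what the signature declares. A minimal sketch (assuming `transformers` is installed) that confirms the documented defaults by instantiating the config:

```python
from transformers import SegformerConfig

config = SegformerConfig()  # no arguments: every field falls back to its documented default

# Defaults listed in the docstring above; this diff only changes how they are formatted.
assert config.depths == [2, 2, 2, 2]
assert config.sr_ratios == [8, 4, 2, 1]
assert config.hidden_sizes == [32, 64, 160, 256]
assert config.num_attention_heads == [1, 2, 5, 8]
print(config.layer_norm_eps)  # 1e-06
```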
diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py index a9a3e3ec54a..544dfeaf5d2 100644 --- a/src/transformers/models/speecht5/tokenization_speecht5.py +++ b/src/transformers/models/speecht5/tokenization_speecht5.py @@ -56,10 +56,10 @@ class SpeechT5Tokenizer(PreTrainedTokenizer): vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. bos_token (`str`, *optional*, defaults to `""`): The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. diff --git a/src/transformers/models/swiftformer/configuration_swiftformer.py b/src/transformers/models/swiftformer/configuration_swiftformer.py index 6c9ae7d4017..21dfe4cd8c5 100644 --- a/src/transformers/models/swiftformer/configuration_swiftformer.py +++ b/src/transformers/models/swiftformer/configuration_swiftformer.py @@ -51,7 +51,7 @@ class SwiftFormerConfig(PretrainedConfig): The embedding dimension at each stage mlp_ratio (`int`, *optional*, defaults to 4): Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input. - downsamples (`List[bool]`, *optional*, defaults to `[True, True, True, True]`) + downsamples (`List[bool]`, *optional*, defaults to `[True, True, True, True]`): Whether or not to downsample inputs between two stages. hidden_act (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (string). `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. @@ -61,13 +61,13 @@ class SwiftFormerConfig(PretrainedConfig): The stride of convolution kernels in downsampling layers. down_pad (`int`, *optional*, defaults to 1): Padding in downsampling layers. - drop_path_rate (`float`, *optional*, defaults to 0.): + drop_path_rate (`float`, *optional*, defaults to 0.0): Rate at which to increase dropout probability in DropPath. use_layer_scale (`bool`, *optional*, defaults to `True`): Whether to scale outputs from token mixers. - layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + layer_scale_init_value (`float`, *optional*, defaults to 1e-05): Factor by which outputs from token mixers are scaled. - batch_norm_eps (`float`, *optional*, defaults to 1e-5): + batch_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the batch normalization layers. diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 757112f8ceb..1b0efca1c47 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -55,15 +55,15 @@ class SwinConfig(BackboneConfigMixin, PretrainedConfig): The number of input channels. embed_dim (`int`, *optional*, defaults to 96): Dimensionality of patch embedding. - depths (`list(int)`, *optional*, defaults to [2, 2, 6, 2]): + depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): Depth of each layer in the Transformer encoder. 
- num_heads (`list(int)`, *optional*, defaults to [3, 6, 12, 24]): + num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`): Number of attention heads in each layer of the Transformer encoder. window_size (`int`, *optional*, defaults to 7): Size of windows. mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of MLP hidden dimensionality to embedding dimensionality. - qkv_bias (`bool`, *optional*, defaults to True): + qkv_bias (`bool`, *optional*, defaults to `True`): Whether or not a learnable bias should be added to the queries, keys and values. hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings and encoder. @@ -74,13 +74,13 @@ class SwinConfig(BackboneConfigMixin, PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - use_absolute_embeddings (`bool`, *optional*, defaults to False): + use_absolute_embeddings (`bool`, *optional*, defaults to `False`): Whether or not to add absolute position embeddings to the patch embeddings. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. - encoder_stride (`int`, `optional`, defaults to 32): + encoder_stride (`int`, *optional*, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. out_features (`List[str]`, *optional*): If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. diff --git a/src/transformers/models/swin2sr/configuration_swin2sr.py b/src/transformers/models/swin2sr/configuration_swin2sr.py index c65274e0ae7..6a84ca66707 100644 --- a/src/transformers/models/swin2sr/configuration_swin2sr.py +++ b/src/transformers/models/swin2sr/configuration_swin2sr.py @@ -69,12 +69,12 @@ class Swin2SRConfig(PretrainedConfig): Whether or not to add absolute position embeddings to the patch embeddings. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. upscale (`int`, *optional*, defaults to 2): The upscale factor for the image. 2/3/4/8 for image super resolution, 1 for denoising and compress artifact reduction - img_range (`float`, *optional*, defaults to 1.): + img_range (`float`, *optional*, defaults to 1.0): The range of the values of the input image. resi_connection (`str`, *optional*, defaults to `"1conv"`): The convolutional block to use before the residual connection in each stage. diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 96e5711465d..595d920c6b5 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -70,9 +70,9 @@ class Swinv2Config(PretrainedConfig): Whether or not to add absolute position embeddings to the patch embeddings. 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. - encoder_stride (`int`, `optional`, defaults to 32): + encoder_stride (`int`, *optional*, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. Example: diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index 77f2aa008c8..dcd7b2a518a 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -62,7 +62,7 @@ class TimesformerConfig(PretrainedConfig): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 138afbcf93e..91f3d78aae7 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -154,7 +154,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): token instead. eos_token (`str`, *optional*, defaults to `""`): The end of sequence token. - additional_special_tokens (`List[str]`, *optional*, defaults to `[""]`): + additional_special_tokens (`List[str]`, *optional*, defaults to `['']`): A list of additional special tokens (for the HuggingFace functionality). language (`str`, *optional*, defaults to `"en"`): The language of this tokenizer (used for mose preprocessing). diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index 1191c566841..6b7723a975b 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -30,9 +30,9 @@ class TrOCRProcessor(ProcessorMixin): more information. Args: - image_processor ([`ViTImageProcessor`/`DeiTImageProcessor`]): + image_processor ([`ViTImageProcessor`/`DeiTImageProcessor`], *optional*): An instance of [`ViTImageProcessor`/`DeiTImageProcessor`]. The image processor is a required input. - tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]): + tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`], *optional*): An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/tvlt/configuration_tvlt.py b/src/transformers/models/tvlt/configuration_tvlt.py index a475fe89ed8..013952dbb1b 100644 --- a/src/transformers/models/tvlt/configuration_tvlt.py +++ b/src/transformers/models/tvlt/configuration_tvlt.py @@ -69,7 +69,7 @@ class TvltConfig(PretrainedConfig): The dropout ratio for the attention probabilities. 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. diff --git a/src/transformers/models/tvlt/feature_extraction_tvlt.py b/src/transformers/models/tvlt/feature_extraction_tvlt.py index d5beba76bd6..7dc5e046313 100644 --- a/src/transformers/models/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/tvlt/feature_extraction_tvlt.py @@ -41,14 +41,14 @@ class TvltFeatureExtractor(SequenceFeatureExtractor): Number of audio channels. patch_size (`List[int]` *optional*, defaults to `[16, 16]`): The patch size of audio patch embedding. - feature_size (`int`, defaults to 128): + feature_size (`int`, *optional*, defaults to 128): The frequency length of audio spectrogram. - sampling_rate (`int`, defaults to 44100): + sampling_rate (`int`, *optional*, defaults to 44100): The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz). - hop_length_to_sampling_rate (`int`, defaults to 86): + hop_length_to_sampling_rate (`int`, *optional*, defaults to 86): Hop length is length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. For example, with sampling rate 44100, the hop length is 512, with 44100 / 512 = 86 - n_fft (`int`, defaults to 2048): + n_fft (`int`, *optional*, defaults to 2048): Size of the Fourier transform. padding_value (`float`, *optional*, defaults to 0.0): Padding value used to pad the audio. Should correspond to silences. diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index aa40dd3c796..6df708eec3e 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -71,7 +71,7 @@ class VideoMAEImageProcessor(BaseImageProcessor): Size of the output image after resizing. The shortest edge of the image will be resized to `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by `size` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_center_crop (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 8dc0d156cba..06aa1bc9b3d 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -132,7 +132,7 @@ class ViltImageProcessor(BaseImageProcessor): size_divisor (`int`, *optional*, defaults to 32): The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. 
- resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 520b3082686..e86aa34c099 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -32,9 +32,9 @@ class ViltProcessor(ProcessorMixin): docstring of [`~ViltProcessor.__call__`] and [`~ViltProcessor.decode`] for more information. Args: - image_processor (`ViltImageProcessor`): + image_processor (`ViltImageProcessor`, *optional*): An instance of [`ViltImageProcessor`]. The image processor is a required input. - tokenizer (`BertTokenizerFast`): + tokenizer (`BertTokenizerFast`, *optional*): An instance of ['BertTokenizerFast`]. The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index cffc58865a3..5dab0f42dc7 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -34,10 +34,6 @@ class VisionTextDualEncoderConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - text_config (`dict`): - Dictionary of configuration options that defines text model config. - vision_config (`dict`): - Dictionary of configuration options that defines vison model config. projection_dim (`int`, *optional*, defaults to 512): Dimentionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): diff --git a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py index b6b22a3be05..e6449914680 100644 --- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py @@ -32,9 +32,9 @@ class VisionTextDualEncoderProcessor(ProcessorMixin): information. Args: - image_processor ([`AutoImageProcessor`]): + image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`]): + tokenizer ([`PreTrainedTokenizer`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index d87b673b753..cbf67a01093 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -63,15 +63,15 @@ class ViTConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
- image_size (`int`, *optional*, defaults to `224`): + image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `16`): + patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, `optional`, defaults to 16): + encoder_stride (`int`, *optional*, defaults to 16): Factor to increase the spatial resolution by in the decoder head for masked image modeling. Example: diff --git a/src/transformers/models/vit/image_processing_vit.py b/src/transformers/models/vit/image_processing_vit.py index 1b7b3c5fd4c..be806d94c4d 100644 --- a/src/transformers/models/vit/image_processing_vit.py +++ b/src/transformers/models/vit/image_processing_vit.py @@ -49,7 +49,7 @@ class ViTImageProcessor(BaseImageProcessor): size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index f06cc6c5f81..5e5db3600d7 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -40,6 +40,8 @@ class ViTHybridConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: + backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the backbone in a dictionary or the config object of the backbone. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -51,9 +53,9 @@ class ViTHybridConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -65,12 +67,10 @@ class ViTHybridConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. 
- backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*, defaults to `None`): - The configuration of the backbone in a dictionary or the config object of the backbone. backbone_featmap_shape (`List[int]`, *optional*, defaults to `[1, 1024, 24, 24]`): Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. Example: diff --git a/src/transformers/models/vit_mae/configuration_vit_mae.py b/src/transformers/models/vit_mae/configuration_vit_mae.py index 4c065421a9f..aed808d7325 100644 --- a/src/transformers/models/vit_mae/configuration_vit_mae.py +++ b/src/transformers/models/vit_mae/configuration_vit_mae.py @@ -65,7 +65,7 @@ class ViTMAEConfig(PretrainedConfig): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - decoder_num_attention_heads (`int`, *optional*, defaults to 12): + decoder_num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the decoder. decoder_hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the decoder. diff --git a/src/transformers/models/vitdet/configuration_vitdet.py b/src/transformers/models/vitdet/configuration_vitdet.py index 80fb60ef454..45dc9e9296f 100644 --- a/src/transformers/models/vitdet/configuration_vitdet.py +++ b/src/transformers/models/vitdet/configuration_vitdet.py @@ -53,7 +53,7 @@ class VitDetConfig(BackboneConfigMixin, PretrainedConfig): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): + layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. @@ -67,9 +67,9 @@ class VitDetConfig(BackboneConfigMixin, PretrainedConfig): Whether to add a bias to the queries, keys and values. drop_path_rate (`float`, *optional*, defaults to 0.0): Stochastic depth rate. - window_block_indices (`List[int]`, *optional*): + window_block_indices (`List[int]`, *optional*, defaults to `[]`): List of indices of blocks that should have window attention instead of regular global self-attention. - residual_block_indices (`List[int]`, *optional*): + residual_block_indices (`List[int]`, *optional*, defaults to `[]`): List of indices of blocks that should have an extra residual block after the MLP. use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`): Whether to add absolute position embeddings to the patch embeddings. diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index cbbe30d9c9e..aee3463dd90 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -44,7 +44,7 @@ class VitMatteConfig(PretrainedConfig): The configuration of the backbone model. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. - batch_norm_eps (`float`, *optional*, defaults to 1e-5): + batch_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the batch norm layers. 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index a0bd940b80b..602b1fbefa8 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -46,7 +46,7 @@ class VitMatteImageProcessor(BaseImageProcessor): do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to 1/255): + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/vits/configuration_vits.py b/src/transformers/models/vits/configuration_vits.py index 689fe8a77c7..2cadd39792b 100644 --- a/src/transformers/models/vits/configuration_vits.py +++ b/src/transformers/models/vits/configuration_vits.py @@ -48,7 +48,7 @@ class VitsConfig(PretrainedConfig): Number of attention heads for each attention layer in the Transformer encoder. window_size (`int`, *optional*, defaults to 4): Window size for the relative positional embeddings in the attention layers of the Transformer encoder. - use_bias (`bool`, *optional*, defaults to `True`) + use_bias (`bool`, *optional*, defaults to `True`): Whether to use bias in the key, query, value projection layers in the Transformer encoder. ffn_dim (`int`, *optional*, defaults to 768): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. @@ -72,7 +72,7 @@ class VitsConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. use_stochastic_duration_prediction (`bool`, *optional*, defaults to `True`): Whether to use the stochastic duration prediction module or the regular duration predictor. diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 4884180fe74..f32dd0d3aea 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -73,7 +73,7 @@ class VivitImageProcessor(BaseImageProcessor): Size of the output image after resizing. The shortest edge of the image will be resized to `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by `size` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. 
do_center_crop (`bool`, *optional*, defaults to `True`): @@ -85,7 +85,7 @@ class VivitImageProcessor(BaseImageProcessor): do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to 1/127.5): + rescale_factor (`int` or `float`, *optional*, defaults to `1/127.5`): Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. offset (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 2e0aadab003..b83528a84a2 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -219,7 +219,7 @@ class WhisperTokenizer(PreTrainedTokenizer): This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to the superclass for more information regarding such methods. - Args: + Args: vocab_file (`str`): Path to the vocabulary file. merges_file (`str`): diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index 534d7bb9f97..6e54c9e7876 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -30,9 +30,9 @@ class XCLIPProcessor(ProcessorMixin): [`~XCLIPProcessor.__call__`] and [`~XCLIPProcessor.decode`] for more information. Args: - image_processor ([`VideoMAEImageProcessor`]): + image_processor ([`VideoMAEImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`CLIPTokenizerFast`]): + tokenizer ([`CLIPTokenizerFast`], *optional*): The tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py index 9dd0144eafa..913d25b2b46 100644 --- a/src/transformers/models/xglm/tokenization_xglm.py +++ b/src/transformers/models/xglm/tokenization_xglm.py @@ -83,11 +83,6 @@ class XGLMTokenizer(PreTrainedTokenizer): token instead. pad_token (`str`, *optional*, defaults to `""`): The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index c0ffdae1194..49d22934e07 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -571,7 +571,7 @@ class XLMTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. 
This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["","","","","","","","","",""]`): + additional_special_tokens (`List[str]`, *optional*, defaults to `['', '', '', '', '', '', '', '', '', '']`): List of additional special tokens. lang2id (`Dict[str, int]`, *optional*): Dictionary mapping languages string identifiers to their IDs. diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index 9cc1ae5ca08..c024d5d16dc 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -67,7 +67,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): Args: vocab_file (`str`): Path to the vocabulary file. - bos_token (`str`, *optional*, defaults to `""`): + bos_token (`str`, *optional*, defaults to `"[SEP]"`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. @@ -77,7 +77,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): - eos_token (`str`, *optional*, defaults to `""`): + eos_token (`str`, *optional*, defaults to `"[SEP]"`): The end of sequence token. @@ -87,23 +87,21 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): - sep_token (`str`, *optional*, defaults to `""`): + sep_token (`str`, *optional*, defaults to `"[SEP]"`): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): + unk_token (`str`, *optional*, defaults to `"[UNK]"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. - pad_token (`str`, *optional*, defaults to `""`): + pad_token (`str`, *optional*, defaults to `"[PAD]"`): The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. 
The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 299f4268e56..c014aa1eb5e 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -106,8 +106,6 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 0481fec346d..c3e44d2e3d9 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer): vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that contains the vocabulary necessary to instantiate a tokenizer. - do_lower_case (`bool`, *optional*, defaults to `True`): + do_lower_case (`bool`, *optional*, defaults to `False`): Whether to lowercase the input when tokenizing. remove_space (`bool`, *optional*, defaults to `True`): Whether to strip the text when tokenizing (removing excess spaces before and after the string). @@ -102,7 +102,7 @@ class XLNetTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`List[str]`, *optional*, defaults to `["", ""]`): + additional_special_tokens (`List[str]`, *optional*, defaults to `['', '']`): Additional special tokens used by the tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for diff --git a/src/transformers/models/yolos/configuration_yolos.py b/src/transformers/models/yolos/configuration_yolos.py index 538f85b1eb2..77a036f5adb 100644 --- a/src/transformers/models/yolos/configuration_yolos.py +++ b/src/transformers/models/yolos/configuration_yolos.py @@ -54,9 +54,9 @@ class YolosConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -64,13 +64,13 @@ class YolosConfig(PretrainedConfig): The epsilon used by the layer normalization layers. image_size (`List[int]`, *optional*, defaults to `[512, 864]`): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to `16`): + patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - num_detection_tokens (`int`, *optional*, defaults to `100`): + num_detection_tokens (`int`, *optional*, defaults to 100): The number of detection tokens. use_mid_position_embeddings (`bool`, *optional*, defaults to `True`): Whether to use the mid-layer position encodings. diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 2b8f5d2a888..5734b6e9cd5 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -383,13 +383,13 @@ class AdamW(Optimizer): Parameters: params (`Iterable[nn.parameter.Parameter]`): Iterable of parameters to optimize or dictionaries defining parameter groups. - lr (`float`, *optional*, defaults to 1e-3): + lr (`float`, *optional*, defaults to 0.001): The learning rate to use. - betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)): + betas (`Tuple[float,float]`, *optional*, defaults to `(0.9, 0.999)`): Adam's betas parameters (b1, b2). - eps (`float`, *optional*, defaults to 1e-6): + eps (`float`, *optional*, defaults to 1e-06): Adam's epsilon for numerical stability. - weight_decay (`float`, *optional*, defaults to 0): + weight_decay (`float`, *optional*, defaults to 0.0): Decoupled weight decay to apply. correct_bias (`bool`, *optional*, defaults to `True`): Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`). @@ -504,15 +504,15 @@ class Adafactor(Optimizer): Iterable of parameters to optimize or dictionaries defining parameter groups. lr (`float`, *optional*): The external learning rate. - eps (`Tuple[float, float]`, *optional*, defaults to (1e-30, 1e-3)): + eps (`Tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`): Regularization constants for square gradient and parameter scale respectively - clip_threshold (`float`, *optional*, defaults 1.0): + clip_threshold (`float`, *optional*, defaults to 1.0): Threshold of root mean square of final gradient update decay_rate (`float`, *optional*, defaults to -0.8): Coefficient used to compute running averages of square beta1 (`float`, *optional*): Coefficient used for computing running averages of gradient - weight_decay (`float`, *optional*, defaults to 0): + weight_decay (`float`, *optional*, defaults to 0.0): Weight decay (L2 penalty) scale_parameter (`bool`, *optional*, defaults to `True`): If True, learning rate is scaled by root mean square diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index a9f9ec12074..a4a84b06f87 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -39,7 +39,7 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): The schedule function to apply after the warmup for the rest of training. 
warmup_steps (`int`): The number of steps for the warmup part of training. - power (`float`, *optional*, defaults to 1): + power (`float`, *optional*, defaults to 1.0): The power to use for the polynomial warmup (defaults is a linear warmup). name (`str`, *optional*): Optional name prefix for the returned tensors during the schedule. @@ -180,18 +180,18 @@ class AdamWeightDecay(Adam): to adding the square of the weights to the loss with plain (non-momentum) SGD. Args: - learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 1e-3): + learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 0.001): The learning rate to use or a schedule. beta_1 (`float`, *optional*, defaults to 0.9): The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates. beta_2 (`float`, *optional*, defaults to 0.999): The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates. - epsilon (`float`, *optional*, defaults to 1e-7): + epsilon (`float`, *optional*, defaults to 1e-07): The epsilon parameter in Adam, which is a small constant for numerical stability. - amsgrad (`bool`, *optional*, default to `False`): + amsgrad (`bool`, *optional*, defaults to `False`): Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237). - weight_decay_rate (`float`, *optional*, defaults to 0): + weight_decay_rate (`float`, *optional*, defaults to 0.0): The weight decay to apply. include_in_weight_decay (`List[str]`, *optional*): List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is @@ -199,7 +199,7 @@ class AdamWeightDecay(Adam): exclude_from_weight_decay (`List[str]`, *optional*): List of the parameter names (or re patterns) to exclude from applying weight decay to. If a `include_in_weight_decay` is passed, the names in it will supersede this list. - name (`str`, *optional*, defaults to 'AdamWeightDecay'): + name (`str`, *optional*, defaults to `"AdamWeightDecay"`): Optional name for the operations created when applying gradients. kwargs (`Dict[str, Any]`, *optional*): Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index da396b8fdc5..36c9585a69d 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -445,9 +445,9 @@ class PipelineDataFormat: pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. Args: - output_path (`str`, *optional*): Where to save the outgoing data. - input_path (`str`, *optional*): Where to look for the input data. - column (`str`, *optional*): The column to read. + output_path (`str`): Where to save the outgoing data. + input_path (`str`): Where to look for the input data. + column (`str`): The column to read. overwrite (`bool`, *optional*, defaults to `False`): Whether or not to overwrite the `output_path`. """ @@ -550,9 +550,9 @@ class CsvPipelineDataFormat(PipelineDataFormat): Support for pipelines using CSV data format. Args: - output_path (`str`, *optional*): Where to save the outgoing data. - input_path (`str`, *optional*): Where to look for the input data. - column (`str`, *optional*): The column to read. + output_path (`str`): Where to save the outgoing data. 
+ input_path (`str`): Where to look for the input data. + column (`str`): The column to read. overwrite (`bool`, *optional*, defaults to `False`): Whether or not to overwrite the `output_path`. """ @@ -594,9 +594,9 @@ class JsonPipelineDataFormat(PipelineDataFormat): Support for pipelines using JSON file format. Args: - output_path (`str`, *optional*): Where to save the outgoing data. - input_path (`str`, *optional*): Where to look for the input data. - column (`str`, *optional*): The column to read. + output_path (`str`): Where to save the outgoing data. + input_path (`str`): Where to look for the input data. + column (`str`): The column to read. overwrite (`bool`, *optional*, defaults to `False`): Whether or not to overwrite the `output_path`. """ @@ -638,9 +638,9 @@ class PipedPipelineDataFormat(PipelineDataFormat): If columns are provided, then the output will be a dictionary with {column_x: value_x} Args: - output_path (`str`, *optional*): Where to save the outgoing data. - input_path (`str`, *optional*): Where to look for the input data. - column (`str`, *optional*): The column to read. + output_path (`str`): Where to save the outgoing data. + input_path (`str`): Where to look for the input data. + column (`str`): The column to read. overwrite (`bool`, *optional*, defaults to `False`): Whether or not to overwrite the `output_path`. """ diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1f2cf6e436f..c2285ad4796 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -183,7 +183,7 @@ class BatchEncoding(UserDict): utility methods to map from word/character space to token space. Args: - data (`dict`): + data (`dict`, *optional*): Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods ('input_ids', 'attention_mask', etc.). encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*): diff --git a/src/transformers/tools/base.py b/src/transformers/tools/base.py index bf2dd8f1605..f7ce384e926 100644 --- a/src/transformers/tools/base.py +++ b/src/transformers/tools/base.py @@ -348,7 +348,7 @@ class RemoteTool(Tool): A [`Tool`] that will make requests to an inference endpoint. Args: - endpoint_url (`str`): + endpoint_url (`str`, *optional*): The url of the endpoint to use. token (`str`, *optional*): The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated when diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 72b2cc049a1..416b10cec5c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -288,9 +288,9 @@ class Trainer: detailed in [here](callback). If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method. - optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple - containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model - and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. 
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): A function that preprocess the logits right before caching them at each evaluation step. Must take two tensors, the logits and the labels, and return the logits once processed as desired. The modifications made diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 49b12ea558d..298b473850f 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -166,6 +166,7 @@ class TrainerControl: class TrainerCallback: + # no-format """ A class for objects that will inspect the state of the training loop at some events and take some decisions. At each of those events the following arguments are available: @@ -537,10 +538,10 @@ class EarlyStoppingCallback(TrainerCallback): A [`TrainerCallback`] that handles early stopping. Args: - early_stopping_patience (`int`): + early_stopping_patience (`int`): Use with `metric_for_best_model` to stop training when the specified metric worsens for `early_stopping_patience` evaluation calls. - early_stopping_threshold(`float`, *optional*): + early_stopping_threshold(`float`, *optional*): Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the specified metric must improve to satisfy early stopping conditions. ` diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 0fdfcfea620..e19bc854ff4 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -112,7 +112,7 @@ class EvalPrediction: Parameters: predictions (`np.ndarray`): Predictions of the model. label_ids (`np.ndarray`): Targets to be matched. - inputs (`np.ndarray`, *optional*) + inputs (`np.ndarray`, *optional*): """ def __init__( diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 9b698947653..13f81a5a2cf 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -143,7 +143,7 @@ class BitsAndBytesConfig(QuantizationConfigMixin): load_in_4bit (`bool`, *optional*, defaults to `False`): This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from `bitsandbytes`. - llm_int8_threshold (`float`, *optional*, defaults to 6): + llm_int8_threshold (`float`, *optional*, defaults to 6.0): This corresponds to the outlier threshold for outlier detection as described in `LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale` paper: https://arxiv.org/abs/2208.07339 Any hidden states value that is above this threshold will be considered an outlier and the operation on those values will be done @@ -167,7 +167,7 @@ class BitsAndBytesConfig(QuantizationConfigMixin): bnb_4bit_compute_dtype (`torch.dtype` or str, *optional*, defaults to `torch.float32`): This sets the computational type which might be different than the input time. For example, inputs might be fp32, but computation can be set to bf16 for speedups. - bnb_4bit_quant_type (`str`, {fp4, nf4}, defaults to `fp4`): + bnb_4bit_quant_type (`str`, *optional*, defaults to `"fp4"`): This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types which are specified by `fp4` or `nf4`. bnb_4bit_use_double_quant (`bool`, *optional*, defaults to `False`): @@ -346,7 +346,7 @@ class GPTQConfig(QuantizationConfigMixin): The pad token id. 
Needed to prepare the dataset when `batch_size` > 1. disable_exllama (`bool`, *optional*, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - max_input_length (`int`, *optional*) + max_input_length (`int`, *optional*): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. """ diff --git a/tests/repo_utils/test_check_docstrings.py b/tests/repo_utils/test_check_docstrings.py new file mode 100644 index 00000000000..f761514a084 --- /dev/null +++ b/tests/repo_utils/test_check_docstrings.py @@ -0,0 +1,98 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +import sys +import unittest + + +git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +sys.path.append(os.path.join(git_repo_path, "utils")) + +from check_docstrings import get_default_description, replace_default_in_arg_description # noqa: E402 + + +class CheckDocstringsTested(unittest.TestCase): + def test_replace_default_in_arg_description(self): + # Standard docstring with default. + desc_with_default = "`float`, *optional*, defaults to 2.0" + self.assertEqual( + replace_default_in_arg_description(desc_with_default, 2.0), "`float`, *optional*, defaults to 2.0" + ) + self.assertEqual( + replace_default_in_arg_description(desc_with_default, 1.0), "`float`, *optional*, defaults to 1.0" + ) + self.assertEqual(replace_default_in_arg_description(desc_with_default, inspect._empty), "`float`") + + # Standard docstring with default but optional is not using the stars. + desc_with_default_typo = "`float`, `optional`, defaults to 2.0" + self.assertEqual( + replace_default_in_arg_description(desc_with_default_typo, 2.0), "`float`, *optional*, defaults to 2.0" + ) + self.assertEqual( + replace_default_in_arg_description(desc_with_default_typo, 1.0), "`float`, *optional*, defaults to 1.0" + ) + + # If the default is None we do not erase the value in the docstring. + self.assertEqual( + replace_default_in_arg_description(desc_with_default, None), "`float`, *optional*, defaults to 2.0" + ) + # If the default is None (and set as such in the docstring), we do not include it. + desc_with_default = "`float`, *optional*, defaults to None" + self.assertEqual(replace_default_in_arg_description(desc_with_default, None), "`float`, *optional*") + desc_with_default = "`float`, *optional*, defaults to `None`" + self.assertEqual(replace_default_in_arg_description(desc_with_default, None), "`float`, *optional*") + + # Operations are not replaced, but put in backticks.
+ desc_with_default = "`float`, *optional*, defaults to 1/255" + self.assertEqual( + replace_default_in_arg_description(desc_with_default, 1 / 255), "`float`, *optional*, defaults to `1/255`" + ) + desc_with_default = "`float`, *optional*, defaults to `1/255`" + self.assertEqual( + replace_default_in_arg_description(desc_with_default, 1 / 255), "`float`, *optional*, defaults to `1/255`" + ) + + desc_with_optional = "`float`, *optional*" + self.assertEqual( + replace_default_in_arg_description(desc_with_optional, 2.0), "`float`, *optional*, defaults to 2.0" + ) + self.assertEqual( + replace_default_in_arg_description(desc_with_optional, 1.0), "`float`, *optional*, defaults to 1.0" + ) + self.assertEqual(replace_default_in_arg_description(desc_with_optional, None), "`float`, *optional*") + self.assertEqual(replace_default_in_arg_description(desc_with_optional, inspect._empty), "`float`") + + desc_with_no_optional = "`float`" + self.assertEqual( + replace_default_in_arg_description(desc_with_no_optional, 2.0), "`float`, *optional*, defaults to 2.0" + ) + self.assertEqual( + replace_default_in_arg_description(desc_with_no_optional, 1.0), "`float`, *optional*, defaults to 1.0" + ) + self.assertEqual(replace_default_in_arg_description(desc_with_no_optional, None), "`float`, *optional*") + self.assertEqual(replace_default_in_arg_description(desc_with_no_optional, inspect._empty), "`float`") + + def test_get_default_description(self): + # Fake function to have arguments to test. + def _fake_function(a, b: int, c=1, d: float = 2.0, e: str = "blob"): + pass + + params = inspect.signature(_fake_function).parameters + assert get_default_description(params["a"]) == "``" + assert get_default_description(params["b"]) == "`int`" + assert get_default_description(params["c"]) == "``, *optional*, defaults to 1" + assert get_default_description(params["d"]) == "`float`, *optional*, defaults to 2.0" + assert get_default_description(params["e"]) == '`str`, *optional*, defaults to `"blob"`' diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py new file mode 100644 index 00000000000..8d69ea603f1 --- /dev/null +++ b/utils/check_docstrings.py @@ -0,0 +1,1272 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utility that checks that all docstrings of public objects have an argument section matching their signature. + +Use from the root of the repo with: + +```bash +python utils/check_docstrings.py +``` + +for a check that will error in case of inconsistencies (used by `make repo-consistency`). + +To auto-fix issues run: + +```bash +python utils/check_docstrings.py --fix_and_overwrite +``` + +which is used by `make fix-copies` (note that this fills what it can, you might have to manually fill information +like argument descriptions).
+""" +import argparse +import ast +import enum +import inspect +import operator as op +import re +from pathlib import Path +from typing import Any, Optional, Tuple, Union + +from check_repo import ignore_undocumented + +from transformers.utils import direct_transformers_import + + +PATH_TO_TRANSFORMERS = Path("src").resolve() / "transformers" + +# This is to make sure the transformers module imported is the one in the repo. +transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) + +OPTIONAL_KEYWORD = "*optional*" +# Re pattern that catches args blocks in docstrings (with all variation around the name supported). +_re_args = re.compile(r"^\s*(Args?|Arguments?|Attributes?|Params?|Parameters?):\s*$") +# Re pattern that parses the start of an arg block: catches () in those lines. +_re_parse_arg = re.compile(r"^(\s*)(\S+)\s+\((.+)\)(?:\:|$)") +# Re pattern that parses the end of a description of an arg (catches the default in *optional*, defaults to xxx). +_re_parse_description = re.compile(r"\*optional\*, defaults to (.*)$") + + +# This is a temporary list of objects to ignore while we progressively fix them. Do not add anything here, fix the +# docstrings instead. If formatting should be ignored for the docstring, you can put a comment # no-format on the +# line before the docstring. +OBJECTS_TO_IGNORE = [ + # Deprecated + "InputExample", + "InputFeatures", + # Signature is *args/**kwargs + # "PretrainedConfig", #ignored but could be fixed + # "GenerationConfig", #ignored but could be fixed + "TFSequenceSummary", + "TFBertTokenizer", + "TFGPT2Tokenizer", + # Missing arguments in the docstring + "ASTFeatureExtractor", + "AlbertConfig", + "AlbertModel", + "AlbertTokenizerFast", + "AlignTextModel", + "AlignVisionConfig", + "AltCLIPTextConfig", + "AltCLIPVisionConfig", + "AudioClassificationPipeline", + "AutoformerConfig", + "AutomaticSpeechRecognitionPipeline", + "AzureOpenAiAgent", + "BarkCoarseConfig", + "BarkConfig", + "BarkFineConfig", + "BarkSemanticConfig", + "BartConfig", + "BartTokenizerFast", + "BarthezTokenizerFast", + "BeitModel", + "BertConfig", + "BertGenerationConfig", + "BertGenerationTokenizer", + "BertJapaneseTokenizer", + "BertModel", + "BertTokenizerFast", + "BigBirdConfig", + "BigBirdForQuestionAnswering", + "BigBirdModel", + "BigBirdPegasusConfig", + "BigBirdTokenizerFast", + "BitImageProcessor", + "BlenderbotConfig", + "BlenderbotSmallConfig", + "BlenderbotSmallTokenizerFast", + "BlenderbotTokenizerFast", + "Blip2QFormerConfig", + "Blip2VisionConfig", + "BlipTextConfig", + "BlipVisionConfig", + "BloomConfig", + "BloomTokenizerFast", + "BridgeTowerTextConfig", + "BridgeTowerVisionConfig", + "BrosModel", + "CLIPImageProcessor", + "CLIPSegTextConfig", + "CLIPSegVisionConfig", + "CLIPTextConfig", + "CLIPTokenizer", + "CLIPTokenizerFast", + "CLIPVisionConfig", + "CamembertConfig", + "CamembertModel", + "CamembertTokenizerFast", + "CanineConfig", + "CanineModel", + "CanineTokenizer", + "ChineseCLIPImageProcessor", + "ChineseCLIPTextConfig", + "ChineseCLIPTextModel", + "ChineseCLIPVisionConfig", + "ClapTextConfig", + "CodeGenConfig", + "CodeGenTokenizer", + "CodeGenTokenizerFast", + "CodeLlamaTokenizer", + "CodeLlamaTokenizerFast", + "ConditionalDetrConfig", + "ConditionalDetrImageProcessor", + "ConvBertConfig", + "ConvBertTokenizerFast", + "ConvNextConfig", + "ConvNextV2Config", + "ConversationalPipeline", + "CpmAntTokenizer", + "CvtConfig", + "CvtModel", + "DeiTImageProcessor", + "DPRConfig", + "DPRReaderTokenizer", + "DPRReaderTokenizerFast", + "DPTModel", + 
"Data2VecAudioConfig", + "Data2VecTextConfig", + "Data2VecTextModel", + "Data2VecVisionModel", + "DataCollatorForLanguageModeling", + "DebertaConfig", + "DebertaV2Config", + "DebertaV2Tokenizer", + "DebertaV2TokenizerFast", + "DecisionTransformerConfig", + "DeformableDetrConfig", + "DeformableDetrImageProcessor", + "DeiTModel", + "DepthEstimationPipeline", + "DetaConfig", + "DetaImageProcessor", + "DetrConfig", + "DetrImageProcessor", + "DinatModel", + "DistilBertConfig", + "DistilBertTokenizerFast", + "DocumentQuestionAnsweringPipeline", + "DonutImageProcessor", + "DonutSwinModel", + "EarlyStoppingCallback", + "EfficientFormerConfig", + "EfficientFormerImageProcessor", + "EfficientNetConfig", + "ElectraConfig", + "ElectraTokenizerFast", + "EncoderDecoderModel", + "EncoderRepetitionPenaltyLogitsProcessor", + "ErnieConfig", + "ErnieMConfig", + "ErnieMModel", + "ErnieModel", + "ErnieMTokenizer", + "EsmConfig", + "EsmModel", + "FlaxAlbertForMaskedLM", + "FlaxAlbertForMultipleChoice", + "FlaxAlbertForPreTraining", + "FlaxAlbertForQuestionAnswering", + "FlaxAlbertForSequenceClassification", + "FlaxAlbertForTokenClassification", + "FlaxAlbertModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBeitForImageClassification", + "FlaxBeitForMaskedImageModeling", + "FlaxBeitModel", + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBigBirdForCausalLM", + "FlaxBigBirdForMaskedLM", + "FlaxBigBirdForMultipleChoice", + "FlaxBigBirdForPreTraining", + "FlaxBigBirdForQuestionAnswering", + "FlaxBigBirdForSequenceClassification", + "FlaxBigBirdForTokenClassification", + "FlaxBigBirdModel", + "FlaxBlenderbotForConditionalGeneration", + "FlaxBlenderbotModel", + "FlaxBlenderbotSmallForConditionalGeneration", + "FlaxBlenderbotSmallModel", + "FlaxBloomForCausalLM", + "FlaxBloomModel", + "FlaxCLIPModel", + "FlaxDistilBertForMaskedLM", + "FlaxDistilBertForMultipleChoice", + "FlaxDistilBertForQuestionAnswering", + "FlaxDistilBertForSequenceClassification", + "FlaxDistilBertForTokenClassification", + "FlaxDistilBertModel", + "FlaxElectraForCausalLM", + "FlaxElectraForMaskedLM", + "FlaxElectraForMultipleChoice", + "FlaxElectraForPreTraining", + "FlaxElectraForQuestionAnswering", + "FlaxElectraForSequenceClassification", + "FlaxElectraForTokenClassification", + "FlaxElectraModel", + "FlaxEncoderDecoderModel", + "FlaxGPT2LMHeadModel", + "FlaxGPT2Model", + "FlaxGPTJForCausalLM", + "FlaxGPTJModel", + "FlaxGPTNeoForCausalLM", + "FlaxGPTNeoModel", + "FlaxMBartForConditionalGeneration", + "FlaxMBartForQuestionAnswering", + "FlaxMBartForSequenceClassification", + "FlaxMBartModel", + "FlaxMarianMTModel", + "FlaxMarianModel", + "FlaxOPTForCausalLM", + "FlaxPegasusForConditionalGeneration", + "FlaxPegasusModel", + "FlaxRegNetForImageClassification", + "FlaxRegNetModel", + "FlaxResNetForImageClassification", + "FlaxResNetModel", + "FlaxRoFormerForMaskedLM", + "FlaxRoFormerForMultipleChoice", + "FlaxRoFormerForQuestionAnswering", + "FlaxRoFormerForSequenceClassification", + "FlaxRoFormerForTokenClassification", + "FlaxRoFormerModel", + "FlaxRobertaForCausalLM", + "FlaxRobertaForMaskedLM", + "FlaxRobertaForMultipleChoice", + "FlaxRobertaForQuestionAnswering", + 
"FlaxRobertaForSequenceClassification", + "FlaxRobertaForTokenClassification", + "FlaxRobertaModel", + "FlaxRobertaPreLayerNormForCausalLM", + "FlaxRobertaPreLayerNormForMaskedLM", + "FlaxRobertaPreLayerNormForMultipleChoice", + "FlaxRobertaPreLayerNormForQuestionAnswering", + "FlaxRobertaPreLayerNormForSequenceClassification", + "FlaxRobertaPreLayerNormForTokenClassification", + "FlaxRobertaPreLayerNormModel", + "FlaxSpeechEncoderDecoderModel", + "FlaxViTForImageClassification", + "FlaxViTModel", + "FlaxVisionEncoderDecoderModel", + "FlaxVisionTextDualEncoderModel", + "FlaxWav2Vec2ForCTC", + "FlaxWav2Vec2ForPreTraining", + "FlaxWav2Vec2Model", + "FlaxWhisperForAudioClassification", + "FlaxWhisperForConditionalGeneration", + "FlaxWhisperModel", + "FlaxWhisperTimeStampLogitsProcessor", + "FlaxXGLMForCausalLM", + "FlaxXGLMModel", + "FlaxXLMRobertaForCausalLM", + "FlaxXLMRobertaForMaskedLM", + "FlaxXLMRobertaForMultipleChoice", + "FlaxXLMRobertaForQuestionAnswering", + "FlaxXLMRobertaForSequenceClassification", + "FlaxXLMRobertaForTokenClassification", + "FlaxXLMRobertaModel", + "FNetConfig", + "FNetModel", + "FNetTokenizerFast", + "FSMTConfig", + "FeatureExtractionPipeline", + "FillMaskPipeline", + "FlaubertConfig", + "FlavaConfig", + "FlavaForPreTraining", + "FlavaImageModel", + "FlavaImageProcessor", + "FlavaMultimodalModel", + "FlavaTextConfig", + "FlavaTextModel", + "FocalNetModel", + "FunnelTokenizerFast", + "GPT2Config", + "GPT2Tokenizer", + "GPT2TokenizerFast", + "GPTBigCodeConfig", + "GPTJConfig", + "GPTNeoXConfig", + "GPTNeoXJapaneseConfig", + "GPTNeoXTokenizerFast", + "GPTSanJapaneseConfig", + "GitConfig", + "GitVisionConfig", + "GraphormerConfig", + "GroupViTTextConfig", + "GroupViTVisionConfig", + "HerbertTokenizerFast", + "HubertConfig", + "HubertForCTC", + "IBertConfig", + "IBertModel", + "IdeficsConfig", + "IdeficsProcessor", + "ImageClassificationPipeline", + "ImageGPTConfig", + "ImageSegmentationPipeline", + "ImageToImagePipeline", + "ImageToTextPipeline", + "InformerConfig", + "InstructBlipQFormerConfig", + "JukeboxPriorConfig", + "JukeboxTokenizer", + "LEDConfig", + "LEDTokenizerFast", + "LayoutLMForQuestionAnswering", + "LayoutLMTokenizerFast", + "LayoutLMv2Config", + "LayoutLMv2ForQuestionAnswering", + "LayoutLMv2TokenizerFast", + "LayoutLMv3Config", + "LayoutLMv3ImageProcessor", + "LayoutLMv3TokenizerFast", + "LayoutXLMTokenizerFast", + "LevitConfig", + "LiltConfig", + "LiltModel", + "LlamaConfig", + "LlamaTokenizer", + "LlamaTokenizerFast", + "LongT5Config", + "LongformerConfig", + "LongformerModel", + "LongformerTokenizerFast", + "LukeConfig", + "LukeModel", + "LukeTokenizer", + "LxmertTokenizerFast", + "M2M100Config", + "M2M100Tokenizer", + "MarkupLMProcessor", + "MBart50TokenizerFast", + "MBartConfig", + "MCTCTFeatureExtractor", + "MPNetConfig", + "MPNetModel", + "MPNetTokenizerFast", + "MT5Config", + "MT5TokenizerFast", + "MarianConfig", + "MarianTokenizer", + "MarkupLMConfig", + "MarkupLMModel", + "MarkupLMTokenizer", + "MarkupLMTokenizerFast", + "Mask2FormerConfig", + "MaskFormerConfig", + "MaxTimeCriteria", + "MegaConfig", + "MegaModel", + "MegatronBertConfig", + "MegatronBertForPreTraining", + "MegatronBertModel", + "MobileBertConfig", + "MobileBertModel", + "MobileBertTokenizerFast", + "MobileNetV1ImageProcessor", + "MobileNetV1Model", + "MobileNetV2ImageProcessor", + "MobileNetV2Model", + "MobileViTModel", + "MobileViTV2Model", + "MLukeTokenizer", + "MraConfig", + "MusicgenDecoderConfig", + "MusicgenForConditionalGeneration", + "MvpConfig", + 
"MvpTokenizerFast", + "MT5Tokenizer", + "NatModel", + "NerPipeline", + "NezhaConfig", + "NezhaModel", + "NllbMoeConfig", + "NllbTokenizer", + "NllbTokenizerFast", + "NystromformerConfig", + "OPTConfig", + "ObjectDetectionPipeline", + "OneFormerProcessor", + "OpenAIGPTTokenizerFast", + "OpenLlamaConfig", + "OwlViTConfig", + "OwlViTModel", + "OwlViTTextConfig", + "PLBartConfig", + "PegasusConfig", + "PegasusTokenizer", + "PegasusTokenizerFast", + "PegasusXConfig", + "PerceiverImageProcessor", + "PerceiverModel", + "PerceiverTokenizer", + "PersimmonConfig", + "Pipeline", + "Pix2StructConfig", + "Pix2StructTextConfig", + "PLBartTokenizer", + "Pop2PianoConfig", + "PreTrainedTokenizer", + "PreTrainedTokenizerBase", + "PreTrainedTokenizerFast", + "PrefixConstrainedLogitsProcessor", + "ProphetNetConfig", + "QDQBertConfig", + "QDQBertModel", + "QuestionAnsweringPipeline", + "RagConfig", + "RagModel", + "RagRetriever", + "RagSequenceForGeneration", + "RagTokenForGeneration", + "RealmConfig", + "RealmForOpenQA", + "RealmScorer", + "RealmTokenizerFast", + "ReformerConfig", + "ReformerTokenizerFast", + "RegNetConfig", + "RemBertConfig", + "RemBertModel", + "RemBertTokenizer", + "RemBertTokenizerFast", + "RepetitionPenaltyLogitsProcessor", + "RetriBertConfig", + "RetriBertTokenizerFast", + "RoCBertConfig", + "RoCBertModel", + "RoCBertTokenizer", + "RoFormerConfig", + "RobertaConfig", + "RobertaModel", + "RobertaPreLayerNormConfig", + "RobertaPreLayerNormModel", + "RobertaTokenizerFast", + "RwkvConfig", + "SEWConfig", + "SEWDConfig", + "SEWDForCTC", + "SEWForCTC", + "SamConfig", + "SamPromptEncoderConfig", + "Seq2SeqTrainingArguments", + "SpecialTokensMixin", + "Speech2Text2Config", + "Speech2Text2Tokenizer", + "Speech2TextConfig", + "Speech2TextTokenizer", + "SpeechEncoderDecoderModel", + "SpeechT5Config", + "SpeechT5Model", + "SplinterConfig", + "SplinterTokenizerFast", + "SqueezeBertTokenizerFast", + "SummarizationPipeline", + "Swin2SRImageProcessor", + "SwinModel", + "Swinv2Model", + "SwitchTransformersConfig", + "T5Config", + "T5Tokenizer", + "T5TokenizerFast", + "TableQuestionAnsweringPipeline", + "TableTransformerConfig", + "TapasConfig", + "TapasModel", + "TapasTokenizer", + "Text2TextGenerationPipeline", + "TextClassificationPipeline", + "TextGenerationPipeline", + "TFAlbertForMaskedLM", + "TFAlbertForMultipleChoice", + "TFAlbertForPreTraining", + "TFAlbertForQuestionAnswering", + "TFAlbertForSequenceClassification", + "TFAlbertForTokenClassification", + "TFAlbertModel", + "TFBartForConditionalGeneration", + "TFBartForSequenceClassification", + "TFBartModel", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertModel", + "TFBlenderbotForConditionalGeneration", + "TFBlenderbotModel", + "TFBlenderbotSmallForConditionalGeneration", + "TFBlenderbotSmallModel", + "TFBlipForConditionalGeneration", + "TFBlipForImageTextRetrieval", + "TFBlipForQuestionAnswering", + "TFCLIPModel", + "TFCTRLForSequenceClassification", + "TFCTRLLMHeadModel", + "TFCTRLModel", + "TFCamembertForCausalLM", + "TFCamembertForMaskedLM", + "TFCamembertForMultipleChoice", + "TFCamembertForQuestionAnswering", + "TFCamembertForSequenceClassification", + "TFCamembertForTokenClassification", + "TFCamembertModel", + "TFConvBertForMaskedLM", + "TFConvBertForMultipleChoice", + "TFConvBertForQuestionAnswering", + "TFConvBertForSequenceClassification", + 
"TFConvBertForTokenClassification", + "TFConvBertModel", + "TFConvNextForImageClassification", + "TFConvNextModel", + "TFCvtForImageClassification", + "TFCvtModel", + "TFDPRReader", + "TFData2VecVisionForImageClassification", + "TFData2VecVisionForSemanticSegmentation", + "TFData2VecVisionModel", + "TFDebertaForMaskedLM", + "TFDebertaForQuestionAnswering", + "TFDebertaForSequenceClassification", + "TFDebertaForTokenClassification", + "TFDebertaModel", + "TFDebertaV2ForMaskedLM", + "TFDebertaV2ForMultipleChoice", + "TFDebertaV2ForQuestionAnswering", + "TFDebertaV2ForSequenceClassification", + "TFDebertaV2ForTokenClassification", + "TFDebertaV2Model", + "TFDeiTForImageClassification", + "TFDeiTForImageClassificationWithTeacher", + "TFDeiTForMaskedImageModeling", + "TFDeiTModel", + "TFDistilBertForMaskedLM", + "TFDistilBertForMultipleChoice", + "TFDistilBertForQuestionAnswering", + "TFDistilBertForSequenceClassification", + "TFDistilBertForTokenClassification", + "TFDistilBertModel", + "TFEfficientFormerForImageClassification", + "TFEfficientFormerForImageClassificationWithTeacher", + "TFEfficientFormerModel", + "TFElectraForMaskedLM", + "TFElectraForMultipleChoice", + "TFElectraForPreTraining", + "TFElectraForQuestionAnswering", + "TFElectraForSequenceClassification", + "TFElectraForTokenClassification", + "TFElectraModel", + "TFEncoderDecoderModel", + "TFEsmForMaskedLM", + "TFEsmForSequenceClassification", + "TFEsmForTokenClassification", + "TFEsmModel", + "TFFlaubertForMultipleChoice", + "TFFlaubertForQuestionAnsweringSimple", + "TFFlaubertForSequenceClassification", + "TFFlaubertForTokenClassification", + "TFFlaubertModel", + "TFFlaubertWithLMHeadModel", + "TFFunnelBaseModel", + "TFFunnelForMaskedLM", + "TFFunnelForMultipleChoice", + "TFFunnelForPreTraining", + "TFFunnelForQuestionAnswering", + "TFFunnelForSequenceClassification", + "TFFunnelForTokenClassification", + "TFFunnelModel", + "TFGPT2DoubleHeadsModel", + "TFGPT2ForSequenceClassification", + "TFGPT2LMHeadModel", + "TFGPT2Model", + "TFGPTJForCausalLM", + "TFGPTJForQuestionAnswering", + "TFGPTJForSequenceClassification", + "TFGPTJModel", + "TFGroupViTModel", + "TFHubertForCTC", + "TFHubertModel", + "TFLEDForConditionalGeneration", + "TFLEDModel", + "TFLayoutLMForMaskedLM", + "TFLayoutLMForQuestionAnswering", + "TFLayoutLMForSequenceClassification", + "TFLayoutLMForTokenClassification", + "TFLayoutLMModel", + "TFLayoutLMv3ForQuestionAnswering", + "TFLayoutLMv3ForSequenceClassification", + "TFLayoutLMv3ForTokenClassification", + "TFLayoutLMv3Model", + "TFLongformerForMaskedLM", + "TFLongformerForMultipleChoice", + "TFLongformerForQuestionAnswering", + "TFLongformerForSequenceClassification", + "TFLongformerForTokenClassification", + "TFLongformerModel", + "TFLxmertForPreTraining", + "TFLxmertModel", + "TFMBartForConditionalGeneration", + "TFMBartModel", + "TFMPNetForMaskedLM", + "TFMPNetForMultipleChoice", + "TFMPNetForQuestionAnswering", + "TFMPNetForSequenceClassification", + "TFMPNetForTokenClassification", + "TFMPNetModel", + "TFMarianMTModel", + "TFMarianModel", + "TFMobileBertForMaskedLM", + "TFMobileBertForMultipleChoice", + "TFMobileBertForNextSentencePrediction", + "TFMobileBertForPreTraining", + "TFMobileBertForQuestionAnswering", + "TFMobileBertForSequenceClassification", + "TFMobileBertForTokenClassification", + "TFMobileBertModel", + "TFMobileViTForImageClassification", + "TFMobileViTForSemanticSegmentation", + "TFMobileViTModel", + "TFOPTForCausalLM", + "TFOPTModel", + "TFOpenAIGPTDoubleHeadsModel", + 
"TFOpenAIGPTForSequenceClassification", + "TFOpenAIGPTLMHeadModel", + "TFOpenAIGPTModel", + "TFPegasusForConditionalGeneration", + "TFPegasusModel", + "TFRagModel", + "TFRagSequenceForGeneration", + "TFRagTokenForGeneration", + "TFRemBertForCausalLM", + "TFRemBertForMaskedLM", + "TFRemBertForMultipleChoice", + "TFRemBertForQuestionAnswering", + "TFRemBertForSequenceClassification", + "TFRemBertForTokenClassification", + "TFRemBertModel", + "TFRepetitionPenaltyLogitsProcessor", + "TFResNetForImageClassification", + "TFResNetModel", + "TFRoFormerForCausalLM", + "TFRoFormerForMaskedLM", + "TFRoFormerForMultipleChoice", + "TFRoFormerForQuestionAnswering", + "TFRoFormerForSequenceClassification", + "TFRoFormerForTokenClassification", + "TFRoFormerModel", + "TFRobertaForMaskedLM", + "TFRobertaForMultipleChoice", + "TFRobertaForQuestionAnswering", + "TFRobertaForSequenceClassification", + "TFRobertaForTokenClassification", + "TFRobertaModel", + "TFRobertaPreLayerNormForMaskedLM", + "TFRobertaPreLayerNormForMultipleChoice", + "TFRobertaPreLayerNormForQuestionAnswering", + "TFRobertaPreLayerNormForSequenceClassification", + "TFRobertaPreLayerNormForTokenClassification", + "TFRobertaPreLayerNormModel", + "TFSamModel", + "TFSegformerForImageClassification", + "TFSegformerForSemanticSegmentation", + "TFSegformerModel", + "TFSpeech2TextForConditionalGeneration", + "TFSpeech2TextModel", + "TFSwinForImageClassification", + "TFSwinForMaskedImageModeling", + "TFSwinModel", + "TFT5EncoderModel", + "TFT5ForConditionalGeneration", + "TFT5Model", + "TFTapasForMaskedLM", + "TFTapasForQuestionAnswering", + "TFTapasForSequenceClassification", + "TFTapasModel", + "TFTransfoXLForSequenceClassification", + "TFTransfoXLLMHeadModel", + "TFTransfoXLModel", + "TFViTForImageClassification", + "TFViTMAEForPreTraining", + "TFViTMAEModel", + "TFViTModel", + "TFVisionEncoderDecoderModel", + "TFVisionTextDualEncoderModel", + "TFWav2Vec2ForCTC", + "TFWav2Vec2Model", + "TFWhisperForConditionalGeneration", + "TFWhisperModel", + "TFXGLMForCausalLM", + "TFXGLMModel", + "TFXLMForMultipleChoice", + "TFXLMForQuestionAnsweringSimple", + "TFXLMForSequenceClassification", + "TFXLMForTokenClassification", + "TFXLMModel", + "TFXLMRobertaForCausalLM", + "TFXLMRobertaForMaskedLM", + "TFXLMRobertaForMultipleChoice", + "TFXLMRobertaForQuestionAnswering", + "TFXLMRobertaForSequenceClassification", + "TFXLMRobertaForTokenClassification", + "TFXLMRobertaModel", + "TFXLMWithLMHeadModel", + "TFXLNetForMultipleChoice", + "TFXLNetForQuestionAnsweringSimple", + "TFXLNetForSequenceClassification", + "TFXLNetForTokenClassification", + "TFXLNetLMHeadModel", + "TFXLNetModel", + "TimeSeriesTransformerConfig", + "TokenClassificationPipeline", + "TrOCRConfig", + "TrainerState", + "TrainingArguments", + "TrajectoryTransformerConfig", + "TransfoXLConfig", + "TranslationPipeline", + "TvltImageProcessor", + "UMT5Config", + "UniSpeechConfig", + "UniSpeechForCTC", + "UniSpeechSatConfig", + "UniSpeechSatForCTC", + "UperNetConfig", + "UperNetForSemanticSegmentation", + "ViTHybridImageProcessor", + "ViTHybridModel", + "ViTMSNModel", + "ViTModel", + "VideoClassificationPipeline", + "ViltConfig", + "ViltForImagesAndTextClassification", + "ViltModel", + "VisionEncoderDecoderModel", + "VisionTextDualEncoderModel", + "VisualBertConfig", + "VisualBertModel", + "VisualQuestionAnsweringPipeline", + "VitMatteForImageMatting", + "VitsTokenizer", + "VivitModel", + "Wav2Vec2CTCTokenizer", + "Wav2Vec2Config", + "Wav2Vec2ConformerConfig", + "Wav2Vec2ConformerForCTC", + 
"Wav2Vec2FeatureExtractor", + "Wav2Vec2ForCTC", + "Wav2Vec2PhonemeCTCTokenizer", + "WavLMConfig", + "WavLMForCTC", + "WhisperConfig", + "WhisperFeatureExtractor", + "WhisperForAudioClassification", + "WhisperTokenizer", + "WhisperTokenizerFast", + "XCLIPTextConfig", + "XCLIPVisionConfig", + "XGLMConfig", + "XGLMModel", + "XGLMTokenizerFast", + "XLMConfig", + "XLMProphetNetConfig", + "XLMRobertaConfig", + "XLMRobertaModel", + "XLMRobertaTokenizerFast", + "XLMRobertaXLConfig", + "XLMRobertaXLModel", + "XLNetConfig", + "XLNetTokenizerFast", + "XmodConfig", + "XmodModel", + "YolosImageProcessor", + "YolosModel", + "YosoConfig", + "ZeroShotAudioClassificationPipeline", + "ZeroShotClassificationPipeline", + "ZeroShotImageClassificationPipeline", + "ZeroShotObjectDetectionPipeline", +] + +# Supported math operations when interpreting the value of defaults. +MATH_OPERATORS = { + ast.Add: op.add, + ast.Sub: op.sub, + ast.Mult: op.mul, + ast.Div: op.truediv, + ast.Pow: op.pow, + ast.BitXor: op.xor, + ast.USub: op.neg, +} + + +def find_indent(line: str) -> int: + """ + Returns the number of spaces that start a line indent. + """ + search = re.search(r"^(\s*)(?:\S|$)", line) + if search is None: + return 0 + return len(search.groups()[0]) + + +def stringify_default(default: Any) -> str: + """ + Returns the string representation of a default value, as used in docstring: numbers are left as is, all other + objects are in backtiks. + + Args: + default (`Any`): The default value to process + + Returns: + `str`: The string representation of that default. + """ + if isinstance(default, bool): + # We need to test for bool first as a bool passes isinstance(xxx, (int, float)) + return f"`{default}`" + elif isinstance(default, enum.Enum): + # We need to test for enum first as an enum with int values will pass isinstance(xxx, (int, float)) + return f"`{str(default)}`" + elif isinstance(default, int): + return str(default) + elif isinstance(default, float): + result = str(default) + return str(round(default, 2)) if len(result) > 6 else result + elif isinstance(default, str): + return str(default) if default.isnumeric() else f'`"{default}"`' + elif isinstance(default, type): + return f"`{default.__name__}`" + else: + return f"`{default}`" + + +def eval_math_expression(expression: str) -> Optional[Union[float, int]]: + # Mainly taken from the excellent https://stackoverflow.com/a/9558001 + """ + Evaluate (safely) a mathematial expression and returns its value. + + Args: + expression (`str`): The expression to evaluate. + + Returns: + `Optional[Union[float, int]]`: Returns `None` if the evaluation fails in any way and the value computed + otherwise. + + Example: + + ```py + >>> eval_expr('2^6') + 4 + >>> eval_expr('2**6') + 64 + >>> eval_expr('1 + 2*3**(4^5) / (6 + -7)') + -5.0 + ``` + """ + try: + return eval_node(ast.parse(expression, mode="eval").body) + except TypeError: + return + + +def eval_node(node): + if isinstance(node, ast.Num): # + return node.n + elif isinstance(node, ast.BinOp): # + return MATH_OPERATORS[type(node.op)](eval_node(node.left), eval_node(node.right)) + elif isinstance(node, ast.UnaryOp): # e.g., -1 + return MATH_OPERATORS[type(node.op)](eval_node(node.operand)) + else: + raise TypeError(node) + + +def replace_default_in_arg_description(description: str, default: Any) -> str: + """ + Catches the default value in the description of an argument inside a docstring and replaces it by the value passed. + + Args: + description (`str`): The description of an argument in a docstring to process. 
+
+
+def replace_default_in_arg_description(description: str, default: Any) -> str:
+    """
+    Catches the default value in the description of an argument inside a docstring and replaces it with the value
+    passed.
+
+    Args:
+        description (`str`): The description of an argument in a docstring to process.
+        default (`Any`): The default value that would be in the docstring of that argument.
+
+    Returns:
+        `str`: The description updated with the new default value.
+    """
+    # Lots of docstrings have `optional` or **optional** instead of *optional*, so we do this fix here.
+    description = description.replace("`optional`", OPTIONAL_KEYWORD)
+    description = description.replace("**optional**", OPTIONAL_KEYWORD)
+    if default is inspect._empty:
+        # No default, so make sure the description doesn't have any either.
+        idx = description.find(OPTIONAL_KEYWORD)
+        if idx != -1:
+            description = description[:idx].rstrip()
+            if description.endswith(","):
+                description = description[:-1].rstrip()
+    elif default is None:
+        # Defaults of None are not written, we just set `*optional*`. If a default other than None is specified in
+        # the description, we do not erase it (as sometimes we set the default to `None` because the real default is
+        # a mutable object).
+        idx = description.find(OPTIONAL_KEYWORD)
+        if idx == -1:
+            description = f"{description}, {OPTIONAL_KEYWORD}"
+        elif re.search(r"defaults to `?None`?", description) is not None:
+            len_optional = len(OPTIONAL_KEYWORD)
+            description = description[: idx + len_optional]
+    else:
+        str_default = None
+        # For numbers we may have a default that is given by a math operation (1/255 is really popular). We don't
+        # want to replace those by their actual values.
+        if isinstance(default, (int, float)) and re.search("defaults to `?(.*?)(?:`|$)", description) is not None:
+            # Grab the default and evaluate it.
+            current_default = re.search("defaults to `?(.*?)(?:`|$)", description).groups()[0]
+            if default == eval_math_expression(current_default):
+                try:
+                    # If it can be directly converted to the type of the default, it's a simple value.
+                    str_default = str(type(default)(current_default))
+                except Exception:
+                    # Otherwise there is a math operator, so we add a code block.
+                    str_default = f"`{current_default}`"
+
+        if str_default is None:
+            str_default = stringify_default(default)
+        # Make sure the default in the description matches the one in the signature.
+        if OPTIONAL_KEYWORD not in description:
+            description = f"{description}, {OPTIONAL_KEYWORD}, defaults to {str_default}"
+        elif _re_parse_description.search(description) is None:
+            idx = description.find(OPTIONAL_KEYWORD)
+            len_optional = len(OPTIONAL_KEYWORD)
+            description = f"{description[:idx + len_optional]}, defaults to {str_default}"
+        else:
+            description = _re_parse_description.sub(rf"*optional*, defaults to {str_default}", description)
+
+    return description
+
+
+def get_default_description(arg: inspect.Parameter) -> str:
+    """
+    Builds a default description for a parameter that was not documented.
+
+    Args:
+        arg (`inspect.Parameter`): The argument in the signature to generate a description for.
+
+    Returns:
+        `str`: The description.
+    """
+    if arg.annotation is inspect._empty:
+        arg_type = "<fill_type>"
+    elif hasattr(arg.annotation, "__name__"):
+        arg_type = arg.annotation.__name__
+    else:
+        arg_type = str(arg.annotation)
+
+    if arg.default is inspect._empty:
+        return f"`{arg_type}`"
+    elif arg.default is None:
+        return f"`{arg_type}`, {OPTIONAL_KEYWORD}"
+    else:
+        str_default = stringify_default(arg.default)
+        return f"`{arg_type}`, {OPTIONAL_KEYWORD}, defaults to {str_default}"
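+
+
+# Sketch of how the two helpers above behave, assuming OPTIONAL_KEYWORD (defined earlier in this file) is the
+# literal string "*optional*" and _re_parse_description catches the "*optional*, defaults to ..." suffix:
+#
+#   replace_default_in_arg_description("The number of beams to use", 5)
+#       -> "The number of beams to use, *optional*, defaults to 5"
+#   replace_default_in_arg_description("The dropout probability, *optional*, defaults to 0.1", 0.2)
+#       -> "The dropout probability, *optional*, defaults to 0.2"
+#   get_default_description(<parameter `num_beams: int = 1`>)        -> "`int`, *optional*, defaults to 1"
+#   get_default_description(<parameter `model=None`, no annotation>) -> "`<fill_type>`, *optional*"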
+ """ + module = obj.__module__ + obj_file = PATH_TO_TRANSFORMERS + for part in module.split(".")[1:]: + obj_file = obj_file / part + return obj_file.with_suffix(".py") + + +def match_docstring_with_signature(obj: Any) -> Optional[Tuple[str, str]]: + """ + Matches the docstring of an object with its signature. + + Args: + obj (`Any`): The object to process. + + Returns: + `Optional[Tuple[str, str]]`: Returns `None` if there is no docstring or no parameters documented in the + docstring, otherwise returns a tuple of two strings: the current documentation of the arguments in the + docstring and the one matched with the signature. + """ + if len(getattr(obj, "__doc__", "")) == 0: + # Nothing to do, there is no docstring. + return + + # Read the docstring in the source code to see if there is a special command to ignore this object. + try: + source, _ = inspect.getsourcelines(obj) + except OSError: + source = [] + + idx = 0 + while idx < len(source) and '"""' not in source[idx]: + idx += 1 + + ignore_order = False + if idx < len(source): + line_before_docstring = source[idx - 1] + if re.search(r"^\s*#\s*no-format\s*$", line_before_docstring): + # This object is ignored + return + elif re.search(r"^\s*#\s*ignore-order\s*$", line_before_docstring): + ignore_order = True + + # Read the signature + signature = inspect.signature(obj).parameters + + obj_doc_lines = obj.__doc__.split("\n") + # Get to the line where we start documenting arguments + idx = 0 + while idx < len(obj_doc_lines) and _re_args.search(obj_doc_lines[idx]) is None: + idx += 1 + + if idx == len(obj_doc_lines): + # Nothing to do, no parameters are documented. + return + + indent = find_indent(obj_doc_lines[idx]) + arguments = {} + current_arg = None + idx += 1 + start_idx = idx + # Keep going until the arg section is finished (nonempty line at the same indent level) or the end of the docstring. + while idx < len(obj_doc_lines) and ( + len(obj_doc_lines[idx].strip()) == 0 or find_indent(obj_doc_lines[idx]) > indent + ): + if find_indent(obj_doc_lines[idx]) == indent + 4: + # New argument -> let's generate the proper doc for it + re_search_arg = _re_parse_arg.search(obj_doc_lines[idx]) + if re_search_arg is not None: + _, name, description = re_search_arg.groups() + current_arg = name + if name in signature: + default = signature[name].default + if signature[name].kind is inspect._ParameterKind.VAR_KEYWORD: + default = None + new_description = replace_default_in_arg_description(description, default) + else: + new_description = description + init_doc = _re_parse_arg.sub(rf"\1\2 ({new_description}):", obj_doc_lines[idx]) + arguments[current_arg] = [init_doc] + elif current_arg is not None: + arguments[current_arg].append(obj_doc_lines[idx]) + + idx += 1 + + # We went too far by one (perhaps more if there are a lot of new lines) + idx -= 1 + while len(obj_doc_lines[idx].strip()) == 0: + arguments[current_arg] = arguments[current_arg][:-1] + idx -= 1 + # And we went too far by one again. 
+
+
+def fix_docstring(obj: Any, old_doc_args: str, new_doc_args: str):
+    """
+    Fixes the docstring of an object by replacing its arguments documentation with the one matched with the
+    signature.
+
+    Args:
+        obj (`Any`):
+            The object whose docstring we are fixing.
+        old_doc_args (`str`):
+            The current documentation of the parameters of `obj` in the docstring (as returned by
+            `match_docstring_with_signature`).
+        new_doc_args (`str`):
+            The documentation of the parameters of `obj` matched with its signature (as returned by
+            `match_docstring_with_signature`).
+    """
+    # Read the docstring in the source code and make sure we have the right part of the docstring.
+    source, line_number = inspect.getsourcelines(obj)
+
+    # Get to the line where we start documenting arguments.
+    idx = 0
+    while idx < len(source) and _re_args.search(source[idx]) is None:
+        idx += 1
+
+    if idx == len(source):
+        # Args are not defined in the docstring of this object.
+        return
+
+    # Get to the line where we stop documenting arguments.
+    indent = find_indent(source[idx])
+    idx += 1
+    start_idx = idx
+    while idx < len(source) and (len(source[idx].strip()) == 0 or find_indent(source[idx]) > indent):
+        idx += 1
+
+    idx -= 1
+    while len(source[idx].strip()) == 0:
+        idx -= 1
+    idx += 1
+
+    if "".join(source[start_idx:idx])[:-1] != old_doc_args:
+        # Args are not fully defined in the docstring of this object.
+        return
+
+    obj_file = find_source_file(obj)
+    with open(obj_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Replace content
+    lines = content.split("\n")
+    lines = lines[: line_number + start_idx - 1] + [new_doc_args] + lines[line_number + idx - 1 :]
+
+    print(f"Fixing the docstring of {obj.__name__} in {obj_file}.")
+    with open(obj_file, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
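+
+
+# How an object can opt out of the check (mirroring the markers parsed in match_docstring_with_signature above),
+# shown on a hypothetical class:
+#
+#   class MyConfig:
+#       # no-format
+#       r"""
+#       ...this docstring is left untouched by the script...
+#       """
+#
+# A `# ignore-order` comment in the same position keeps the docstring's existing argument order, and names added
+# to OBJECTS_TO_IGNORE above are skipped entirely.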
+ if name.startswith("_") or ignore_undocumented(name) or name in OBJECTS_TO_IGNORE: + continue + + obj = getattr(transformers, name) + if not callable(obj) or not isinstance(obj, type) or getattr(obj, "__doc__", None) is None: + continue + + # Check docstring + try: + result = match_docstring_with_signature(obj) + if result is not None: + old_doc, new_doc = result + else: + old_doc, new_doc = None, None + except Exception as e: + print(e) + hard_failures.append(name) + continue + if old_doc != new_doc: + if overwrite: + fix_docstring(obj, old_doc, new_doc) + else: + failures.append(name) + elif not overwrite and new_doc is not None and ("" in new_doc or "" in new_doc): + to_clean.append(name) + + # Deal with errors + error_message = "" + if len(hard_failures) > 0: + error_message += ( + "The argument part of the docstrings of the following objects could not be processed, check they are " + "properly formatted." + ) + error_message += "\n" + "\n".join([f"- {name}" for name in hard_failures]) + if len(failures) > 0: + error_message += ( + "The following objects docstrings do not match their signature. Run `make fix-copies` to fix this." + ) + error_message += "\n" + "\n".join([f"- {name}" for name in failures]) + if len(to_clean) > 0: + error_message += ( + "The following objects docstrings contain templates you need to fix: search for `` or " + "``." + ) + error_message += "\n" + "\n".join([f"- {name}" for name in to_clean]) + + if len(error_message) > 0: + error_message = "There was at least one problem when checking docstrings of public objects.\n" + error_message + raise ValueError(error_message) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.") + args = parser.parse_args() + + check_docstrings(overwrite=args.fix_and_overwrite)