[Styling] stylify using ruff (#27144)

* try to stylify using ruff

* might need to remove these changes?

* use ruff format and ruff check

* use isinstance instead of type comparison

* use # fmt: skip

* use # fmt: skip

* nits

* some styling changes

* update ci job

* nits isinstance

* more files update

* nits

* more nits

* small nits

* check and format

* revert wrong changes

* actually use formatter instead of checker

* nits

* well docbuilder is overwriting this commit

* revert notebook changes

* try to nuke docbuilder

* style

* fix feature extraction test

* remove `indent-width = 4`

* fixup

* more nits

* update the ruff version that we use

* style

* nuke docbuilder styling

* leave the print for detected changes

* nits

* Remove file I/O

Co-authored-by: charliermarsh <charlie.r.marsh@gmail.com>

* style

* nits

* revert notebook changes

* Add # fmt skip when possible

* Add # fmt skip when possible

* Fix

* More `  # fmt: skip` usage

* More `  # fmt: skip` usage

* More `  # fmt: skip` usage

* Nits

* more fixes

* fix tapas

* Another way to skip

* Recommended way

* Fix two more files

* Remove asynch

---------

Co-authored-by: charliermarsh <charlie.r.marsh@gmail.com>
Arthur 2023-11-16 17:43:19 +01:00 committed by GitHub
parent acb5b4aff5
commit 651408a077
480 changed files with 867 additions and 1059 deletions

View File

@@ -157,11 +157,10 @@ jobs:
             command: pip freeze | tee installed.txt
         - store_artifacts:
             path: ~/transformers/installed.txt
-        - run: black --check examples tests src utils
-        - run: ruff examples tests src utils
+        - run: ruff check examples tests src utils
+        - run: ruff format tests src utils --check
         - run: python utils/custom_init_isort.py --check_only
         - run: python utils/sort_auto_mappings.py --check_only
-        - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
         - run: python utils/check_doc_toc.py

     check_repository_consistency:

View File

@@ -15,7 +15,6 @@
 import argparse
 import copy
-import glob
 import os
 import random
 from dataclasses import dataclass

@@ -239,7 +238,7 @@ class CircleCIJob:
             py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("ERROR ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()'
             check_test_command += f"$(python3 -c '{py_command}'); "
-            check_test_command += f'cat summary_short.txt; echo ""; exit -1; '
+            check_test_command += 'cat summary_short.txt; echo ""; exit -1; '

         # Deeal with failed tests
         check_test_command += f'elif [ -s reports/{self.job_name}/failures_short.txt ]; '

@@ -249,7 +248,7 @@ class CircleCIJob:
             py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("FAILED ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()'
             check_test_command += f"$(python3 -c '{py_command}'); "
-            check_test_command += f'cat summary_short.txt; echo ""; exit -1; '
+            check_test_command += 'cat summary_short.txt; echo ""; exit -1; '
         check_test_command += f'elif [ -s reports/{self.job_name}/stats.txt ]; then echo "All tests pass!"; '

View File

@@ -9,8 +9,8 @@ modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
 	@if test -n "$(modified_py_files)"; then \
 		echo "Checking/fixing $(modified_py_files)"; \
-		black $(modified_py_files); \
-		ruff $(modified_py_files) --fix; \
+		ruff check $(modified_py_files) --fix; \
+		ruff format $(modified_py_files);\
 	else \
 		echo "No library .py files were modified"; \
 	fi

@@ -48,11 +48,10 @@ repo-consistency:
 # this target runs checks on all files
 quality:
-	black --check $(check_dirs) setup.py conftest.py
+	ruff check $(check_dirs) setup.py conftest.py
+	ruff format --check $(check_dirs) setup.py conftest.py
 	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
-	ruff $(check_dirs) setup.py conftest.py
-	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
 	python utils/check_doc_toc.py

@@ -60,14 +59,13 @@ quality:
 # Format source code automatically and check is there are any problems left that need manual fixing
 extra_style_checks:
 	python utils/custom_init_isort.py
 	python utils/sort_auto_mappings.py
-	doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source
 	python utils/check_doc_toc.py --fix_and_overwrite

 # this target runs checks on all files and potentially modifies some of them
 style:
-	black $(check_dirs) setup.py conftest.py
-	ruff $(check_dirs) setup.py conftest.py --fix
+	ruff check $(check_dirs) setup.py conftest.py --fix
+	ruff format $(check_dirs) setup.py conftest.py
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks
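
For readers following along, a rough Python equivalent of the new two-step `style` workflow (lint with autofixes first, then format). This is an illustrative sketch, not part of the commit; it assumes `ruff` is installed and that it runs from the repository root:

```python
import subprocess

# Mirrors `make style` after this commit: `ruff check --fix` replaces the
# old standalone lint pass, and `ruff format` replaces black as formatter.
check_dirs = ["examples", "tests", "src", "utils"]

subprocess.run(["ruff", "check", *check_dirs, "--fix"], check=True)
subprocess.run(["ruff", "format", *check_dirs], check=True)
```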

View File

@@ -245,7 +245,7 @@ logits first, and then reshaped to match the size of the labels before you can c
 ...         reduce_labels=False,
 ...     )
 ...     for key, value in metrics.items():
-...         if type(value) is np.ndarray:
+...         if isinstance(value, np.ndarray):
 ...             metrics[key] = value.tolist()
 ...     return metrics
 ```
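
The `type(...) is ...` to `isinstance(...)` rewrites throughout this commit follow ruff's type-comparison guidance (rule E721). A minimal sketch of the difference; the variable names here are illustrative:

```python
import numpy as np

value = np.asarray([1.0, 2.0])

# An exact-type check only matches np.ndarray itself; ndarray subclasses
# would fail it even though they behave like arrays.
print(type(value) is np.ndarray)      # True here, but brittle

# isinstance also accepts subclasses, and a tuple of types replaces
# chained checks such as `(type(x) is float) or (type(x) is int)`.
print(isinstance(value, np.ndarray))  # True
print(isinstance(3, (float, int)))    # True
```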

View File

@@ -242,7 +242,7 @@ pip install -q datasets transformers evaluate
 ...         reduce_labels=False,
 ...     )
 ...     for key, value in metrics.items():
-...         if type(value) is np.ndarray:
+...         if isinstance(value, np.ndarray):
 ...             metrics[key] = value.tolist()
 ...     return metrics
 ```

View File

@@ -212,7 +212,7 @@ class DataTrainingArguments:
         if self.validation_file is not None:
             extension = self.validation_file.split(".")[-1]
             assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
-        self.task_name = self.task_name.lower() if type(self.task_name) == str else self.task_name
+        self.task_name = self.task_name.lower() if isinstance(self.task_name, str) else self.task_name

 def create_train_state(

View File

@@ -23,7 +23,7 @@ class GLUETransformer(BaseTransformer):
     mode = "sequence-classification"

     def __init__(self, hparams):
-        if type(hparams) == dict:
+        if isinstance(hparams, dict):
             hparams = Namespace(**hparams)
         hparams.glue_output_mode = glue_output_modes[hparams.task]
         num_labels = glue_tasks_num_labels[hparams.task]

View File

@@ -25,7 +25,7 @@ class NERTransformer(BaseTransformer):
     mode = "token-classification"

     def __init__(self, hparams):
-        if type(hparams) == dict:
+        if isinstance(hparams, dict):
             hparams = Namespace(**hparams)
         module = import_module("tasks")
         try:

View File

@@ -32,7 +32,7 @@ class DeeBertEncoder(nn.Module):
         self.early_exit_entropy = [-1 for _ in range(config.num_hidden_layers)]

     def set_early_exit_entropy(self, x):
-        if (type(x) is float) or (type(x) is int):
+        if isinstance(x, (float, int)):
             for i in range(len(self.early_exit_entropy)):
                 self.early_exit_entropy[i] = x
         else:

@@ -232,9 +232,7 @@ class DeeBertModel(BertPreTrainedModel):
         outputs = (
             sequence_output,
             pooled_output,
-        ) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
+        ) + encoder_outputs[1:]  # add hidden_states and attentions if they are here

         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions), highway exits

View File

@@ -158,9 +158,7 @@ header_full = """
 </span>
 </body>
 </html>
-""" % (
-    header_html,
-)
+""" % (header_html,)
 st.sidebar.markdown(
     header_full,
     unsafe_allow_html=True,

View File

@@ -1706,9 +1706,7 @@ class GeneralizedRCNN(nn.Module):
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             archive_file = pretrained_model_name_or_path
         elif os.path.isfile(pretrained_model_name_or_path + ".index"):
-            assert (
-                from_tf
-            ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+            assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                 pretrained_model_name_or_path + ".index"
             )
             archive_file = pretrained_model_name_or_path + ".index"

View File

@@ -652,9 +652,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         outputs = (
             sequence_output,
             pooled_output,
-        ) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
+        ) + encoder_outputs[1:]  # add hidden_states and attentions if they are here

         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)

View File

@@ -311,8 +311,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0 or (
                 # last step in epoch but step is always smaller than gradient_accumulation_steps
-                len(epoch_iterator) <= args.gradient_accumulation_steps
-                and (step + 1) == len(epoch_iterator)
+                len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)
             ):
                 if args.fp16:
                     nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)

View File

@@ -239,7 +239,7 @@ def print_model_summary(model, name_width=25, line_width=180, ignore=None):
             continue
         if type(mod) in ignore:
             continue
-        if [True for s in ignore if type(s) is str and s in name]:
+        if [True for s in ignore if isinstance(s, str) and s in name]:
             continue
         act_str = f"Act:{input_q.extra_repr()}"
         wgt_str = f"Wgt:{weight_q.extra_repr()}"

View File

@@ -1706,9 +1706,7 @@ class GeneralizedRCNN(nn.Module):
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             archive_file = pretrained_model_name_or_path
         elif os.path.isfile(pretrained_model_name_or_path + ".index"):
-            assert (
-                from_tf
-            ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+            assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                 pretrained_model_name_or_path + ".index"
             )
             archive_file = pretrained_model_name_or_path + ".index"

View File

@@ -15,6 +15,7 @@
 import os
 import sys

+
 SRC_DIR = os.path.join(os.path.dirname(__file__), "src")
 sys.path.append(SRC_DIR)

View File

@@ -1,10 +1,6 @@
-[tool.black]
-line-length = 119
-target-version = ['py37']
-
 [tool.ruff]
 # Never enforce `E501` (line length violations).
-ignore = ["C901", "E501", "E741"]
+ignore = ["C901", "E501", "E741", "F402", "F823" ]
 select = ["C", "E", "F", "I", "W"]
 line-length = 119

@@ -18,6 +14,19 @@ line-length = 119
 lines-after-imports = 2
 known-first-party = ["transformers"]

+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
 [tool.pytest.ini_options]
 doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
 doctest_glob="**/*.md"
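
The new `[tool.ruff.format]` table reproduces Black's defaults. A small illustrative sketch of what two of those settings mean in practice (the literals below are made up for the example):

```python
# quote-style = "double": string literals are normalized to double quotes,
# so 'ruff' is rewritten as below.
name = "ruff"

# skip-magic-trailing-comma = false: a trailing comma keeps one element
# per line, exactly as Black would format it.
colors = [
    "red",
    "green",
    "blue",
]

# Without the trailing comma, the formatter collapses the literal onto one
# line whenever it fits within line-length = 119.
sizes = [1, 2, 3]
```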

View File

@@ -1,10 +1,12 @@
 from collections import Counter
+
 import datasets
+
 import transformers
 from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
 from transformers.utils import logging

 logging.set_verbosity_info()

 TOKENIZER_CLASSES = {

@@ -101,8 +103,8 @@ def check_details(line, spm_ids, tok_ids, slow, fast):
         except Exception:
             pass

-    ok_start = fast.decode(spm_ids[:first])
-    ok_end = fast.decode(spm_ids[last:])
+    fast.decode(spm_ids[:first])
+    fast.decode(spm_ids[last:])
     wrong = fast.decode(spm_ids[first:last])
     print()
     print(wrong)

View File

@@ -24,13 +24,14 @@
 #
 # It will be used then as "stas/tiny-wmt19-en-ru"

-from pathlib import Path
 import json
 import tempfile
+from pathlib import Path

-from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
+from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
 from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES

 mname_tiny = "tiny-wmt19-en-ru"

 # Build
# Build # Build

View File

@@ -27,16 +27,18 @@
 # It will be used then as "stas/tiny-wmt19-en-de"

 # Build
-from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
+from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
+
+
 mname = "facebook/wmt19-en-de"

 tokenizer = FSMTTokenizer.from_pretrained(mname)
 # get the correct vocab sizes, etc. from the master model
 config = FSMTConfig.from_pretrained(mname)
-config.update(dict(
-    d_model=4,
-    encoder_layers=1, decoder_layers=1,
-    encoder_ffn_dim=4, decoder_ffn_dim=4,
-    encoder_attention_heads=1, decoder_attention_heads=1))
+config.update({
+    "d_model": 4,
+    "encoder_layers": 1, "decoder_layers": 1,
+    "encoder_ffn_dim": 4, "decoder_ffn_dim": 4,
+    "encoder_attention_heads": 1, "decoder_attention_heads": 1})

 tiny_model = FSMTForConditionalGeneration(config)
 print(f"num of params {tiny_model.num_parameters()}")

View File

@@ -19,6 +19,7 @@
 import os
 from pathlib import Path

+
 def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
     texts = {

View File

@@ -19,6 +19,7 @@
 import os
 from pathlib import Path

+
 def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
     texts = {

View File

@@ -19,6 +19,7 @@
 import os
 from pathlib import Path

+
 def write_model_card(model_card_dir, src_lang, tgt_lang):
     texts = {

View File

@@ -22,6 +22,7 @@
 # 3. build

 import sentencepiece as spm

+
 # pegasus:
 # 1. no bos
 # 2. eos_id is 1

View File

@@ -15,8 +15,8 @@
 Script to close stale issue. Taken in part from the AllenNLP repository.
 https://github.com/allenai/allennlp.
 """
-from datetime import datetime as dt
 import os
+from datetime import datetime as dt

 import github.GithubException
 from github import Github

@@ -39,7 +39,7 @@ def main():
     for i, issue in enumerate(open_issues):
         print(i, issue)
-        comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True)
+        comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
         last_comment = comments[0] if len(comments) > 0 else None
         if (
             last_comment is not None and last_comment.user.login == "github-actions[bot]"

View File

@@ -99,7 +99,6 @@ _deps = [
     "accelerate>=0.20.3",
     "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
     "beautifulsoup4",
-    "black~=23.1",
     "codecarbon==1.2.0",
     "cookiecutter==1.7.3",
     "dataclasses",

@@ -156,7 +155,7 @@ _deps = [
     "rhoknp>=1.1.0,<1.3.1",
     "rjieba",
     "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff>=0.0.241,<=0.0.259",
+    "ruff>=0.1.5,<=0.2",
     "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses",
     "safetensors>=0.3.1",

@@ -310,7 +309,7 @@ extras["testing"] = (
     "dill",
     "evaluate",
     "pytest-timeout",
-    "black",
+    "ruff",
     "sacrebleu",
     "rouge-score",
     "nltk",

@@ -329,7 +328,7 @@ extras["testing"] = (
 extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]

-extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder", "urllib3")
+extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "hf-doc-builder", "urllib3")

 extras["all"] = (
     extras["tf"]

View File

@@ -246,6 +246,7 @@ class PretrainedConfig(PushToHubMixin):
         not be XLA-compatible. This option is here for backward compatibility and will be removed in Transformers
         v5.
     """
+
     model_type: str = ""
     is_composition: bool = False
     attribute_map: Dict[str, str] = {}

View File

@@ -724,9 +724,7 @@ class MBart50Converter(SpmConverter):
             ("<unk>", 0.0),
         ]
         vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
-        # fmt: off
-        vocab += [("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0), ("en_XX", 0.0), ("es_XX", 0.0), ("et_EE", 0.0), ("fi_FI", 0.0), ("fr_XX", 0.0), ("gu_IN", 0.0), ("hi_IN", 0.0), ("it_IT", 0.0), ("ja_XX", 0.0), ("kk_KZ", 0.0), ("ko_KR", 0.0), ("lt_LT", 0.0), ("lv_LV", 0.0), ("my_MM", 0.0), ("ne_NP", 0.0), ("nl_XX", 0.0), ("ro_RO", 0.0), ("ru_RU", 0.0), ("si_LK", 0.0), ("tr_TR", 0.0), ("vi_VN", 0.0), ("zh_CN", 0.0), ("af_ZA", 0.0), ("az_AZ", 0.0), ("bn_IN", 0.0), ("fa_IR", 0.0), ("he_IL", 0.0), ("hr_HR", 0.0), ("id_ID", 0.0), ("ka_GE", 0.0), ("km_KH", 0.0), ("mk_MK", 0.0), ("ml_IN", 0.0), ("mn_MN", 0.0), ("mr_IN", 0.0), ("pl_PL", 0.0), ("ps_AF", 0.0), ("pt_XX", 0.0), ("sv_SE", 0.0), ("sw_KE", 0.0), ("ta_IN", 0.0), ("te_IN", 0.0), ("th_TH", 0.0), ("tl_XX", 0.0), ("uk_UA", 0.0), ("ur_PK", 0.0), ("xh_ZA", 0.0), ("gl_ES", 0.0), ("sl_SI", 0.0)]
-        # fmt: on
+        vocab += [("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0), ("en_XX", 0.0), ("es_XX", 0.0), ("et_EE", 0.0), ("fi_FI", 0.0), ("fr_XX", 0.0), ("gu_IN", 0.0), ("hi_IN", 0.0), ("it_IT", 0.0), ("ja_XX", 0.0), ("kk_KZ", 0.0), ("ko_KR", 0.0), ("lt_LT", 0.0), ("lv_LV", 0.0), ("my_MM", 0.0), ("ne_NP", 0.0), ("nl_XX", 0.0), ("ro_RO", 0.0), ("ru_RU", 0.0), ("si_LK", 0.0), ("tr_TR", 0.0), ("vi_VN", 0.0), ("zh_CN", 0.0), ("af_ZA", 0.0), ("az_AZ", 0.0), ("bn_IN", 0.0), ("fa_IR", 0.0), ("he_IL", 0.0), ("hr_HR", 0.0), ("id_ID", 0.0), ("ka_GE", 0.0), ("km_KH", 0.0), ("mk_MK", 0.0), ("ml_IN", 0.0), ("mn_MN", 0.0), ("mr_IN", 0.0), ("pl_PL", 0.0), ("ps_AF", 0.0), ("pt_XX", 0.0), ("sv_SE", 0.0), ("sw_KE", 0.0), ("ta_IN", 0.0), ("te_IN", 0.0), ("th_TH", 0.0), ("tl_XX", 0.0), ("uk_UA", 0.0), ("ur_PK", 0.0), ("xh_ZA", 0.0), ("gl_ES", 0.0), ("sl_SI", 0.0)]  # fmt: skip
         vocab += [("<mask>", 0.0)]
         return vocab

@@ -753,11 +751,7 @@ class NllbConverter(SpmConverter):
             ("<unk>", 0.0),
         ]
         vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
-        vocab += [
-            # fmt: off
-            ('ace_Arab', 0.0), ('ace_Latn', 0.0), ('acm_Arab', 0.0), ('acq_Arab', 0.0), ('aeb_Arab', 0.0), ('afr_Latn', 0.0), ('ajp_Arab', 0.0), ('aka_Latn', 0.0), ('amh_Ethi', 0.0), ('apc_Arab', 0.0), ('arb_Arab', 0.0), ('ars_Arab', 0.0), ('ary_Arab', 0.0), ('arz_Arab', 0.0), ('asm_Beng', 0.0), ('ast_Latn', 0.0), ('awa_Deva', 0.0), ('ayr_Latn', 0.0), ('azb_Arab', 0.0), ('azj_Latn', 0.0), ('bak_Cyrl', 0.0), ('bam_Latn', 0.0), ('ban_Latn', 0.0), ('bel_Cyrl', 0.0), ('bem_Latn', 0.0), ('ben_Beng', 0.0), ('bho_Deva', 0.0), ('bjn_Arab', 0.0), ('bjn_Latn', 0.0), ('bod_Tibt', 0.0), ('bos_Latn', 0.0), ('bug_Latn', 0.0), ('bul_Cyrl', 0.0), ('cat_Latn', 0.0), ('ceb_Latn', 0.0), ('ces_Latn', 0.0), ('cjk_Latn', 0.0), ('ckb_Arab', 0.0), ('crh_Latn', 0.0), ('cym_Latn', 0.0), ('dan_Latn', 0.0), ('deu_Latn', 0.0), ('dik_Latn', 0.0), ('dyu_Latn', 0.0), ('dzo_Tibt', 0.0), ('ell_Grek', 0.0), ('eng_Latn', 0.0), ('epo_Latn', 0.0), ('est_Latn', 0.0), ('eus_Latn', 0.0), ('ewe_Latn', 0.0), ('fao_Latn', 0.0), ('pes_Arab', 0.0), ('fij_Latn', 0.0), ('fin_Latn', 0.0), ('fon_Latn', 0.0), ('fra_Latn', 0.0), ('fur_Latn', 0.0), ('fuv_Latn', 0.0), ('gla_Latn', 0.0), ('gle_Latn', 0.0), ('glg_Latn', 0.0), ('grn_Latn', 0.0), ('guj_Gujr', 0.0), ('hat_Latn', 0.0), ('hau_Latn', 0.0), ('heb_Hebr', 0.0), ('hin_Deva', 0.0), ('hne_Deva', 0.0), ('hrv_Latn', 0.0), ('hun_Latn', 0.0), ('hye_Armn', 0.0), ('ibo_Latn', 0.0), ('ilo_Latn', 0.0), ('ind_Latn', 0.0), ('isl_Latn', 0.0), ('ita_Latn', 0.0), ('jav_Latn', 0.0), ('jpn_Jpan', 0.0), ('kab_Latn', 0.0), ('kac_Latn', 0.0), ('kam_Latn', 0.0), ('kan_Knda', 0.0), ('kas_Arab', 0.0), ('kas_Deva', 0.0), ('kat_Geor', 0.0), ('knc_Arab', 0.0), ('knc_Latn', 0.0), ('kaz_Cyrl', 0.0), ('kbp_Latn', 0.0), ('kea_Latn', 0.0), ('khm_Khmr', 0.0), ('kik_Latn', 0.0), ('kin_Latn', 0.0), ('kir_Cyrl', 0.0), ('kmb_Latn', 0.0), ('kon_Latn', 0.0), ('kor_Hang', 0.0), ('kmr_Latn', 0.0), ('lao_Laoo', 0.0), ('lvs_Latn', 0.0), ('lij_Latn', 0.0), ('lim_Latn', 0.0), ('lin_Latn', 0.0), ('lit_Latn', 0.0), ('lmo_Latn', 0.0), ('ltg_Latn', 0.0), ('ltz_Latn', 0.0), ('lua_Latn', 0.0), ('lug_Latn', 0.0), ('luo_Latn', 0.0), ('lus_Latn', 0.0), ('mag_Deva', 0.0), ('mai_Deva', 0.0), ('mal_Mlym', 0.0), ('mar_Deva', 0.0), ('min_Latn', 0.0), ('mkd_Cyrl', 0.0), ('plt_Latn', 0.0), ('mlt_Latn', 0.0), ('mni_Beng', 0.0), ('khk_Cyrl', 0.0), ('mos_Latn', 0.0), ('mri_Latn', 0.0), ('zsm_Latn', 0.0), ('mya_Mymr', 0.0), ('nld_Latn', 0.0), ('nno_Latn', 0.0), ('nob_Latn', 0.0), ('npi_Deva', 0.0), ('nso_Latn', 0.0), ('nus_Latn', 0.0), ('nya_Latn', 0.0), ('oci_Latn', 0.0), ('gaz_Latn', 0.0), ('ory_Orya', 0.0), ('pag_Latn', 0.0), ('pan_Guru', 0.0), ('pap_Latn', 0.0), ('pol_Latn', 0.0), ('por_Latn', 0.0), ('prs_Arab', 0.0), ('pbt_Arab', 0.0), ('quy_Latn', 0.0), ('ron_Latn', 0.0), ('run_Latn', 0.0), ('rus_Cyrl', 0.0), ('sag_Latn', 0.0), ('san_Deva', 0.0), ('sat_Beng', 0.0), ('scn_Latn', 0.0), ('shn_Mymr', 0.0), ('sin_Sinh', 0.0), ('slk_Latn', 0.0), ('slv_Latn', 0.0), ('smo_Latn', 0.0), ('sna_Latn', 0.0), ('snd_Arab', 0.0), ('som_Latn', 0.0), ('sot_Latn', 0.0), ('spa_Latn', 0.0), ('als_Latn', 0.0), ('srd_Latn', 0.0), ('srp_Cyrl', 0.0), ('ssw_Latn', 0.0), ('sun_Latn', 0.0), ('swe_Latn', 0.0), ('swh_Latn', 0.0), ('szl_Latn', 0.0), ('tam_Taml', 0.0), ('tat_Cyrl', 0.0), ('tel_Telu', 0.0), ('tgk_Cyrl', 0.0), ('tgl_Latn', 0.0), ('tha_Thai', 0.0), ('tir_Ethi', 0.0), ('taq_Latn', 0.0), ('taq_Tfng', 0.0), ('tpi_Latn', 0.0), ('tsn_Latn', 0.0), ('tso_Latn', 0.0), ('tuk_Latn', 0.0), ('tum_Latn', 0.0), ('tur_Latn', 0.0), ('twi_Latn', 0.0), ('tzm_Tfng', 0.0), ('uig_Arab', 0.0),
-            ('ukr_Cyrl', 0.0), ('umb_Latn', 0.0), ('urd_Arab', 0.0), ('uzn_Latn', 0.0), ('vec_Latn', 0.0), ('vie_Latn', 0.0), ('war_Latn', 0.0), ('wol_Latn', 0.0), ('xho_Latn', 0.0), ('ydd_Hebr', 0.0), ('yor_Latn', 0.0), ('yue_Hant', 0.0), ('zho_Hans', 0.0), ('zho_Hant', 0.0), ('zul_Latn', 0.0)
-            # fmt: on
-        ]
+        vocab += [('ace_Arab', 0.0), ('ace_Latn', 0.0), ('acm_Arab', 0.0), ('acq_Arab', 0.0), ('aeb_Arab', 0.0), ('afr_Latn', 0.0), ('ajp_Arab', 0.0), ('aka_Latn', 0.0), ('amh_Ethi', 0.0), ('apc_Arab', 0.0), ('arb_Arab', 0.0), ('ars_Arab', 0.0), ('ary_Arab', 0.0), ('arz_Arab', 0.0), ('asm_Beng', 0.0), ('ast_Latn', 0.0), ('awa_Deva', 0.0), ('ayr_Latn', 0.0), ('azb_Arab', 0.0), ('azj_Latn', 0.0), ('bak_Cyrl', 0.0), ('bam_Latn', 0.0), ('ban_Latn', 0.0), ('bel_Cyrl', 0.0), ('bem_Latn', 0.0), ('ben_Beng', 0.0), ('bho_Deva', 0.0), ('bjn_Arab', 0.0), ('bjn_Latn', 0.0), ('bod_Tibt', 0.0), ('bos_Latn', 0.0), ('bug_Latn', 0.0), ('bul_Cyrl', 0.0), ('cat_Latn', 0.0), ('ceb_Latn', 0.0), ('ces_Latn', 0.0), ('cjk_Latn', 0.0), ('ckb_Arab', 0.0), ('crh_Latn', 0.0), ('cym_Latn', 0.0), ('dan_Latn', 0.0), ('deu_Latn', 0.0), ('dik_Latn', 0.0), ('dyu_Latn', 0.0), ('dzo_Tibt', 0.0), ('ell_Grek', 0.0), ('eng_Latn', 0.0), ('epo_Latn', 0.0), ('est_Latn', 0.0), ('eus_Latn', 0.0), ('ewe_Latn', 0.0), ('fao_Latn', 0.0), ('pes_Arab', 0.0), ('fij_Latn', 0.0), ('fin_Latn', 0.0), ('fon_Latn', 0.0), ('fra_Latn', 0.0), ('fur_Latn', 0.0), ('fuv_Latn', 0.0), ('gla_Latn', 0.0), ('gle_Latn', 0.0), ('glg_Latn', 0.0), ('grn_Latn', 0.0), ('guj_Gujr', 0.0), ('hat_Latn', 0.0), ('hau_Latn', 0.0), ('heb_Hebr', 0.0), ('hin_Deva', 0.0), ('hne_Deva', 0.0), ('hrv_Latn', 0.0), ('hun_Latn', 0.0), ('hye_Armn', 0.0), ('ibo_Latn', 0.0), ('ilo_Latn', 0.0), ('ind_Latn', 0.0), ('isl_Latn', 0.0), ('ita_Latn', 0.0), ('jav_Latn', 0.0), ('jpn_Jpan', 0.0), ('kab_Latn', 0.0), ('kac_Latn', 0.0), ('kam_Latn', 0.0), ('kan_Knda', 0.0), ('kas_Arab', 0.0), ('kas_Deva', 0.0), ('kat_Geor', 0.0), ('knc_Arab', 0.0), ('knc_Latn', 0.0), ('kaz_Cyrl', 0.0), ('kbp_Latn', 0.0), ('kea_Latn', 0.0), ('khm_Khmr', 0.0), ('kik_Latn', 0.0), ('kin_Latn', 0.0), ('kir_Cyrl', 0.0), ('kmb_Latn', 0.0), ('kon_Latn', 0.0), ('kor_Hang', 0.0), ('kmr_Latn', 0.0), ('lao_Laoo', 0.0), ('lvs_Latn', 0.0), ('lij_Latn', 0.0), ('lim_Latn', 0.0), ('lin_Latn', 0.0), ('lit_Latn', 0.0), ('lmo_Latn', 0.0), ('ltg_Latn', 0.0), ('ltz_Latn', 0.0), ('lua_Latn', 0.0), ('lug_Latn', 0.0), ('luo_Latn', 0.0), ('lus_Latn', 0.0), ('mag_Deva', 0.0), ('mai_Deva', 0.0), ('mal_Mlym', 0.0), ('mar_Deva', 0.0), ('min_Latn', 0.0), ('mkd_Cyrl', 0.0), ('plt_Latn', 0.0), ('mlt_Latn', 0.0), ('mni_Beng', 0.0), ('khk_Cyrl', 0.0), ('mos_Latn', 0.0), ('mri_Latn', 0.0), ('zsm_Latn', 0.0), ('mya_Mymr', 0.0), ('nld_Latn', 0.0), ('nno_Latn', 0.0), ('nob_Latn', 0.0), ('npi_Deva', 0.0), ('nso_Latn', 0.0), ('nus_Latn', 0.0), ('nya_Latn', 0.0), ('oci_Latn', 0.0), ('gaz_Latn', 0.0), ('ory_Orya', 0.0), ('pag_Latn', 0.0), ('pan_Guru', 0.0), ('pap_Latn', 0.0), ('pol_Latn', 0.0), ('por_Latn', 0.0), ('prs_Arab', 0.0), ('pbt_Arab', 0.0), ('quy_Latn', 0.0), ('ron_Latn', 0.0), ('run_Latn', 0.0), ('rus_Cyrl', 0.0), ('sag_Latn', 0.0), ('san_Deva', 0.0), ('sat_Beng', 0.0), ('scn_Latn', 0.0), ('shn_Mymr', 0.0), ('sin_Sinh', 0.0), ('slk_Latn', 0.0), ('slv_Latn', 0.0), ('smo_Latn', 0.0), ('sna_Latn', 0.0), ('snd_Arab', 0.0), ('som_Latn', 0.0), ('sot_Latn', 0.0), ('spa_Latn', 0.0), ('als_Latn', 0.0), ('srd_Latn', 0.0), ('srp_Cyrl', 0.0), ('ssw_Latn', 0.0), ('sun_Latn', 0.0), ('swe_Latn', 0.0), ('swh_Latn', 0.0), ('szl_Latn', 0.0), ('tam_Taml', 0.0), ('tat_Cyrl', 0.0), ('tel_Telu', 0.0), ('tgk_Cyrl', 0.0), ('tgl_Latn', 0.0), ('tha_Thai', 0.0), ('tir_Ethi', 0.0), ('taq_Latn', 0.0), ('taq_Tfng', 0.0), ('tpi_Latn', 0.0), ('tsn_Latn', 0.0), ('tso_Latn', 0.0), ('tuk_Latn', 0.0), ('tum_Latn', 0.0), ('tur_Latn', 0.0), ('twi_Latn', 0.0), ('tzm_Tfng', 0.0), ('uig_Arab', 0.0), ('ukr_Cyrl', 0.0), ('umb_Latn', 0.0), ('urd_Arab', 0.0), ('uzn_Latn', 0.0), ('vec_Latn', 0.0), ('vie_Latn', 0.0), ('war_Latn', 0.0), ('wol_Latn', 0.0), ('xho_Latn', 0.0), ('ydd_Hebr', 0.0), ('yor_Latn', 0.0), ('yue_Hant', 0.0), ('zho_Hans', 0.0), ('zho_Hant', 0.0), ('zul_Latn', 0.0)]  # fmt: skip
         vocab += [("<mask>", 0.0)]
         return vocab

@@ -1128,9 +1122,7 @@ class XGLMConverter(SpmConverter):
             ("<unk>", 0.0),
         ]
         vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
-        # fmt: off
-        vocab += [("<madeupword0>", 0.0), ("<madeupword1>", 0.0), ("<madeupword2>", 0.0), ("<madeupword3>", 0.0), ("<madeupword4>", 0.0), ("<madeupword5>", 0.0), ("<madeupword6>", 0.0)]
-        # fmt: on
+        vocab += [("<madeupword0>", 0.0), ("<madeupword1>", 0.0), ("<madeupword2>", 0.0), ("<madeupword3>", 0.0), ("<madeupword4>", 0.0), ("<madeupword5>", 0.0), ("<madeupword6>", 0.0)]  # fmt: skip
         return vocab

     def unk_id(self, proto):
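
The converter hunks above collapse `# fmt: off` / `# fmt: on` pairs into a trailing `# fmt: skip`, which ruff's formatter (like Black) honors on a single statement. A minimal illustration with made-up data:

```python
# `# fmt: off` / `# fmt: on` suppress formatting for a whole region:
# fmt: off
identity = [
    1, 0, 0,
    0, 1, 0,
    0, 0, 1,
]
# fmt: on

# `# fmt: skip` suppresses formatting for just the statement it trails,
# so a long one-line vocab list no longer needs the off/on pair:
vocab = [("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0)]  # fmt: skip
```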

View File

@@ -121,7 +121,7 @@ def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any
         if isinstance(first["label_ids"], torch.Tensor):
             batch["labels"] = torch.stack([f["label_ids"] for f in features])
         else:
-            dtype = torch.long if type(first["label_ids"][0]) is int else torch.float
+            dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float
             batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)

     # Handling of all other possible keys.

@@ -196,7 +196,7 @@ def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any
         if isinstance(first["label_ids"], np.ndarray):
             batch["labels"] = np.stack([f["label_ids"] for f in features])
         else:
-            dtype = np.int64 if type(first["label_ids"][0]) is int else np.float32
+            dtype = np.int64 if isinstance(first["label_ids"][0], int) else np.float32
             batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)

     # Handling of all other possible keys.

View File

@@ -6,7 +6,6 @@ deps = {
     "accelerate": "accelerate>=0.20.3",
     "av": "av==9.2.0",
     "beautifulsoup4": "beautifulsoup4",
-    "black": "black~=23.1",
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.3",
     "dataclasses": "dataclasses",

@@ -62,7 +61,7 @@ deps = {
     "rhoknp": "rhoknp>=1.1.0,<1.3.1",
     "rjieba": "rjieba",
     "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff": "ruff>=0.0.241,<=0.0.259",
+    "ruff": "ruff>=0.1.5,<=0.2",
     "sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses": "sacremoses",
     "safetensors": "safetensors>=0.3.1",

View File

@@ -245,8 +245,7 @@ def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]
         and isinstance(annotation["annotations"], (list, tuple))
         and (
             # an image can have no annotations
-            len(annotation["annotations"]) == 0
-            or isinstance(annotation["annotations"][0], dict)
+            len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
         )
     ):
         return True

@@ -262,8 +261,7 @@ def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]])
         and isinstance(annotation["segments_info"], (list, tuple))
         and (
             # an image can have no segments
-            len(annotation["segments_info"]) == 0
-            or isinstance(annotation["segments_info"][0], dict)
+            len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
         )
     ):
         return True

View File

@@ -179,6 +179,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
         - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
           models, `pixel_values` for vision models and `input_values` for speech models).
     """
+
     config_class = None
     base_model_prefix = ""
     main_input_name = "input_ids"

View File

@@ -1075,6 +1075,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
           models, `pixel_values` for vision models and `input_values` for speech models).
     """
+
     config_class = None
     base_model_prefix = ""
     main_input_name = "input_ids"

@@ -3242,6 +3243,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         kwargs (`Dict[str, Any]`, *optional*):
             Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
     """
+
     # TODO (joao): flagged for delection due to embeddings refactor
     def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):

View File

@@ -1095,6 +1095,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
           models, `pixel_values` for vision models and `input_values` for speech models).
     """
+
     config_class = None
     base_model_prefix = ""
     main_input_name = "input_ids"

View File

@@ -97,6 +97,7 @@ class AlignTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "align_text_model"

     def __init__(

View File

@@ -100,6 +100,7 @@ class AltCLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "altclip_text_model"

     def __init__(

View File

@@ -174,8 +174,7 @@ class AltCLIPOutput(ModelOutput):
         text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
             The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
         image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The image embeddings obtained by applying the projection layer to the pooled output of
-            [`AltCLIPVisionModel`].
+            The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
         text_model_output(`BaseModelOutputWithPooling`):
             The output of the [`AltCLIPTextModel`].
         vision_model_output(`BaseModelOutputWithPooling`):

@@ -1049,9 +1048,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.out_proj.weight, std=out_proj_std)
         elif isinstance(module, AltCLIPMLP):
             factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)

View File

@@ -35,6 +35,7 @@ class AltCLIPProcessor(ProcessorMixin):
         tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
             The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "CLIPImageProcessor"
     tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

View File

@@ -86,6 +86,7 @@ class ASTConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "audio-spectrogram-transformer"

     def __init__(

View File

@@ -131,6 +131,7 @@ class AutoformerConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "autoformer"
     attribute_map = {
         "hidden_size": "d_model",

View File

@@ -46,6 +46,7 @@ class BarkProcessor(ProcessorMixin):
             a list of `voice_preset_names`.

     """
+
     tokenizer_class = "AutoTokenizer"
     attributes = ["tokenizer"]

View File

@@ -107,6 +107,7 @@ class BartConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bart"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

View File

@@ -147,6 +147,7 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
         trim_offsets (`bool`, *optional*, defaults to `True`):
             Whether the post processing step should trim offsets to avoid including whitespaces.
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

View File

@@ -115,6 +115,7 @@ class BeitConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "beit"

     def __init__(

View File

@@ -136,6 +136,7 @@ class BertConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bert"

     def __init__(

View File

@@ -84,6 +84,7 @@ class BertGenerationConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bert-generation"

     def __init__(

View File

@@ -104,6 +104,7 @@ class BigBirdConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "big_bird"

     def __init__(

View File

@@ -896,15 +896,11 @@ class BigBirdBlockSparseAttention(nn.Module):
         # global keys (corresponding to 1st key block)
         attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
             :, :, :, :, :to_block_size
-        ].view(
-            bsz, n_heads, -1, to_block_size
-        )  # first_band_product
+        ].view(bsz, n_heads, -1, to_block_size)  # first_band_product
         # global keys (corresponding to last key block)
         attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[
             :, :, :, :, -to_block_size:
-        ].view(
-            bsz, n_heads, -1, to_block_size
-        )  # last_band_product
+        ].view(bsz, n_heads, -1, to_block_size)  # last_band_product

         # random keys
         for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights):
             # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch

View File

@@ -120,6 +120,7 @@ class BigBirdPegasusConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bigbird_pegasus"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {

View File

@@ -683,15 +683,11 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
         # global keys (corresponding to 1st key block)
         attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
             :, :, :, :, :to_block_size
-        ].view(
-            bsz, n_heads, -1, to_block_size
-        )  # first_band_product
+        ].view(bsz, n_heads, -1, to_block_size)  # first_band_product
         # global keys (corresponding to last key block)
         attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[
             :, :, :, :, -to_block_size:
-        ].view(
-            bsz, n_heads, -1, to_block_size
-        )  # last_band_product
+        ].view(bsz, n_heads, -1, to_block_size)  # last_band_product

         # random keys
         for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights):
             # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch

View File

@@ -93,6 +93,7 @@ class BioGptConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "biogpt"

     def __init__(

View File

@@ -85,6 +85,7 @@ class BitConfig(BackboneConfigMixin, PretrainedConfig):
     >>> configuration = model.config
     ```
     """
+
     model_type = "bit"
     layer_types = ["preactivation", "bottleneck"]
     supported_padding = ["SAME", "VALID"]

View File

@@ -104,6 +104,7 @@ class BlenderbotConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "blenderbot"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

View File

@@ -1511,9 +1511,7 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
         >>> from transformers import AutoTokenizer, BlenderbotForCausalLM

         >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
-        >>> model = BlenderbotForCausalLM.from_pretrained(
-        ...     "facebook/blenderbot-400M-distill", add_cross_attention=False
-        ... )
+        >>> model = BlenderbotForCausalLM.from_pretrained("facebook/blenderbot-400M-distill", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)

View File

@@ -376,8 +376,8 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does
-        not make use of token type ids, therefore a list of zeros is returned.
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not
+        make use of token type ids, therefore a list of zeros is returned.

         Args:
             token_ids_0 (`List[int]`):
View File

@@ -212,8 +212,8 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
         `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
         having been set.

-        Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will
-        greedily comprise the space before the *<mask>*.
+        Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the *<mask>*.
         """
         if self._mask_token is None:
             if self.verbose:

@@ -264,8 +264,8 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does
-        not make use of token type ids, therefore a list of zeros is returned.
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not
+        make use of token type ids, therefore a list of zeros is returned.

         Args:
             token_ids_0 (`List[int]`):

View File

@@ -104,6 +104,7 @@ class BlenderbotSmallConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "blenderbot-small"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

View File

@@ -1478,9 +1478,7 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
         >>> from transformers import AutoTokenizer, BlenderbotSmallForCausalLM

         >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
-        >>> model = BlenderbotSmallForCausalLM.from_pretrained(
-        ...     "facebook/blenderbot_small-90M", add_cross_attention=False
-        ... )
+        >>> model = BlenderbotSmallForCausalLM.from_pretrained("facebook/blenderbot_small-90M", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)

View File

@@ -109,6 +109,7 @@ class BlipTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "blip_text_model"

     def __init__(

View File

@@ -742,13 +742,13 @@ class BlipTextModel(BlipTextPreTrainedModel):
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if encoder_hidden_states is not None:
-            if type(encoder_hidden_states) == list:
+            if isinstance(encoder_hidden_states, list):
                 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
             else:
                 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

-            if type(encoder_attention_mask) == list:
+            if isinstance(encoder_attention_mask, list):
                 encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
             elif encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

View File

@@ -741,13 +741,13 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if encoder_hidden_states is not None:
-            if type(encoder_hidden_states) == list:
+            if isinstance(encoder_hidden_states, list):
                 encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states[0])
             else:
                 encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states)
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

-            if type(encoder_attention_mask) == list:
+            if isinstance(encoder_attention_mask, list):
                 encoder_extended_attention_mask = [invert_attention_mask(mask) for mask in encoder_attention_mask]
             elif encoder_attention_mask is None:
                 encoder_attention_mask = tf.ones(encoder_hidden_shape)

@@ -37,6 +37,7 @@ class BlipProcessor(ProcessorMixin):
         tokenizer (`BertTokenizerFast`):
             An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "BlipImageProcessor"
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

@@ -190,6 +190,7 @@ class Blip2QFormerConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "blip_2_qformer"

     def __init__(

@@ -1123,13 +1123,13 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if encoder_hidden_states is not None:
-            if type(encoder_hidden_states) == list:
+            if isinstance(encoder_hidden_states, list):
                 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
             else:
                 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

-            if type(encoder_attention_mask) == list:
+            if isinstance(encoder_attention_mask, list):
                 encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
             elif encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

@@ -37,6 +37,7 @@ class Blip2Processor(ProcessorMixin):
         tokenizer (`AutoTokenizer`):
             An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "BlipImageProcessor"
     tokenizer_class = "AutoTokenizer"
@@ -141,8 +142,8 @@ class Blip2Processor(ProcessorMixin):

     # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
     def decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
-        to the docstring of this method for more information.
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)

@@ -73,6 +73,7 @@ class BridgeTowerVisionConfig(PretrainedConfig):
     >>> # Accessing the configuration
     >>> configuration
     ```"""
+
     model_type = "bridgetower_vision_model"

     def __init__(
@@ -179,6 +180,7 @@ class BridgeTowerTextConfig(PretrainedConfig):
     >>> # Accessing the configuration
     >>> configuration
     ```"""
+
     model_type = "bridgetower_text_model"

     def __init__(
@@ -291,6 +293,7 @@ class BridgeTowerConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bridgetower"

     def __init__(

@@ -46,7 +46,7 @@ _TOKENIZER_FOR_DOC = "RobertaTokenizer"
 BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "BridgeTower/bridgetower-base",
-    "BridgeTower/bridgetower-base-itm-mlm"
+    "BridgeTower/bridgetower-base-itm-mlm",
     # See all bridgetower models at https://huggingface.co/BridgeTower
 ]
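The trailing comma added here (and in the CANINE list below) does two things: it engages the formatter's magic trailing comma, keeping the list expanded one entry per line, and it removes a latent implicit-string-concatenation trap for whoever appends the next checkpoint without noticing the missing comma. A sketch of the trap, with hypothetical names:

```python
# Without a trailing comma, adding an entry on the next line silently
# concatenates the two adjacent string literals instead of appending:
CHECKPOINTS = [
    "org/model-base",
    "org/model-large"   # <- missing comma
    "org/model-new",
]

assert len(CHECKPOINTS) == 2  # two elements, not three!
assert CHECKPOINTS[1] == "org/model-largeorg/model-new"
```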

@@ -38,6 +38,7 @@ class BridgeTowerProcessor(ProcessorMixin):
         tokenizer (`RobertaTokenizerFast`):
             An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "BridgeTowerImageProcessor"
     tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")

@@ -90,6 +90,7 @@ class BrosConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "bros"

     def __init__(

@@ -34,6 +34,7 @@ class BrosProcessor(ProcessorMixin):
         tokenizer (`BertTokenizerFast`, *optional*):
             An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
     """
+
     attributes = ["tokenizer"]
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

@@ -95,6 +95,7 @@ class CanineConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "canine"

     def __init__(

@@ -54,7 +54,7 @@ _CONFIG_FOR_DOC = "CanineConfig"
 CANINE_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "google/canine-s",
-    "google/canine-r"
+    "google/canine-r",
     # See all CANINE models at https://huggingface.co/models?filter=canine
 ]

@@ -106,6 +106,7 @@ class ChineseCLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "chinese_clip_text_model"

     def __init__(

@@ -718,9 +718,7 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.out_proj.weight, std=out_proj_std)
         elif isinstance(module, ChineseCLIPVisionMLP):
             factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)
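These `-9 +7` hunks (repeated for CLIP, CLIPSeg, and CLVP below) are pure reflow: assuming the repository's 119-character line length (Black and ruff default to 88), the parenthesized right-hand side now fits on one line, so the formatter drops the wrapper. Both forms compute the same value, as this runnable sketch with stand-in numbers shows:

```python
hidden_size, num_hidden_layers, factor = 768, 12, 1.0

# Wrapped form: what the code looked like under a tighter line limit.
in_proj_std = (
    (hidden_size**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor
)

# Collapsed form: identical value, emitted once the expression fits within 119 columns.
in_proj_std_collapsed = (hidden_size**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor

assert in_proj_std == in_proj_std_collapsed
```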

@@ -36,6 +36,7 @@ class ChineseCLIPProcessor(ProcessorMixin):
         tokenizer ([`BertTokenizerFast`], *optional*):
             The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "ChineseCLIPImageProcessor"
     tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

@@ -97,6 +97,7 @@ class ClapTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clap_text_model"

     def __init__(

@@ -33,6 +33,7 @@ class ClapProcessor(ProcessorMixin):
         tokenizer ([`RobertaTokenizerFast`]):
             The tokenizer is a required input.
     """
+
     feature_extractor_class = "ClapFeatureExtractor"
     tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")

@@ -96,6 +96,7 @@ class CLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clip_text_model"

     def __init__(

@@ -421,9 +421,7 @@ class CLIPPreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.out_proj.weight, std=out_proj_std)
         elif isinstance(module, CLIPMLP):
             factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)

@@ -35,6 +35,7 @@ class CLIPProcessor(ProcessorMixin):
         tokenizer ([`CLIPTokenizerFast`], *optional*):
             The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "CLIPImageProcessor"
     tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

@@ -86,6 +86,7 @@ class CLIPSegTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clipseg_text_model"

     def __init__(

@@ -77,8 +77,7 @@ class CLIPSegOutput(ModelOutput):
         text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
             The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
         image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
-            The image embeddings obtained by applying the projection layer to the pooled output of
-            [`CLIPSegVisionModel`].
+            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
         text_model_output(`BaseModelOutputWithPooling`):
             The output of the [`CLIPSegTextModel`].
         vision_model_output(`BaseModelOutputWithPooling`):
@@ -443,9 +442,7 @@ class CLIPSegPreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.out_proj.weight, std=out_proj_std)
         elif isinstance(module, CLIPSegMLP):
             factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)

@@ -35,6 +35,7 @@ class CLIPSegProcessor(ProcessorMixin):
         tokenizer ([`CLIPTokenizerFast`], *optional*):
             The tokenizer is a required input.
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "ViTImageProcessor"
     tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")

@@ -684,9 +684,7 @@ class ClvpPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
         elif isinstance(module, ClvpEncoderMLP):
             factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.proj.weight if getattr(module.fc1, "proj") else module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)
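One context line here is worth flagging: `getattr(module.fc1, "proj")` uses the two-argument form, which raises `AttributeError` when the attribute is missing, so this line only works if `ClvpEncoderMLP.fc1` always defines `proj`; a presence check would normally use the three-argument form `getattr(module.fc1, "proj", None)`. A quick demonstration of the difference:

```python
class FC:
    weight = "w"


fc1 = FC()

# Three-argument getattr falls back to the default when "proj" is absent:
print(getattr(fc1, "proj", None))  # None

# Two-argument getattr raises instead of returning a falsy value:
try:
    getattr(fc1, "proj")
except AttributeError as err:
    print("raised:", err)
```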

@@ -34,6 +34,7 @@ class ClvpProcessor(ProcessorMixin):
         tokenizer (`ClvpTokenizer`):
             An instance of [`ClvpTokenizer`]. The tokenizer is a required input.
     """
+
     feature_extractor_class = "ClvpFeatureExtractor"
     tokenizer_class = "ClvpTokenizer"
     model_input_names = [
@@ -76,15 +77,15 @@ class ClvpProcessor(ProcessorMixin):
     # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp
     def batch_decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
-        to the docstring of this method for more information.
+        This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)

     # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.decode with Whisper->Clvp
     def decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
-        docstring of this method for more information.
+        This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)

@@ -105,6 +105,7 @@ class CodeGenConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "codegen"
     attribute_map = {
         "max_position_embeddings": "n_positions",

@@ -134,6 +134,7 @@ class ConditionalDetrConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "conditional_detr"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {

@@ -478,8 +478,7 @@ def post_process_panoptic_sample(
     threshold=0.85,
 ) -> Dict:
     """
-    Converts the output of [`ConditionalDetrForSegmentation`] into panoptic segmentation predictions for a single
-    sample.
+    Converts the output of [`ConditionalDetrForSegmentation`] into panoptic segmentation predictions for a single sample.

     Args:
         out_logits (`torch.Tensor`):
@@ -1454,8 +1453,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.post_process_semantic_segmentation with Detr->ConditionalDetr
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
         """
-        Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports
-        PyTorch.
+        Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.

         Args:
             outputs ([`ConditionalDetrForSegmentation`]):
@@ -1511,8 +1509,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         return_coco_annotation: Optional[bool] = False,
     ) -> List[Dict]:
         """
-        Converts the output of [`ConditionalDetrForSegmentation`] into instance segmentation predictions. Only supports
-        PyTorch.
+        Converts the output of [`ConditionalDetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.

         Args:
             outputs ([`ConditionalDetrForSegmentation`]):
@@ -1596,8 +1593,8 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         target_sizes: Optional[List[Tuple[int, int]]] = None,
     ) -> List[Dict]:
         """
-        Converts the output of [`ConditionalDetrForSegmentation`] into image panoptic segmentation predictions. Only
-        supports PyTorch.
+        Converts the output of [`ConditionalDetrForSegmentation`] into image panoptic segmentation predictions. Only supports
+        PyTorch.

         Args:
             outputs ([`ConditionalDetrForSegmentation`]):

@@ -153,8 +153,8 @@ class ConditionalDetrObjectDetectionOutput(ModelOutput):
         pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
             Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
             values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve
-            the unnormalized bounding boxes.
+            possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
             Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
@@ -217,14 +217,14 @@ class ConditionalDetrSegmentationOutput(ModelOutput):
         pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
             Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
             values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-            possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve
-            the unnormalized bounding boxes.
+            possible padding). You can use [`~ConditionalDetrImageProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
         pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
             Segmentation masks logits for all queries. See also
             [`~ConditionalDetrImageProcessor.post_process_semantic_segmentation`] or
             [`~ConditionalDetrImageProcessor.post_process_instance_segmentation`]
-            [`~ConditionalDetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and
-            panoptic segmentation masks respectively.
+            [`~ConditionalDetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
+            segmentation masks respectively.
         auxiliary_outputs (`list[Dict]`, *optional*):
             Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and

@@ -96,6 +96,7 @@ class ConvBertConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "convbert"

     def __init__(

@@ -263,8 +263,8 @@ class ConvBertTokenizer(PreTrainedTokenizer):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT
-        sequence pair mask has the following format:
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT sequence
+        pair mask has the following format:

         ```
         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1

@@ -168,8 +168,8 @@ class ConvBertTokenizerFast(PreTrainedTokenizerFast):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT
-        sequence pair mask has the following format:
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ConvBERT sequence
+        pair mask has the following format:

         ```
         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1

@@ -87,6 +87,7 @@ class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "convnext"

     def __init__(

@@ -79,6 +79,7 @@ class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "convnextv2"

     def __init__(

@@ -84,6 +84,7 @@ class CpmAntConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "cpmant"

     def __init__(

@@ -96,6 +96,7 @@ class CvtConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "cvt"

     def __init__(

@@ -168,6 +168,7 @@ class Data2VecAudioConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "data2vec-audio"

     def __init__(

@@ -95,6 +95,7 @@ class Data2VecTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "data2vec-text"

     def __init__(

@@ -111,6 +111,7 @@ class Data2VecVisionConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "data2vec-vision"

     def __init__(

@@ -289,8 +289,8 @@ class Data2VecVisionSelfAttention(nn.Module):
 # Copied from transformers.models.beit.modeling_beit.BeitSelfOutput with Beit->Data2VecVision
 class Data2VecVisionSelfOutput(nn.Module):
     """
-    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due
-    to the layernorm applied before each block.
+    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
     """

     def __init__(self, config: Data2VecVisionConfig) -> None:
