Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

commit acf1484b86 (parent fa15c53d22)

fix review comments

Signed-off-by: root <root@a4bf01945cfe.jf.intel.com>
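The hunks below apply two recurring, purely mechanical restyles: an assert's message moves from after the parenthesized condition to a parenthesized message after the condition, and operators inside f-string replacement fields gain surrounding spaces. A minimal, runnable sketch of the before/after pattern; the names here (pad_token_id, nbenchmark, BS, elapsed) are placeholders for illustration, not taken from any one file in the diff:

# Illustrative only: both spellings are functionally identical.
pad_token_id = 0
nbenchmark, BS, elapsed = 100, 8, 2.5

# old style
assert (
    pad_token_id is not None
), "pad_token_id must be defined."
print(f"Throughput: {((nbenchmark * BS)/elapsed):.3f} examples/sec")

# new style
assert pad_token_id is not None, (
    "pad_token_id must be defined."
)
print(f"Throughput: {((nbenchmark * BS) / elapsed):.3f} examples/sec")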
@@ -53,4 +53,4 @@ for _ in range(nbenchmark):
func()
end = time.time()
print(end - start)
- print(f"Throughput: {((nbenchmark * BS)/(end-start)):.3f} examples/sec")
+ print(f"Throughput: {((nbenchmark * BS) / (end - start)):.3f} examples/sec")
@@ -231,9 +231,9 @@ def main():

# set decoder_start_token_id for MBart
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
- assert (
- data_args.tgt_lang is not None and data_args.src_lang is not None
- ), "mBart requires --tgt_lang and --src_lang"
+ assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
+ "mBart requires --tgt_lang and --src_lang"
+ )
if isinstance(tokenizer, MBartTokenizer):
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
else:
@@ -128,7 +128,7 @@ def run_search():

results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
- print(" | ".join([f"{'-'*col_widths[col]}" for col in col_names]))
+ print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
for row in results_sorted:
print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))

@@ -282,9 +282,9 @@ class Seq2SeqDataCollator:
self.tokenizer = tokenizer
self.pad_token_id = tokenizer.pad_token_id
self.decoder_start_token_id = decoder_start_token_id
- assert (
- self.pad_token_id is not None
- ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
+ assert self.pad_token_id is not None, (
+ f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
+ )
self.data_args = data_args
self.tpu_num_cores = tpu_num_cores
self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
@@ -593,7 +593,7 @@ def assert_all_frozen(model):
model_grads: List[bool] = list(grad_status(model))
n_require_grad = sum(lmap(int, model_grads))
npars = len(model_grads)
- assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
+ assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"


def assert_not_all_frozen(model):
@@ -131,7 +131,7 @@ class POS(TokenClassificationTask):
s_p = preds_list[example_id]
out = ""
for token in sentence:
- out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
+ out += f"{token['form']} ({token['upos']}|{s_p.pop(0)}) "
out += "\n"
writer.write(out)
example_id += 1
@@ -534,7 +534,7 @@ class Multimodal2VisionEmbeddings(nn.Module):
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
- f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
)
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@@ -438,7 +438,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
- logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+ logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
@@ -265,8 +265,7 @@ class DataTrainingArguments:
default="<fim_pad>",
metadata={
"help": (
- "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. "
- "Defaults to '<fim_pad>'."
+ "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."
)
},
)
@@ -514,7 +513,7 @@ def main():
attn_implementation=model_args.attn_implementation,
)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
- logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+ logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# Add the new FIM tokens to the tokenizer and resize model's vocab embeddings
special_tokens = [data_args.fim_prefix_token, data_args.fim_middle_token, data_args.fim_suffix_token]
@@ -234,9 +234,7 @@ def parse_args():
"--fim_pad_token",
type=str,
default="<fim_pad>",
- help=(
- "Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True." " Defaults to '<fim_pad>'."
- ),
+ help=("Fill-in-Middle Pad token. Used only when 'truncate_or_pad' is set to True. Defaults to '<fim_pad>'."),
)
parser.add_argument(
"--preprocessing_num_workers",
@ -491,7 +491,7 @@ def main():
|
||||
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
||||
# that could be easily picked up by the model
|
||||
chars_to_ignore_regex = (
|
||||
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
||||
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
|
||||
)
|
||||
text_column_name = data_args.text_column_name
|
||||
|
||||
|
@ -471,7 +471,7 @@ def main():
|
||||
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
||||
# that could be easily picked up by the model
|
||||
chars_to_ignore_regex = (
|
||||
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
||||
f"[{''.join(data_args.chars_to_ignore)}]" if data_args.chars_to_ignore is not None else None
|
||||
)
|
||||
text_column_name = data_args.text_column_name
|
||||
|
||||
|
@ -505,9 +505,9 @@ def main():
|
||||
return
|
||||
|
||||
if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
|
||||
assert (
|
||||
data_args.lang is not None
|
||||
), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
|
||||
assert data_args.lang is not None, (
|
||||
f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
|
||||
)
|
||||
|
||||
tokenizer.src_lang = data_args.lang
|
||||
tokenizer.tgt_lang = data_args.lang
|
||||
|
@ -199,9 +199,9 @@ class DataTrainingArguments:
|
||||
train_extension = self.train_file.split(".")[-1]
|
||||
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||
validation_extension = self.validation_file.split(".")[-1]
|
||||
assert (
|
||||
validation_extension == train_extension
|
||||
), "`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||
assert validation_extension == train_extension, (
|
||||
"`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -357,9 +357,9 @@ def main():
|
||||
if data_args.test_file is not None:
|
||||
train_extension = data_args.train_file.split(".")[-1]
|
||||
test_extension = data_args.test_file.split(".")[-1]
|
||||
assert (
|
||||
test_extension == train_extension
|
||||
), "`test_file` should have the same extension (csv or json) as `train_file`."
|
||||
assert test_extension == train_extension, (
|
||||
"`test_file` should have the same extension (csv or json) as `train_file`."
|
||||
)
|
||||
data_files["test"] = data_args.test_file
|
||||
else:
|
||||
raise ValueError("Need either a dataset name or a test file for `do_predict`.")
|
||||
|
@ -156,9 +156,9 @@ class DataTrainingArguments:
|
||||
train_extension = self.train_file.split(".")[-1]
|
||||
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||
validation_extension = self.validation_file.split(".")[-1]
|
||||
assert (
|
||||
validation_extension == train_extension
|
||||
), "`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||
assert validation_extension == train_extension, (
|
||||
"`validation_file` should have the same extension (csv or json) as `train_file`."
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -313,9 +313,9 @@ def main():
|
||||
if data_args.test_file is not None:
|
||||
train_extension = data_args.train_file.split(".")[-1]
|
||||
test_extension = data_args.test_file.split(".")[-1]
|
||||
assert (
|
||||
test_extension == train_extension
|
||||
), "`test_file` should have the same extension (csv or json) as `train_file`."
|
||||
assert test_extension == train_extension, (
|
||||
"`test_file` should have the same extension (csv or json) as `train_file`."
|
||||
)
|
||||
data_files["test"] = data_args.test_file
|
||||
else:
|
||||
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
|
||||
|
@ -322,7 +322,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--use_cpu",
|
||||
action="store_true",
|
||||
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
|
||||
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
|
||||
)
|
||||
parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
|
||||
parser.add_argument(
|
||||
|
@ -68,7 +68,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--use_cpu",
|
||||
action="store_true",
|
||||
help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
|
||||
help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
|
@ -436,9 +436,9 @@ def main():
|
||||
|
||||
# Set decoder_start_token_id
|
||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||
assert (
|
||||
args.target_lang is not None and args.source_lang is not None
|
||||
), "mBart requires --target_lang and --source_lang"
|
||||
assert args.target_lang is not None and args.source_lang is not None, (
|
||||
"mBart requires --target_lang and --source_lang"
|
||||
)
|
||||
if isinstance(tokenizer, MBartTokenizer):
|
||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
|
||||
else:
|
||||
|
@ -56,7 +56,7 @@ if __name__ == "__main__":
|
||||
cluster.run(["pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117"])
|
||||
|
||||
# Run example. You can bypass the CLI wrapper and paste your own code here.
|
||||
cluster.run([f'python transformers/examples/{args.example} {" ".join(shlex.quote(arg) for arg in unknown)}'])
|
||||
cluster.run([f"python transformers/examples/{args.example} {' '.join(shlex.quote(arg) for arg in unknown)}"])
|
||||
|
||||
# Alternatively, we can just import and run a training function (especially if there's no wrapper CLI):
|
||||
# from my_script... import train
|
||||
|
@ -501,9 +501,9 @@ def main():
|
||||
|
||||
# region Set decoder_start_token_id
|
||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||
assert (
|
||||
data_args.target_lang is not None and data_args.source_lang is not None
|
||||
), "mBart requires --target_lang and --source_lang"
|
||||
assert data_args.target_lang is not None and data_args.source_lang is not None, (
|
||||
"mBart requires --target_lang and --source_lang"
|
||||
)
|
||||
if isinstance(tokenizer, MBartTokenizer):
|
||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
|
||||
else:
|
||||
|
@ -167,9 +167,9 @@ class Tool:
|
||||
)
|
||||
for input_name, input_content in self.inputs.items():
|
||||
assert isinstance(input_content, dict), f"Input '{input_name}' should be a dictionary."
|
||||
assert (
|
||||
"type" in input_content and "description" in input_content
|
||||
), f"Input '{input_name}' should have keys 'type' and 'description', has only {list(input_content.keys())}."
|
||||
assert "type" in input_content and "description" in input_content, (
|
||||
f"Input '{input_name}' should have keys 'type' and 'description', has only {list(input_content.keys())}."
|
||||
)
|
||||
if input_content["type"] not in authorized_types:
|
||||
raise Exception(
|
||||
f"Input '{input_name}': type '{input_content['type']}' is not an authorized value, should be one of {authorized_types}."
|
||||
|
@@ -9,8 +9,7 @@ import torch
from packaging import version

from .configuration_utils import PretrainedConfig
- from .pytorch_utils import is_torch_greater_or_equal_than_2_7
- from .utils import is_hqq_available, is_optimum_quanto_available, logging
+ from .utils import is_hqq_available, is_optimum_quanto_available, is_torch_greater_or_equal, logging


if is_hqq_available():
@@ -550,23 +549,23 @@ class OffloadedCache(DynamicCache):
"""

def __init__(self) -> None:
- if not (torch.cuda.is_available() or (is_torch_greater_or_equal_than_2_7 and torch.xpu.is_available())):
+ if not (torch.cuda.is_available() or (is_torch_greater_or_equal("2.7.0") and torch.xpu.is_available())):
raise RuntimeError(
"OffloadedCache can only be used with a GPU"
- + (" or XPU" if is_torch_greater_or_equal_than_2_7 else "")
+ + (" or XPU" if is_torch_greater_or_equal("2.7.0") else "")
)

super().__init__()
self.original_device = []
self.prefetch_stream = None
- self.prefetch_stream = torch.Stream() if is_torch_greater_or_equal_than_2_7 else torch.cuda.Stream()
+ self.prefetch_stream = torch.Stream() if is_torch_greater_or_equal("2.7.0") else torch.cuda.Stream()
self.beam_idx = None # used to delay beam search operations

def prefetch_layer(self, layer_idx: int):
"Starts prefetching the next layer cache"
if layer_idx < len(self):
with (
- self.prefetch_stream if is_torch_greater_or_equal_than_2_7 else torch.cuda.stream(self.prefetch_stream)
+ self.prefetch_stream if is_torch_greater_or_equal("2.7.0") else torch.cuda.stream(self.prefetch_stream)
):
# Prefetch next layer tensors to GPU
device = self.original_device[layer_idx]
@@ -585,7 +584,7 @@ class OffloadedCache(DynamicCache):
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self):
# Evict the previous layer if necessary
- if is_torch_greater_or_equal_than_2_7:
+ if is_torch_greater_or_equal("2.7.0"):
torch.accelerator.current_stream().synchronize()
else:
torch.cuda.current_stream().synchronize()
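The cache hunks above swap the pinned is_torch_greater_or_equal_than_2_7 flag for the string-based is_torch_greater_or_equal helper imported from .utils. A minimal sketch of how that gate picks the stream API at runtime, assuming only what the hunks themselves show (the helper accepts a version string, and torch.Stream exists on torch >= 2.7); the function name make_prefetch_stream is hypothetical:

import torch
from transformers.utils import is_torch_greater_or_equal

def make_prefetch_stream():
    # torch >= 2.7 exposes a device-agnostic Stream (CUDA or XPU);
    # older versions fall back to the CUDA-only stream API.
    if is_torch_greater_or_equal("2.7.0"):
        return torch.Stream()
    return torch.cuda.Stream()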
@@ -288,7 +288,7 @@ def add_fast_image_processor_to_dummy(fast_image_processor_name: str):
if new_dummy_object not in content:
if index_new != len(image_processor_names) - 1:
# add the dummy object just before the next ImageProcessorFast
- first_line = f"class {image_processor_names[index_new+1]}(metaclass=DummyObject):"
+ first_line = f"class {image_processor_names[index_new + 1]}(metaclass=DummyObject):"
updated_content = content.replace(first_line, new_dummy_object + "\n\n" + first_line)
else:
# add the dummy object at the very end
@ -313,11 +313,9 @@ def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name:
|
||||
raise ValueError(f"No doc files found for {model_name}")
|
||||
|
||||
base_doc_string = (
|
||||
f"## {fast_image_processor_name[:-4]}\n\n" f"[[autodoc]] {fast_image_processor_name[:-4]}\n" " - preprocess"
|
||||
)
|
||||
fast_doc_string = (
|
||||
f"## {fast_image_processor_name}\n\n" f"[[autodoc]] {fast_image_processor_name}\n" " - preprocess"
|
||||
f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess"
|
||||
)
|
||||
fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess"
|
||||
|
||||
for doc_file in doc_files:
|
||||
with open(doc_file, "r", encoding="utf-8") as f:
|
||||
@ -385,7 +383,7 @@ def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name
|
||||
# add the fast image processor to the imports
|
||||
base_import_string = f" from transformers import {fast_image_processor_name[:-4]}"
|
||||
fast_import_string = (
|
||||
" if is_torchvision_available():\n" f" from transformers import {fast_image_processor_name}"
|
||||
f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}"
|
||||
)
|
||||
if fast_import_string not in updated_content:
|
||||
updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string)
|
||||
@ -546,17 +544,17 @@ def add_fast_image_processor_file(
|
||||
" # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n"
|
||||
" # Default values should be checked against the slow image processor\n"
|
||||
" # None values left after checking can be removed\n"
|
||||
f' resample = {default_args_dict.get("resample")}\n'
|
||||
f' image_mean = {default_args_dict.get("image_mean")}\n'
|
||||
f' image_std = {default_args_dict.get("image_std")}\n'
|
||||
f' size = {default_args_dict.get("size")}\n'
|
||||
f' default_to_square = {default_args_dict.get("default_to_square")}\n'
|
||||
f' crop_size = {default_args_dict.get("crop_size")}\n'
|
||||
f' do_resize = {default_args_dict.get("do_resize")}\n'
|
||||
f' do_center_crop = {default_args_dict.get("do_center_crop")}\n'
|
||||
f' do_rescale = {default_args_dict.get("do_rescale")}\n'
|
||||
f' do_normalize = {default_args_dict.get("do_normalize")}\n'
|
||||
f' do_convert_rgb = {default_args_dict.get("do_convert_rgb")}\n\n\n'
|
||||
f" resample = {default_args_dict.get('resample')}\n"
|
||||
f" image_mean = {default_args_dict.get('image_mean')}\n"
|
||||
f" image_std = {default_args_dict.get('image_std')}\n"
|
||||
f" size = {default_args_dict.get('size')}\n"
|
||||
f" default_to_square = {default_args_dict.get('default_to_square')}\n"
|
||||
f" crop_size = {default_args_dict.get('crop_size')}\n"
|
||||
f" do_resize = {default_args_dict.get('do_resize')}\n"
|
||||
f" do_center_crop = {default_args_dict.get('do_center_crop')}\n"
|
||||
f" do_rescale = {default_args_dict.get('do_rescale')}\n"
|
||||
f" do_normalize = {default_args_dict.get('do_normalize')}\n"
|
||||
f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n"
|
||||
f'__all__ = ["{fast_image_processor_name}"]\n'
|
||||
)
|
||||
|
||||
|
@@ -1587,7 +1587,7 @@ class TikTokenConverter:
from tiktoken.load import load_tiktoken_bpe
except Exception:
raise ValueError(
- "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`."
+ "`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`."
)

bpe_ranks = load_tiktoken_bpe(tiktoken_url)
@@ -206,7 +206,7 @@ class DebugUnderflowOverflow:
self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")

def batch_end_frame(self):
- self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number-1} ***\n\n")
+ self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number - 1} ***\n\n")

def create_frame(self, module, input, output):
self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")
@@ -2749,9 +2749,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
ngram keys (batch_size, num_ngrams, depth).
"""
if len(ngrams.shape) != 3:
- raise ValueError(
- "Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but" f" is {ngrams.shape}"
- )
+ raise ValueError(f"Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but is {ngrams.shape}")
if ngrams.shape[2] != self.ngram_len:
raise ValueError(
"Ngrams should be of shape (batch_size, num_ngrams, ngram_len),"
@@ -2836,7 +2834,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
def _check_input_ids_shape(self, input_ids: torch.LongTensor):
"""Checks the shape of input ids."""
if len(input_ids.shape) != 2:
- raise ValueError("Input ids should be of shape (batch_size, input_len), but is" f" {input_ids.shape}")
+ raise ValueError(f"Input ids should be of shape (batch_size, input_len), but is {input_ids.shape}")

def compute_g_values(self, input_ids: torch.LongTensor) -> torch.LongTensor:
"""
@@ -21,9 +21,9 @@ def tpu_spmd_dataloader(dataloader: DataLoader):
if is_torch_xla_available():
import torch_xla.distributed.parallel_loader as pl

- assert isinstance(
- dataloader, pl.MpDeviceLoader
- ), "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
+ assert isinstance(dataloader, pl.MpDeviceLoader), (
+ "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
+ )

# This is to support PyTorch/XLA FSDP via SPMD.
# Here we shard the input data's 0th dim across the fsdp axis.
@ -153,7 +153,7 @@ def flax_shard_checkpoint(params, max_shard_size="10GB"):
|
||||
weight_map = {}
|
||||
shards = {}
|
||||
for idx, shard in enumerate(sharded_state_dicts):
|
||||
shard_file = FLAX_WEIGHTS_NAME.replace(".msgpack", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.msgpack")
|
||||
shard_file = FLAX_WEIGHTS_NAME.replace(".msgpack", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.msgpack")
|
||||
shards[shard_file] = shard
|
||||
for weight_name in shard.keys():
|
||||
weight_map[weight_name] = shard_file
|
||||
|
@ -701,7 +701,7 @@ def tf_shard_checkpoint(weights, max_shard_size="10GB", weights_name: str = TF2_
|
||||
weight_map = {}
|
||||
shards = {}
|
||||
for idx, shard in enumerate(sharded_state_dicts):
|
||||
shard_file = weights_name.replace(".h5", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.h5")
|
||||
shard_file = weights_name.replace(".h5", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.h5")
|
||||
shard_file = shard_file.replace(
|
||||
".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
|
||||
)
|
||||
|
@ -2486,9 +2486,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
total_decoder_name="",
|
||||
total_encoder_name="",
|
||||
):
|
||||
assert isinstance(decoder_pointer, nn.Module) and isinstance(
|
||||
encoder_pointer, nn.Module
|
||||
), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
|
||||
assert isinstance(decoder_pointer, nn.Module) and isinstance(encoder_pointer, nn.Module), (
|
||||
f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
|
||||
)
|
||||
if hasattr(decoder_pointer, "weight"):
|
||||
assert hasattr(encoder_pointer, "weight")
|
||||
encoder_pointer.weight = decoder_pointer.weight
|
||||
@ -2502,9 +2502,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
||||
encoder_modules = encoder_pointer._modules
|
||||
decoder_modules = decoder_pointer._modules
|
||||
if len(decoder_modules) > 0:
|
||||
assert (
|
||||
len(encoder_modules) > 0
|
||||
), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
|
||||
assert len(encoder_modules) > 0, (
|
||||
f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
|
||||
)
|
||||
|
||||
all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules.keys()}
|
||||
encoder_layer_pos = 0
|
||||
@ -5412,9 +5412,9 @@ class PoolerEndLogits(nn.Module):
|
||||
Returns:
|
||||
`torch.FloatTensor`: The end logits for SQuAD.
|
||||
"""
|
||||
assert (
|
||||
start_states is not None or start_positions is not None
|
||||
), "One of start_states, start_positions should be not None"
|
||||
assert start_states is not None or start_positions is not None, (
|
||||
"One of start_states, start_positions should be not None"
|
||||
)
|
||||
if start_positions is not None:
|
||||
slen, hsz = hidden_states.shape[-2:]
|
||||
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
|
||||
@ -5480,9 +5480,9 @@ class PoolerAnswerClass(nn.Module):
|
||||
"""
|
||||
# No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
|
||||
hsz = hidden_states.shape[-1]
|
||||
assert (
|
||||
start_states is not None or start_positions is not None
|
||||
), "One of start_states, start_positions should be not None"
|
||||
assert start_states is not None or start_positions is not None, (
|
||||
"One of start_states, start_positions should be not None"
|
||||
)
|
||||
if start_positions is not None:
|
||||
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
|
||||
start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
|
||||
|
@ -1058,7 +1058,7 @@ class AltCLIPVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
|
@ -150,7 +150,7 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"):
|
||||
model.load_state_dict(state_dict, strict=False)
|
||||
n_params = model.num_parameters(exclude_embeddings=True)
|
||||
val_loss = checkpoint["best_val_loss"].item()
|
||||
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
|
||||
logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params, {round(val_loss, 3)} loss")
|
||||
model.eval()
|
||||
model.to(device)
|
||||
del checkpoint, state_dict
|
||||
|
@ -103,7 +103,7 @@ class BarkProcessor(ProcessorMixin):
|
||||
)
|
||||
if speaker_embeddings_path is None:
|
||||
logger.warning(
|
||||
f"""`{os.path.join(pretrained_processor_name_or_path,speaker_embeddings_dict_path)}` does not exists
|
||||
f"""`{os.path.join(pretrained_processor_name_or_path, speaker_embeddings_dict_path)}` does not exists
|
||||
, no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
|
||||
dictionnary if wanted, otherwise set `speaker_embeddings_dict_path=None`."""
|
||||
)
|
||||
@ -202,7 +202,7 @@ class BarkProcessor(ProcessorMixin):
|
||||
)
|
||||
if path is None:
|
||||
raise ValueError(
|
||||
f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"),voice_preset_paths[key])}` does not exists
|
||||
f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"), voice_preset_paths[key])}` does not exists
|
||||
, no preloaded voice preset will be used - Make sure to provide correct paths to the {voice_preset}
|
||||
embeddings."""
|
||||
)
|
||||
|
@ -329,7 +329,7 @@ class BridgeTowerVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
|
@ -234,7 +234,7 @@ class ChineseCLIPVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
|
@ -74,7 +74,7 @@ def rename_state_dict(state_dict):
|
||||
# replace sequential layers with list
|
||||
sequential_layer = re.match(sequential_layers_pattern, key).group(1)
|
||||
|
||||
key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.")
|
||||
key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer) // 3}.linear.")
|
||||
elif re.match(text_projection_pattern, key):
|
||||
projecton_layer = int(re.match(text_projection_pattern, key).group(1))
|
||||
|
||||
|
@ -242,7 +242,7 @@ class CLIPVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
|
@ -209,7 +209,7 @@ class CLIPSegVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
|
||||
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
||||
|
@ -144,7 +144,7 @@ class ClvpEncoderConfig(PretrainedConfig):
|
||||
# this is to make sure that we can load only text or speech configs from the nested ClvpConfig.
|
||||
if config_type not in cls.base_config_key:
|
||||
raise ValueError(
|
||||
f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}"
|
||||
f"We can only load either 'text_config' or 'speech_config' but you are trying to load{config_type}"
|
||||
)
|
||||
|
||||
# get the text config dict if we are loading from ClvpConfig
|
||||
|
@ -190,8 +190,8 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
||||
if eos is None and self.add_eos_token:
|
||||
raise ValueError("add_eos_token = True but eos_token = None")
|
||||
|
||||
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
|
||||
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
|
||||
|
||||
special_tokens = []
|
||||
if self.add_bos_token:
|
||||
|
@ -198,8 +198,8 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
|
||||
if eos is None and self.add_eos_token:
|
||||
raise ValueError("add_eos_token = True but eos_token = None")
|
||||
|
||||
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
|
||||
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
|
||||
|
||||
special_tokens = []
|
||||
if self.add_bos_token:
|
||||
|
@ -127,9 +127,9 @@ def convert_data2vec_checkpoint_to_pytorch(
|
||||
|
||||
# self-attention output
|
||||
self_output: BertSelfOutput = layer.attention.output
|
||||
assert (
|
||||
self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape
|
||||
), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
|
||||
assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, (
|
||||
f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
|
||||
)
|
||||
self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight
|
||||
self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias
|
||||
self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight
|
||||
@ -137,17 +137,17 @@ def convert_data2vec_checkpoint_to_pytorch(
|
||||
|
||||
# intermediate
|
||||
intermediate: BertIntermediate = layer.intermediate
|
||||
assert (
|
||||
intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape
|
||||
), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
|
||||
assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, (
|
||||
f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
|
||||
)
|
||||
intermediate.dense.weight = data2vec_layer.fc1.weight
|
||||
intermediate.dense.bias = data2vec_layer.fc1.bias
|
||||
|
||||
# output
|
||||
bert_output: BertOutput = layer.output
|
||||
assert (
|
||||
bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape
|
||||
), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
|
||||
assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, (
|
||||
f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
|
||||
)
|
||||
bert_output.dense.weight = data2vec_layer.fc2.weight
|
||||
bert_output.dense.bias = data2vec_layer.fc2.bias
|
||||
bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight
|
||||
|
@ -1491,7 +1491,7 @@ class TFData2VecVisionFCNHead(keras.layers.Layer):
|
||||
kernel_size=kernel_size,
|
||||
padding="same",
|
||||
dilation=dilation,
|
||||
name=f"conv_module_{i+2}",
|
||||
name=f"conv_module_{i + 2}",
|
||||
)
|
||||
)
|
||||
if self.num_convs == 0:
|
||||
|
@ -180,9 +180,9 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f
|
||||
gluon_param = to_torch(params[gluon_param])
|
||||
shape_gluon = gluon_param.shape
|
||||
|
||||
assert (
|
||||
shape_hf == shape_gluon
|
||||
), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
|
||||
assert shape_hf == shape_gluon, (
|
||||
f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
|
||||
)
|
||||
|
||||
return gluon_param
|
||||
|
||||
|
@ -200,7 +200,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
|
||||
# handle missmatched shape
|
||||
elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape:
|
||||
val = model_state_dict[f"{key_prefix}.{key}"]
|
||||
print(f"{original_key}-> {key} : \nshape {val.shape} and { value.shape}, do not match")
|
||||
print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match")
|
||||
key = original_key
|
||||
|
||||
mapping[key] = original_key
|
||||
|
@ -2366,7 +2366,7 @@ class JukeboxModel(JukeboxPreTrainedModel):
|
||||
new_tokens = sample_tokens - previous_sampled_tokens.shape[1]
|
||||
|
||||
logger.info(
|
||||
f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on"
|
||||
f"Sampling {sample_tokens} tokens for [{start},{start + sample_tokens}]. Conditioning on"
|
||||
f" {conditioning_tokens} tokens"
|
||||
)
|
||||
|
||||
@ -2390,7 +2390,7 @@ class JukeboxModel(JukeboxPreTrainedModel):
|
||||
name = ["Ancestral", "Primed"][music_tokens_i.shape[1] == 0]
|
||||
iterator.set_description(
|
||||
f"[prior level {level}] {name} Sampling {sample_tokens} tokens out of"
|
||||
f" {self.total_length//prior.raw_to_tokens}",
|
||||
f" {self.total_length // prior.raw_to_tokens}",
|
||||
refresh=True,
|
||||
)
|
||||
tokens_i = prior.sample(
|
||||
|
@ -154,7 +154,7 @@ class OpenLlamaConfig(PretrainedConfig):
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
|
||||
f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
|
@ -139,9 +139,9 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
|
||||
elif m_name == "kernel":
|
||||
array = np.transpose(array)
|
||||
try:
|
||||
assert (
|
||||
pointer.shape == array.shape
|
||||
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||
assert pointer.shape == array.shape, (
|
||||
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||
)
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
|
@ -579,7 +579,7 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
|
||||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
logger.warning_once(
|
||||
"`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" " False`..."
|
||||
"`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
|
@ -1095,9 +1095,9 @@ class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenc
|
||||
batch_size, sequence_length = shape_list(input_ids)[:2]
|
||||
else:
|
||||
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
|
||||
assert (
|
||||
self.config.pad_token_id is not None or batch_size == 1
|
||||
), "Cannot handle batch sizes > 1 if no padding token is defined."
|
||||
assert self.config.pad_token_id is not None or batch_size == 1, (
|
||||
"Cannot handle batch sizes > 1 if no padding token is defined."
|
||||
)
|
||||
|
||||
if not tf.is_tensor(sequence_lengths):
|
||||
in_logits = logits[0:batch_size, sequence_lengths]
|
||||
|
@ -155,9 +155,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
||||
p_i.data = torch.from_numpy(arr_i)
|
||||
else:
|
||||
try:
|
||||
assert (
|
||||
pointer.shape == array.shape
|
||||
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||
assert pointer.shape == array.shape, (
|
||||
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
|
||||
)
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
@ -1238,9 +1238,9 @@ class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
|
||||
else:
|
||||
batch_size, sequence_length = inputs_embeds.shape[:2]
|
||||
|
||||
assert (
|
||||
self.config.pad_token_id is not None or batch_size == 1
|
||||
), "Cannot handle batch sizes > 1 if no padding token is defined."
|
||||
assert self.config.pad_token_id is not None or batch_size == 1, (
|
||||
"Cannot handle batch sizes > 1 if no padding token is defined."
|
||||
)
|
||||
if self.config.pad_token_id is None:
|
||||
sequence_lengths = -1
|
||||
else:
|
||||
|
@ -588,9 +588,9 @@ class XLMProphetNetPositionalEmbeddings(nn.Embedding):
|
||||
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
|
||||
|
||||
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
|
||||
assert (position_ids is None) or (
|
||||
self.padding_idx is None
|
||||
), "If position_ids is pre-computed then padding_idx should not be set."
|
||||
assert (position_ids is None) or (self.padding_idx is None), (
|
||||
"If position_ids is pre-computed then padding_idx should not be set."
|
||||
)
|
||||
|
||||
if position_ids is None:
|
||||
if past_key_values is not None:
|
||||
@ -784,9 +784,9 @@ class XLMProphetNetNgramSelfAttention(nn.Module):
|
||||
self.head_dim = config.hidden_size // self.num_attn_heads
|
||||
self.ngram = config.ngram
|
||||
|
||||
assert (
|
||||
self.head_dim * self.num_attn_heads == config.hidden_size
|
||||
), "config.hidden_size must be divisible by num_attn_heads"
|
||||
assert self.head_dim * self.num_attn_heads == config.hidden_size, (
|
||||
"config.hidden_size must be divisible by num_attn_heads"
|
||||
)
|
||||
# key, value, query projection
|
||||
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
@ -1041,9 +1041,9 @@ class XLMProphetNetNgramSelfAttention(nn.Module):
|
||||
|
||||
if predict_relative_position_buckets is None:
|
||||
key_sequence_length = attn_weights.shape[-1]
|
||||
assert (
|
||||
position_ids[0][0] == key_sequence_length - 1
|
||||
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
|
||||
assert position_ids[0][0] == key_sequence_length - 1, (
|
||||
"`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
|
||||
)
|
||||
relative_positions = (
|
||||
torch.arange(0, key_sequence_length)
|
||||
.unsqueeze(0)
|
||||
@ -1313,9 +1313,9 @@ class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
|
||||
|
||||
# check if head_mask has a correct number of layers specified if desired
|
||||
if head_mask is not None:
|
||||
assert head_mask.size()[0] == (
|
||||
len(self.layers)
|
||||
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||
assert head_mask.size()[0] == (len(self.layers)), (
|
||||
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||
)
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_hidden_states = encoder_hidden_states + (hidden_states,)
|
||||
@ -1488,9 +1488,9 @@ class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
|
||||
|
||||
# prepare attention mask
|
||||
if past_key_values is not None:
|
||||
assert (
|
||||
hidden_states.size(1) == 1
|
||||
), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
|
||||
assert hidden_states.size(1) == 1, (
|
||||
"At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
|
||||
)
|
||||
|
||||
ngram_hidden_states = [
|
||||
(ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
|
||||
|
@ -114,7 +114,7 @@ class DepthProConfig(PretrainedConfig):
|
||||
# scaled_images_ratios is sorted
|
||||
if scaled_images_ratios != sorted(scaled_images_ratios):
|
||||
raise ValueError(
|
||||
f"Values in scaled_images_ratios={scaled_images_ratios} " "should be sorted from low to high"
|
||||
f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high"
|
||||
)
|
||||
|
||||
# scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent
|
||||
|
@ -275,9 +275,9 @@ class FlaxTransformerBlock(nn.Module):
|
||||
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
|
||||
|
||||
def setup(self):
|
||||
assert (
|
||||
self.config.dim % self.config.n_heads == 0
|
||||
), f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}"
|
||||
assert self.config.dim % self.config.n_heads == 0, (
|
||||
f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}"
|
||||
)
|
||||
|
||||
self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype)
|
||||
self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
|
||||
|
@ -269,9 +269,9 @@ class TFTransformerBlock(keras.layers.Layer):
|
||||
self.activation = config.activation
|
||||
self.output_attentions = config.output_attentions
|
||||
|
||||
assert (
|
||||
config.dim % config.n_heads == 0
|
||||
), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
|
||||
assert config.dim % config.n_heads == 0, (
|
||||
f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
|
||||
)
|
||||
|
||||
self.attention = TFMultiHeadSelfAttention(config, name="attention")
|
||||
self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
|
||||
|
@ -137,7 +137,7 @@ if __name__ == "__main__":
|
||||
dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest
|
||||
dest_dir = Path(dest_dir)
|
||||
assert src_file.exists()
|
||||
assert (
|
||||
args.type is not None
|
||||
), "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
|
||||
assert args.type is not None, (
|
||||
"Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'."
|
||||
)
|
||||
convert(args.type, src_file, dest_dir)
|
||||
|
@ -170,9 +170,9 @@ class CustomDPRReaderTokenizerMixin:
|
||||
texts = texts if not isinstance(texts, str) else [texts]
|
||||
n_passages = len(titles)
|
||||
questions = questions if not isinstance(questions, str) else [questions] * n_passages
|
||||
assert len(titles) == len(
|
||||
texts
|
||||
), f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
|
||||
assert len(titles) == len(texts), (
|
||||
f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
|
||||
)
|
||||
encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
|
||||
encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
|
||||
encoded_inputs = {
|
||||
|
@ -119,7 +119,7 @@ def rename_key(name):
|
||||
if "refinenet" in name:
|
||||
layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1])
|
||||
# tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3
|
||||
name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")
|
||||
name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}")
|
||||
if "out_conv" in name:
|
||||
name = name.replace("out_conv", "projection")
|
||||
if "resConfUnit1" in name:
|
||||
|
@ -107,7 +107,7 @@ def rename_key(name):
|
||||
if "refinenet" in name:
|
||||
layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1])
|
||||
# tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3
|
||||
name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx-4)}")
|
||||
name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}")
|
||||
if "out_conv" in name:
|
||||
name = name.replace("out_conv", "projection")
|
||||
if "resConfUnit1" in name:
|
||||
|
@ -617,8 +617,7 @@ class EncodecModel(EncodecPreTrainedModel):
|
||||
bandwidth = self.config.target_bandwidths[0]
|
||||
if bandwidth not in self.config.target_bandwidths:
|
||||
raise ValueError(
|
||||
f"This model doesn't support the bandwidth {bandwidth}. "
|
||||
f"Select one of {self.config.target_bandwidths}."
|
||||
f"This model doesn't support the bandwidth {bandwidth}. Select one of {self.config.target_bandwidths}."
|
||||
)
|
||||
|
||||
_, channels, input_length = input_values.shape
|
||||
|
@@ -399,13 +399,11 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis


@functools.lru_cache(maxsize=None)
- def load_stereo_chemical_props() -> (
- Tuple[
- Mapping[str, List[Bond]],
- Mapping[str, List[Bond]],
- Mapping[str, List[BondAngle]],
- ]
- ):
+ def load_stereo_chemical_props() -> Tuple[
+ Mapping[str, List[Bond]],
+ Mapping[str, List[Bond]],
+ Mapping[str, List[BondAngle]],
+ ]:
"""Load stereo_chemical_props.txt into a nice structure.

Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite
@ -1495,9 +1495,9 @@ class FlavaImageCodebookLayerGroup(nn.Module):
|
||||
blocks = OrderedDict()
|
||||
for i in range(num_blocks):
|
||||
if i == 0:
|
||||
blocks[f"block_{i+1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
|
||||
blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
|
||||
else:
|
||||
blocks[f"block_{i+1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)
|
||||
blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)
|
||||
|
||||
if use_pool:
|
||||
blocks["pool"] = nn.MaxPool2d(kernel_size=2)
|
||||
|
@ -539,9 +539,9 @@ class FSMTEncoder(nn.Module):
|
||||
all_attentions = () if output_attentions else None
|
||||
# check if head_mask has a correct number of layers specified if desired
|
||||
if head_mask is not None:
|
||||
assert head_mask.size()[0] == (
|
||||
len(self.layers)
|
||||
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||
assert head_mask.size()[0] == (len(self.layers)), (
|
||||
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
|
||||
)
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
x = x.transpose(0, 1) # T x B x C -> B x T x C
|
||||
@ -960,9 +960,9 @@ class Attention(nn.Module):
|
||||
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
||||
|
||||
if layer_head_mask is not None:
|
||||
assert layer_head_mask.size() == (
|
||||
self.num_heads,
|
||||
), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
|
||||
assert layer_head_mask.size() == (self.num_heads,), (
|
||||
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
|
||||
)
|
||||
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
||||
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
||||
|
||||
|
@ -113,9 +113,9 @@ class FunnelConfig(PretrainedConfig):
|
||||
self.vocab_size = vocab_size
|
||||
self.block_sizes = block_sizes
|
||||
self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
|
||||
assert len(block_sizes) == len(
|
||||
self.block_repeats
|
||||
), "`block_sizes` and `block_repeats` should have the same length."
|
||||
assert len(block_sizes) == len(self.block_repeats), (
|
||||
"`block_sizes` and `block_repeats` should have the same length."
|
||||
)
|
||||
self.num_decoder_layers = num_decoder_layers
|
||||
self.d_model = d_model
|
||||
self.n_head = n_head
|
||||
|
@ -195,7 +195,7 @@ class FuyuConfig(PretrainedConfig):
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
|
||||
f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
|
@ -136,8 +136,8 @@ class GemmaTokenizerFast(PreTrainedTokenizerFast):
|
||||
if eos is None and self.add_eos_token:
|
||||
raise ValueError("add_eos_token = True but eos_token = None")
|
||||
|
||||
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
|
||||
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
|
||||
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
|
||||
|
||||
special_tokens = []
|
||||
if self.add_bos_token:
|
||||
|
@ -683,7 +683,7 @@ class GitVisionEmbeddings(nn.Module):
|
||||
batch_size, _, height, width = pixel_values.shape
|
||||
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
|
||||
raise ValueError(
|
||||
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
|
||||
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
|
||||
)
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
|
@ -40,13 +40,13 @@ def rename_keys(state_dict):
|
||||
if "patch_embed" in key:
|
||||
# replace for example patch_embed1 by patch_embeddings.0
|
||||
idx = key[key.find("patch_embed") + len("patch_embed")]
|
||||
key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx)-1}")
|
||||
key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx) - 1}")
|
||||
if "norm" in key:
|
||||
key = key.replace("norm", "layer_norm")
|
||||
if "glpn.encoder.layer_norm" in key:
|
||||
# replace for example layer_norm1 by layer_norm.0
|
||||
idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")]
|
||||
key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx)-1}")
|
||||
key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx) - 1}")
|
||||
if "layer_norm1" in key:
|
||||
key = key.replace("layer_norm1", "layer_norm_1")
|
||||
if "layer_norm2" in key:
|
||||
@ -54,7 +54,7 @@ def rename_keys(state_dict):
|
||||
if "block" in key:
|
||||
# replace for example block1 by block.0
|
||||
idx = key[key.find("block") + len("block")]
|
||||
key = key.replace(f"block{idx}", f"block.{int(idx)-1}")
|
||||
key = key.replace(f"block{idx}", f"block.{int(idx) - 1}")
|
||||
if "attn.q" in key:
|
||||
key = key.replace("attn.q", "attention.self.query")
|
||||
if "attn.proj" in key:
|
||||
@ -73,7 +73,7 @@ def rename_keys(state_dict):
|
||||
if "linear_c" in key:
|
||||
# replace for example linear_c4 by linear_c.3
|
||||
idx = key[key.find("linear_c") + len("linear_c")]
|
||||
key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx)-1}")
|
||||
key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx) - 1}")
|
||||
if "bot_conv" in key:
|
||||
key = key.replace("bot_conv", "0.convolution")
|
||||
if "skip_conv1" in key:
|
||||
|
@ -154,8 +154,8 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
if eos is None and self.add_eos_token:
raise ValueError("add_eos_token = True but eos_token = None")

single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

special_tokens = []
if self.add_bos_token:
@ -587,7 +587,7 @@ class TFHubertFeatureEncoder(keras.layers.Layer):

if config.feat_extract_norm == "group":
conv_layers = [TFHubertGroupNormConvLayer(config, layer_id=0, name=f"conv_layers.{0}")] + [
TFHubertNoLayerNormConvLayer(config, layer_id=i + 1, name=f"conv_layers.{i+1}")
TFHubertNoLayerNormConvLayer(config, layer_id=i + 1, name=f"conv_layers.{i + 1}")
for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
@ -171,9 +171,9 @@ class QuantAct(nn.Module):
x_min = x_act.data.min()
x_max = x_act.data.max()

assert (
x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0
), "NaN detected when computing min/max of the activation"
assert x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0, (
"NaN detected when computing min/max of the activation"
)

# Initialization
if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
@ -451,7 +451,7 @@ class Kosmos2VisionEmbeddings(nn.Module):
batch_size, _, height, width = pixel_values.shape
if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
)
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
@ -101,8 +101,7 @@ class LayoutXLMProcessor(ProcessorMixin):
# verify input
if self.image_processor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the image processor with apply_ocr set to True."
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
)

if self.image_processor.apply_ocr and (word_labels is not None):
@ -130,12 +130,12 @@ class LEDEncoderSelfAttention(nn.Module):

self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]
assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
assert attention_window % 2 == 0, (
f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
)
assert attention_window > 0, (
f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
)

self.one_sided_attn_window_size = attention_window // 2

@ -169,9 +169,9 @@ class LEDEncoderSelfAttention(nn.Module):
value_vectors = self.value(hidden_states)

seq_len, batch_size, embed_dim = hidden_states.size()
assert (
embed_dim == self.embed_dim
), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
assert embed_dim == self.embed_dim, (
f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
)

# normalize query
query_vectors /= math.sqrt(self.head_dim)

@ -239,9 +239,9 @@ class LEDEncoderSelfAttention(nn.Module):
) # use fp32 for numerical stability

if layer_head_mask is not None:
assert layer_head_mask.size() == (
self.num_heads,
), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
assert layer_head_mask.size() == (self.num_heads,), (
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
)
attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs

# softmax sometimes inserts NaN if all positions are masked, replace them with 0

@ -433,9 +433,9 @@ class LEDEncoderSelfAttention(nn.Module):
overlap of size window_overlap
"""
batch_size, seq_len, num_heads, head_dim = query.size()
assert (
seq_len % (window_overlap * 2) == 0
), f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
assert seq_len % (window_overlap * 2) == 0, (
f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
)
assert query.size() == key.size()

chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1

@ -706,9 +706,9 @@ class LEDEncoderSelfAttention(nn.Module):

# apply layer head masking
if layer_head_mask is not None:
assert layer_head_mask.size() == (
self.num_heads,
), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
assert layer_head_mask.size() == (self.num_heads,), (
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
)
global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view(
batch_size, self.num_heads, max_num_global_attn_indices, seq_len
)
@ -182,12 +182,12 @@ class TFLEDEncoderSelfAttention(keras.layers.Layer):
self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]

assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
assert attention_window % 2 == 0, (
f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
)
assert attention_window > 0, (
f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
)

self.one_sided_attn_window_size = attention_window // 2
@ -192,8 +192,8 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
if eos is None and self.add_eos_token:
raise ValueError("add_eos_token = True but eos_token = None")

single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

special_tokens = []
if self.add_bos_token:
@ -510,12 +510,12 @@ class LongformerSelfAttention(nn.Module):

self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]
assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
assert attention_window % 2 == 0, (
f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
)
assert attention_window > 0, (
f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
)

self.one_sided_attn_window_size = attention_window // 2

@ -549,9 +549,9 @@ class LongformerSelfAttention(nn.Module):
value_vectors = self.value(hidden_states)

seq_len, batch_size, embed_dim = hidden_states.size()
assert (
embed_dim == self.embed_dim
), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
assert embed_dim == self.embed_dim, (
f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}"
)

# normalize query
query_vectors /= math.sqrt(self.head_dim)

@ -619,9 +619,9 @@ class LongformerSelfAttention(nn.Module):
) # use fp32 for numerical stability

if layer_head_mask is not None:
assert layer_head_mask.size() == (
self.num_heads,
), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
assert layer_head_mask.size() == (self.num_heads,), (
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
)
attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs

# softmax sometimes inserts NaN if all positions are masked, replace them with 0

@ -813,9 +813,9 @@ class LongformerSelfAttention(nn.Module):
overlap of size window_overlap
"""
batch_size, seq_len, num_heads, head_dim = query.size()
assert (
seq_len % (window_overlap * 2) == 0
), f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
assert seq_len % (window_overlap * 2) == 0, (
f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}"
)
assert query.size() == key.size()

chunks_count = torch.div(seq_len, window_overlap, rounding_mode="trunc") - 1

@ -1086,9 +1086,9 @@ class LongformerSelfAttention(nn.Module):

# apply layer head masking
if layer_head_mask is not None:
assert layer_head_mask.size() == (
self.num_heads,
), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
assert layer_head_mask.size() == (self.num_heads,), (
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
)
global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view(
batch_size, self.num_heads, max_num_global_attn_indices, seq_len
)

@ -1287,9 +1287,9 @@ class LongformerEncoder(nn.Module):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
assert head_mask.size()[0] == (
len(self.layer)
), f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}."
assert head_mask.size()[0] == (len(self.layer)), (
f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}."
)
for idx, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

@ -1590,8 +1590,7 @@ class LongformerModel(LongformerPreTrainedModel):
# this path should be recorded in the ONNX export, it is fine with padding_len == 0 as well
if padding_len > 0:
logger.warning_once(
f"Input ids are automatically padded to be a multiple of "
f"`config.attention_window`: {attention_window}"
f"Input ids are automatically padded to be a multiple of `config.attention_window`: {attention_window}"
)
if input_ids is not None:
input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
@ -746,12 +746,12 @@ class TFLongformerSelfAttention(keras.layers.Layer):
self.layer_id = layer_id
attention_window = config.attention_window[self.layer_id]

assert (
attention_window % 2 == 0
), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
assert (
attention_window > 0
), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
assert attention_window % 2 == 0, (
f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
)
assert attention_window > 0, (
f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"
)

self.one_sided_attn_window_size = attention_window // 2
@ -1294,7 +1294,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
@ -228,7 +228,7 @@ class TatoebaConverter:
# combine with Tatoeba markdown
readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md"
extra_markdown = f"""
### {model_dict['_name']}
### {model_dict["_name"]}

* source language name: {self.tag2name[a3_src]}
* target language name: {self.tag2name[a3_tgt]}

@ -237,12 +237,12 @@ class TatoebaConverter:

content = (
f"""
* model: {model_dict['modeltype']}
* source language code{src_multilingual*'s'}: {', '.join(a2_src_tags)}
* target language code{tgt_multilingual*'s'}: {', '.join(a2_tgt_tags)}
* model: {model_dict["modeltype"]}
* source language code{src_multilingual * "s"}: {", ".join(a2_src_tags)}
* target language code{tgt_multilingual * "s"}: {", ".join(a2_tgt_tags)}
* dataset: opus {backtranslated_data}
* release date: {model_dict['release-date']}
* pre-processing: {model_dict['pre-processing']}
* release date: {model_dict["release-date"]}
* pre-processing: {model_dict["pre-processing"]}
"""
+ multilingual_data
+ tuned
@ -741,9 +741,9 @@ class MarianEncoder(MarianPreTrainedModel):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
assert head_mask.size()[0] == (
len(self.layers)
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
assert head_mask.size()[0] == (len(self.layers)), (
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
)
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
@ -523,11 +523,11 @@ class OriginalMask2FormerCheckpointToOursConverter:
[
(
f"{src_prefix}.norm{layer_idx}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.weight",
),
(
f"{src_prefix}.norm{layer_idx}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.bias",
),
]
)

@ -863,9 +863,9 @@ def test(
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=tolerance
), "The backbone features are not the same."
assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), (
"The backbone features are not the same."
)

# Test pixel decoder
mask_features, _, multi_scale_features = original_model.sem_seg_head.pixel_decoder.forward_features(

@ -875,9 +875,9 @@ def test(
for original_model_feature, our_model_feature in zip(
multi_scale_features, our_model_output.pixel_decoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=tolerance
), "The pixel decoder feature are not the same"
assert torch.allclose(original_model_feature, our_model_feature, atol=tolerance), (
"The pixel decoder feature are not the same"
)

# Let's test the full model
tr_complete = T.Compose(

@ -894,12 +894,12 @@ def test(

assert original_mask_logits.shape == our_mask_logits.shape, "Output masks shapes are not matching."
assert original_class_logits.shape == our_class_logits.shape, "Output class logits shapes are not matching."
assert torch.allclose(
original_class_logits, our_class_logits, atol=tolerance
), "The class logits are not the same."
assert torch.allclose(
original_mask_logits, our_mask_logits, atol=tolerance
), "The predicted masks are not the same."
assert torch.allclose(original_class_logits, our_class_logits, atol=tolerance), (
"The class logits are not the same."
)
assert torch.allclose(original_mask_logits, our_mask_logits, atol=tolerance), (
"The predicted masks are not the same."
)

logger.info("✅ Test passed!")
@ -581,9 +581,9 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=1e-3
), "The backbone features are not the same."
assert torch.allclose(original_model_feature, our_model_feature, atol=1e-3), (
"The backbone features are not the same."
)

original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features(
original_model_backbone_features

@ -602,9 +602,9 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro

our_segmentation = image_processor.post_process_segmentation(our_model_out, target_size=(384, 384))

assert torch.allclose(
original_segmentation, our_segmentation, atol=1e-3
), "The segmentation image is not the same."
assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3), (
"The segmentation image is not the same."
)

logger.info("✅ Test passed!")
@ -125,31 +125,31 @@ def create_rename_keys(config):
for i in range(3):
rename_keys.append(
(
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i+1}.weight",
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.weight",
f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i+1}.norm.weight",
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.weight",
f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i+1}.norm.bias",
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.bias",
f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i+1}.norm.running_mean",
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.running_mean",
f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i+1}.norm.running_var",
f"backbone.res{stage_idx + 2}.{layer_idx}.conv{i + 1}.norm.running_var",
f"model.pixel_level_module.encoder.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
)
)
@ -129,7 +129,7 @@ def _convert_model(
hf_model.load_state_dict(state_dict, strict=True)
n_params = param_count(hf_model)

logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params")

hf_model.eval()
hf_model.to(device)
@ -144,9 +144,9 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
elif m_name == "kernel":
array = np.transpose(array)
try:
assert (
pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
assert pointer.shape == array.shape, (
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
)
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
@ -99,9 +99,9 @@ def get_mobilevitv2_config(task_name, orig_cfg_file):
orig_config = load_orig_config_file(orig_cfg_file)
assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model"
config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0)
assert (
getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d"
), "Norm layers other than layer_norm_2d is not supported"
assert getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d", (
"Norm layers other than layer_norm_2d is not supported"
)
config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish")
# config.image_size == getattr(orig_config, 'sampler.bs.crop_size_width', 256)
@ -151,7 +151,7 @@ def create_rename_keys(state_dict, base_model=False):
k_new = k_new.replace("conv_1.", f"{model_prefix}conv_stem.")
for i in [1, 2]:
if f"layer_{i}." in k:
k_new = k_new.replace(f"layer_{i}.", f"{model_prefix}encoder.layer.{i-1}.layer.")
k_new = k_new.replace(f"layer_{i}.", f"{model_prefix}encoder.layer.{i - 1}.layer.")
if ".exp_1x1." in k:
k_new = k_new.replace(".exp_1x1.", ".expand_1x1.")
if ".red_1x1." in k:

@ -159,11 +159,11 @@ def create_rename_keys(state_dict, base_model=False):

for i in [3, 4, 5]:
if f"layer_{i}.0." in k:
k_new = k_new.replace(f"layer_{i}.0.", f"{model_prefix}encoder.layer.{i-1}.downsampling_layer.")
k_new = k_new.replace(f"layer_{i}.0.", f"{model_prefix}encoder.layer.{i - 1}.downsampling_layer.")
if f"layer_{i}.1.local_rep.0." in k:
k_new = k_new.replace(f"layer_{i}.1.local_rep.0.", f"{model_prefix}encoder.layer.{i-1}.conv_kxk.")
k_new = k_new.replace(f"layer_{i}.1.local_rep.0.", f"{model_prefix}encoder.layer.{i - 1}.conv_kxk.")
if f"layer_{i}.1.local_rep.1." in k:
k_new = k_new.replace(f"layer_{i}.1.local_rep.1.", f"{model_prefix}encoder.layer.{i-1}.conv_1x1.")
k_new = k_new.replace(f"layer_{i}.1.local_rep.1.", f"{model_prefix}encoder.layer.{i - 1}.conv_1x1.")

for i in [3, 4, 5]:
if i == 3:

@ -176,15 +176,17 @@ def create_rename_keys(state_dict, base_model=False):
for j in j_in:
if f"layer_{i}.1.global_rep.{j}." in k:
k_new = k_new.replace(
f"layer_{i}.1.global_rep.{j}.", f"{model_prefix}encoder.layer.{i-1}.transformer.layer.{j}."
f"layer_{i}.1.global_rep.{j}.", f"{model_prefix}encoder.layer.{i - 1}.transformer.layer.{j}."
)
if f"layer_{i}.1.global_rep.{j+1}." in k:
if f"layer_{i}.1.global_rep.{j + 1}." in k:
k_new = k_new.replace(
f"layer_{i}.1.global_rep.{j+1}.", f"{model_prefix}encoder.layer.{i-1}.layernorm."
f"layer_{i}.1.global_rep.{j + 1}.", f"{model_prefix}encoder.layer.{i - 1}.layernorm."
)

if f"layer_{i}.1.conv_proj." in k:
k_new = k_new.replace(f"layer_{i}.1.conv_proj.", f"{model_prefix}encoder.layer.{i-1}.conv_projection.")
k_new = k_new.replace(
f"layer_{i}.1.conv_proj.", f"{model_prefix}encoder.layer.{i - 1}.conv_projection."
)

if "pre_norm_attn.0." in k:
k_new = k_new.replace("pre_norm_attn.0.", "layernorm_before.")
@ -56,7 +56,7 @@ def _read_h5_weights(group, current_key="", weights={}):
def _convert_layer_names(name, gated_mlp=False):
name = re.sub(
r"layers\.functional(?:_(\d+))?\.layers",
lambda m: f'layers.{m.group(1) if m.group(1) else "0"}',
lambda m: f"layers.{m.group(1) if m.group(1) else '0'}",
name,
count=1,
)
@ -186,7 +186,7 @@ def _convert_model(
hf_model.load_state_dict(state_dict, strict=True)
n_params = param_count(hf_model)

logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params")

hf_model.eval()
hf_model.to(device)
@ -719,9 +719,9 @@ def load_tf_weights_in_mt5(model, config, tf_checkpoint_path):
logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
array = np.transpose(array)
try:
assert (
pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
assert pointer.shape == array.shape, (
f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
)
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
@ -65,13 +65,13 @@ def get_args():
"--hf_input_path",
type=str,
default=None,
help="A HF model path, " "e.g. a folder containing https://huggingface.co/nvidia/Minitron-8B-Base",
help="A HF model path, e.g. a folder containing https://huggingface.co/nvidia/Minitron-8B-Base",
)
parser.add_argument(
"--hf_output_path",
type=str,
default=None,
help="Output HF model path, " "with the same format as above but user's own weights",
help="Output HF model path, with the same format as above but user's own weights",
)
parser.add_argument(
"--precision",
@ -82,7 +82,7 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weig
remove_ignore_keys_(expert_state)
expert_state = rename_fairseq_keys(expert_state, expert)
save_path = os.path.join(
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin")
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts) + 1:05d}-of-???.bin")
)
torch.save(expert_state, save_path)
sharded_state_dicts.append(expert_state.keys())

@ -91,7 +91,9 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weig
)

# Add the last block
save_path = os.path.join(dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts)+1:05d}-of-???.bin"))
save_path = os.path.join(
dump_path, weights_name.replace(".bin", f"-{len(sharded_state_dicts) + 1:05d}-of-???.bin")
)
shared_weights = torch.load(switch_checkpoint_path + "-shared.pt")["model"]
remove_ignore_keys_(shared_weights)
shared_weights = rename_fairseq_keys(shared_weights, None)

@ -108,8 +110,8 @@ def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weig
# Otherwise, let's build the index
weight_map = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
temp_filename = os.path.join(dump_path, weights_name.replace(".bin", f"-{idx+1:05d}-of-???.bin"))
shard_file = weights_name.replace(".bin", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.bin")
temp_filename = os.path.join(dump_path, weights_name.replace(".bin", f"-{idx + 1:05d}-of-???.bin"))
os.rename(temp_filename, os.path.join(dump_path, shard_file))
for key in shard:
weight_map[key] = shard_file
@ -1352,7 +1352,7 @@ class NllbMoeDecoder(NllbMoePreTrainedModel):
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
@ -394,11 +394,11 @@ class OriginalOneFormerCheckpointToOursConverter:
[
(
f"{src_prefix}.norm{layer_idx}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.weight",
),
(
f"{src_prefix}.norm{layer_idx}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.bias",
),
]
)

@ -531,11 +531,11 @@ class OriginalOneFormerCheckpointToOursConverter:
[
(
f"{src_prefix}.norm{layer_idx}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.weight",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.weight",
),
(
f"{src_prefix}.norm{layer_idx}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx+1}.bias",
f"{dst_prefix}.hidden_states_norms.stage{layer_idx + 1}.bias",
),
]
)

@ -1010,9 +1010,9 @@ def test(
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-3
), "The backbone features are not the same."
assert torch.allclose(original_model_feature, our_model_feature, atol=3e-3), (
"The backbone features are not the same."
)
mask_features, _, multi_scale_features, _, _ = original_model.sem_seg_head.pixel_decoder.forward_features(
original_model_backbone_features
)

@ -1025,9 +1025,9 @@ def test(
for original_model_feature, our_model_feature in zip(
original_pixel_decoder_features, our_model_output.pixel_decoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-4
), "The pixel decoder feature are not the same"
assert torch.allclose(original_model_feature, our_model_feature, atol=3e-4), (
"The pixel decoder feature are not the same"
)

tr_complete = T.Compose(
[

@ -1049,9 +1049,9 @@ def test(

our_segmentation = post_process_sem_seg_output(our_model_out, target_size=(640, 640))[0]

assert torch.allclose(
original_segmentation, our_segmentation, atol=1e-3
), "The segmentation image is not the same."
assert torch.allclose(original_segmentation, our_segmentation, atol=1e-3), (
"The segmentation image is not the same."
)

logger.info("✅ Test passed!")
@ -62,9 +62,9 @@ class TFAttention(keras.layers.Layer):

n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
assert n_state % config.n_head == 0, (
f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
)
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
@ -173,7 +173,7 @@ def _preprocess_resize_output_shape(image, output_shape):
# multichannel case: append shape of last axis
output_shape = output_shape + (image.shape[-1],)
elif output_ndim < image.ndim:
raise ValueError("output_shape length cannot be smaller than the " "image number of dimensions")
raise ValueError("output_shape length cannot be smaller than the image number of dimensions")

return image, output_shape

@ -345,10 +345,10 @@ class Owlv2ImageProcessor(BaseImageProcessor):
else:
anti_aliasing_sigma = np.atleast_1d(anti_aliasing_sigma) * np.ones_like(factors)
if np.any(anti_aliasing_sigma < 0):
raise ValueError("Anti-aliasing standard deviation must be " "greater than or equal to zero")
raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero")
elif np.any((anti_aliasing_sigma > 0) & (factors <= 1)):
warnings.warn(
"Anti-aliasing standard deviation greater than zero but " "not down-sampling along all axes"
"Anti-aliasing standard deviation greater than zero but not down-sampling along all axes"
)
filtered = ndi.gaussian_filter(image, anti_aliasing_sigma, cval=cval, mode=ndi_mode)
else:
@ -118,9 +118,9 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
is_key_init = True
break
elif attribute == "position_embeddings":
assert (
model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1]
), "Hidden size has to match"
assert model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1], (
"Hidden size has to match"
)
assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings."
model.position_embeddings.weight = nn.Parameter(old_model.embed_positions.weight[:512, :])
is_key_init = True
@ -588,9 +588,9 @@ class ProphetNetPositionalEmbeddings(nn.Embedding):
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)

def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
assert (position_ids is None) or (
self.padding_idx is None
), "If position_ids is pre-computed then padding_idx should not be set."
assert (position_ids is None) or (self.padding_idx is None), (
"If position_ids is pre-computed then padding_idx should not be set."
)

if position_ids is None:
if past_key_values is not None:

@ -784,9 +784,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
self.head_dim = config.hidden_size // self.num_attn_heads
self.ngram = config.ngram

assert (
self.head_dim * self.num_attn_heads == config.hidden_size
), "config.hidden_size must be divisible by num_attn_heads"
assert self.head_dim * self.num_attn_heads == config.hidden_size, (
"config.hidden_size must be divisible by num_attn_heads"
)
# key, value, query projection
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)

@ -1041,9 +1041,9 @@ class ProphetNetNgramSelfAttention(nn.Module):

if predict_relative_position_buckets is None:
key_sequence_length = attn_weights.shape[-1]
assert (
position_ids[0][0] == key_sequence_length - 1
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
assert position_ids[0][0] == key_sequence_length - 1, (
"`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
)
relative_positions = (
torch.arange(0, key_sequence_length)
.unsqueeze(0)

@ -1313,9 +1313,9 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
assert head_mask.size()[0] == (
len(self.layers)
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
assert head_mask.size()[0] == (len(self.layers)), (
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
)
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_hidden_states = encoder_hidden_states + (hidden_states,)

@ -1488,9 +1488,9 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):

# prepare attention mask
if past_key_values is not None:
assert (
hidden_states.size(1) == 1
), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
assert hidden_states.size(1) == 1, (
"At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
)

ngram_hidden_states = [
(ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
@ -162,7 +162,7 @@ def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
elif pvt_size == "large":
config_path = "Zetatech/pvt-large-224"
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but '{pvt_size}' was given")
config = PvtConfig(name_or_path=config_path)
# load original model from https://github.com/whai362/PVT
state_dict = torch.load(pvt_checkpoint, map_location="cpu")

@ -192,7 +192,7 @@ def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
elif pvt_size == "large":
expected_slice_logits = torch.tensor([0.3740, -0.7739, -0.4214])
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but '{pvt_size}' was given")

assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4)
@ -203,8 +203,7 @@ def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folde
config_path = "OpenGVLab/pvt_v2_b5"
else:
raise ValueError(
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
f"'{pvt_v2_size}' was given"
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but '{pvt_v2_size}' was given"
)
config = PvtV2Config.from_pretrained(config_path)
# load original model from https://github.com/whai362/PVT

@ -248,9 +247,9 @@ def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folde
f"'{pvt_v2_size}' was given"
)

assert torch.allclose(
logits[0, :3], expected_slice_logits, atol=1e-4
), "ImageNet weights not converted successfully."
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4), (
"ImageNet weights not converted successfully."
)

print("ImageNet weights verified, conversion successful.")
@ -697,9 +697,9 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):

# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
assert head_mask.size()[0] == (
len(self.layers)
), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
assert head_mask.size()[0] == (len(self.layers)), (
f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
)

for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states: