diff --git a/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb b/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
index e159549a105..0c27bd02a7d 100644
--- a/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
+++ b/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
@@ -41,7 +41,7 @@
 "from scipy import sparse\n",
 "from torch import nn\n",
 "\n",
- "from transformers import *\n",
+ "from transformers import BertForQuestionAnswering\n",
 "\n",
 "\n",
 "os.chdir(\"../../\")"
@@ -307,7 +307,7 @@
 " print(f\"Skip {name}\")\n",
 " continue\n",
 "\n",
- " if type(param) == torch.Tensor:\n",
+ " if isinstance(param, torch.Tensor):\n",
 " if param.numel() == 1:\n",
 " # module scale\n",
 " # module zero_point\n",
@@ -319,13 +319,13 @@
 " param = param.detach().numpy()\n",
 " hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
 "\n",
- " elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+ " elif isinstance(param, (float, int, tuple)):\n",
 " # float - tensor _packed_params.weight.scale\n",
 " # int - tensor _packed_params.weight.zero_point\n",
 " # tuple - tensor _packed_params.weight.shape\n",
 " hf.attrs[name] = param\n",
 "\n",
- " elif type(param) == torch.dtype:\n",
+ " elif isinstance(param, torch.dtype):\n",
 " # dtype - tensor _packed_params.dtype\n",
 " hf.attrs[name] = dtype_2_str[param]\n",
 "\n",
@@ -370,7 +370,7 @@
 " # print(f\"Skip {name}\")\n",
 " # continue\n",
 "\n",
- " if type(param) == torch.Tensor:\n",
+ " if isinstance(param, torch.Tensor):\n",
 " if param.numel() == 1:\n",
 " # module scale\n",
 " # module zero_point\n",
@@ -382,13 +382,13 @@
 " param = param.detach().numpy()\n",
 " hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
 "\n",
- " elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+ " elif isinstance(param, (float, int, tuple)):\n",
 " # float - tensor _packed_params.weight.scale\n",
 " # int - tensor _packed_params.weight.zero_point\n",
 " # tuple - tensor _packed_params.weight.shape\n",
 " hf.attrs[name] = param\n",
 "\n",
- " elif type(param) == torch.dtype:\n",
+ " elif isinstance(param, torch.dtype):\n",
 " # dtype - tensor _packed_params.dtype\n",
 " hf.attrs[name] = dtype_2_str[param]\n",
 "\n",
@@ -471,10 +471,10 @@
 " assert name in reconstructed_elementary_qtz_st, name\n",
 "\n",
 "for name, param in reconstructed_elementary_qtz_st.items():\n",
- " assert type(param) == type(elementary_qtz_st[name]), name\n",
- " if type(param) == torch.Tensor:\n",
+ " assert isinstance(param, type(elementary_qtz_st[name])), name\n",
+ " if isinstance(param, torch.Tensor):\n",
 " assert torch.all(torch.eq(param, elementary_qtz_st[name])), name\n",
- " elif type(param) == np.ndarray:\n",
+ " elif isinstance(param, np.ndarray):\n",
 " assert (param == elementary_qtz_st[name]).all(), name\n",
 " else:\n",
 " assert param == elementary_qtz_st[name], name"
@@ -532,10 +532,10 @@
 " assert name in reconstructed_qtz_st, name\n",
 "\n",
 "for name, param in reconstructed_qtz_st.items():\n",
- " assert type(param) == type(qtz_st[name]), name\n",
- " if type(param) == torch.Tensor:\n",
+ " assert isinstance(param, type(qtz_st[name])), name\n",
+ " if isinstance(param, torch.Tensor):\n",
 " assert torch.all(torch.eq(param, qtz_st[name])), name\n",
- " elif type(param) == np.ndarray:\n",
+ " elif isinstance(param, np.ndarray):\n",
 " assert (param == qtz_st[name]).all(), name\n",
 " else:\n",
 " assert param == qtz_st[name], name"
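The notebook hunks above replace exact-type comparisons with isinstance(). A minimal illustrative sketch (not taken from the notebook) of the behavioural difference: isinstance() also matches subclasses and accepts a tuple of types, so one check covers the former float/int/tuple chain.

    import torch

    param = torch.nn.Parameter(torch.zeros(3))  # nn.Parameter is a subclass of torch.Tensor

    print(type(param) == torch.Tensor)      # False - the exact-type check misses the subclass
    print(isinstance(param, torch.Tensor))  # True

    scale = 0.25
    print(isinstance(scale, (float, int, tuple)))  # True - one call replaces the three == comparisons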
diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
index a9fe0d75f5c..3d726767742 100644
--- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@@ -114,7 +114,7 @@ class ASTSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index 0f642c5e8e8..b4b116bdfb0 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -270,7 +270,7 @@ class BeitSelfAttention(nn.Module):
         self.config = config
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {(config.hidden_size,)} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index 2d79c182008..d37eedea3f4 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -271,7 +271,7 @@ class FlaxBeitSelfAttention(nn.Module):
             self.config, "embedding_size"
         ):
             raise ValueError(
-                f"The hidden size {self.config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {self.config.hidden_size} is not a multiple of the number of attention "
                 f"heads {self.config.num_attention_heads}."
             )
diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py
index 1b6834a5179..c86495cbbe2 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_vision.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py
@@ -271,7 +271,7 @@ class Data2VecVisionSelfAttention(nn.Module):
         self.config = config
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {(config.hidden_size,)} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index dfb7753d6f9..66a556da818 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -186,7 +186,7 @@ class DeiTSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
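The modeling-file hunks above and below all fix the same f-string slip: a comma left inside the braces makes Python format a one-element tuple, so the error message reads (768,) instead of 768. A quick sketch with an assumed hidden size of 768 (illustration only, not code from the patch):

    hidden_size = 768

    print(f"The hidden size {hidden_size,} ...")    # The hidden size (768,) ...  (the comma builds a tuple)
    print(f"The hidden size {(hidden_size,)} ...")  # same output, the tuple written explicitly as in a few files
    print(f"The hidden size {hidden_size} ...")     # The hidden size 768 ...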
diff --git a/src/transformers/models/deprecated/tvlt/modeling_tvlt.py b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py
index 7f82aacf6e8..aab3d4ff2de 100644
--- a/src/transformers/models/deprecated/tvlt/modeling_tvlt.py
+++ b/src/transformers/models/deprecated/tvlt/modeling_tvlt.py
@@ -345,7 +345,7 @@ class TvltSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
index dca17adf2b0..922d5fab9be 100644
--- a/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
+++ b/src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
@@ -204,7 +204,7 @@ class ViTHybridSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py
index 33ec1c05499..3ba48b7026c 100644
--- a/src/transformers/models/dinov2/modeling_dinov2.py
+++ b/src/transformers/models/dinov2/modeling_dinov2.py
@@ -178,7 +178,7 @@ class Dinov2SelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
index bd9d181cdf3..dae5904b78e 100644
--- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
+++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -190,7 +190,7 @@ class Dinov2WithRegistersSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py
index a82227b4580..e4d55603e63 100755
--- a/src/transformers/models/dpt/modeling_dpt.py
+++ b/src/transformers/models/dpt/modeling_dpt.py
@@ -301,7 +301,7 @@ class DPTViTSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py
index c893938e428..94395bd2711 100644
--- a/src/transformers/models/flava/modeling_flava.py
+++ b/src/transformers/models/flava/modeling_flava.py
@@ -438,7 +438,7 @@ class FlavaSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py
index e01290b089f..7d4619480c3 100644
--- a/src/transformers/models/ijepa/modeling_ijepa.py
+++ b/src/transformers/models/ijepa/modeling_ijepa.py
@@ -194,7 +194,7 @@ class IJepaSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index 7a4f03fdf51..4665ff0f0e5 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -501,7 +501,7 @@ class LukeSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py
index 7f2a23238e5..f41da2bafaf 100755
--- a/src/transformers/models/mobilevit/modeling_mobilevit.py
+++ b/src/transformers/models/mobilevit/modeling_mobilevit.py
@@ -215,7 +215,7 @@ class MobileViTSelfAttention(nn.Module):
 
         if hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                f"The hidden size {hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py
index 9939ddcb716..76397f160b5 100644
--- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py
+++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py
@@ -262,7 +262,7 @@ class TFMobileViTSelfAttention(keras.layers.Layer):
 
         if hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                f"The hidden size {hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py
index 5eee95398b3..44f4d9a8a86 100644
--- a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py
+++ b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py
@@ -112,7 +112,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
 
             # ensure we have as much audios as audio tokens
             num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
-            num_audios = 1 if type(audios) == np.ndarray else len(audios)
+            num_audios = 1 if isinstance(audios, np.ndarray) else len(audios)
             if num_audio_tokens != num_audios:
                 raise ValueError(
                     f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py
index 6e65ebf06d9..0e51cd98868 100755
--- a/src/transformers/models/videomae/modeling_videomae.py
+++ b/src/transformers/models/videomae/modeling_videomae.py
@@ -201,7 +201,7 @@ class VideoMAESelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py
index 5ffb4b65ffb..07ed544d041 100755
--- a/src/transformers/models/vilt/modeling_vilt.py
+++ b/src/transformers/models/vilt/modeling_vilt.py
@@ -322,7 +322,7 @@ class ViltSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index b026a31d0a4..2fd430c1019 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -189,7 +189,7 @@ class ViTSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
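In the Qwen2AudioProcessor hunk above, a bare array counts as a single clip. A simplified sketch (the helper count_audios is hypothetical and only mirrors that one line) of why the isinstance() form is the safer check: an ndarray subclass such as np.memmap fails the exact-type comparison and would fall through to len(), i.e. be counted per sample.

    import numpy as np

    def count_audios(audios):
        # One bare array (of any ndarray subclass) is one clip; anything else is a sequence of clips.
        return 1 if isinstance(audios, np.ndarray) else len(audios)

    waveform = np.zeros(16000, dtype=np.float32)   # one 1-second clip at 16 kHz
    print(count_audios(waveform))                  # 1
    print(count_audios([waveform, waveform]))      # 2

    mmapped = np.memmap("clip.raw", dtype=np.float32, mode="w+", shape=(16000,))
    print(type(mmapped) == np.ndarray, isinstance(mmapped, np.ndarray))  # False True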
diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py
index 1595eb80ca3..86e71155d9c 100755
--- a/src/transformers/models/vit_mae/modeling_vit_mae.py
+++ b/src/transformers/models/vit_mae/modeling_vit_mae.py
@@ -362,7 +362,7 @@ class ViTMAESelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py
index d25611a41a6..79021a6b8b6 100644
--- a/src/transformers/models/vit_msn/modeling_vit_msn.py
+++ b/src/transformers/models/vit_msn/modeling_vit_msn.py
@@ -179,7 +179,7 @@ class ViTMSNSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
index d89f95e26b5..b4a1acd3361 100644
--- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
+++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
@@ -109,7 +109,7 @@ class VitPoseBackboneSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/vivit/modeling_vivit.py b/src/transformers/models/vivit/modeling_vivit.py
index 22877c842f9..4ef0f29bc84 100755
--- a/src/transformers/models/vivit/modeling_vivit.py
+++ b/src/transformers/models/vivit/modeling_vivit.py
@@ -172,7 +172,7 @@ class VivitSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index 729fd1b354b..5801e0bca28 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -237,7 +237,7 @@ class YolosSelfAttention(nn.Module):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
             raise ValueError(
-                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                 f"heads {config.num_attention_heads}."
             )