From 1de7dc7403b3b89ec421d43a8c9ee245211a61f6 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 26 Jun 2024 21:59:08 +0100 Subject: [PATCH] Skip tests properly (#31308) * Skip tests properly * [test_all] * Add 'reason' as kwarg for skipTest * [test_all] Fix up * [test_all] --- docs/source/de/testing.md | 16 +- docs/source/en/testing.md | 16 +- docs/source/ja/testing.md | 11 +- docs/source/ko/testing.md | 188 ++++---- src/transformers/testing_utils.py | 30 +- tests/deepspeed/test_deepspeed.py | 10 +- tests/extended/test_trainer_ext.py | 2 +- tests/generation/test_utils.py | 78 ++-- .../models/albert/test_tokenization_albert.py | 2 +- tests/models/align/test_modeling_align.py | 11 +- tests/models/altclip/test_modeling_altclip.py | 8 +- tests/models/bark/test_modeling_bark.py | 10 +- tests/models/bart/test_modeling_bart.py | 3 +- tests/models/bart/test_tokenization_bart.py | 1 + .../barthez/test_tokenization_barthez.py | 2 +- tests/models/beit/test_modeling_beit.py | 4 +- tests/models/bert/test_modeling_bert.py | 2 +- tests/models/bert/test_tokenization_bert.py | 2 +- .../models/big_bird/test_modeling_big_bird.py | 2 +- .../big_bird/test_tokenization_big_bird.py | 2 +- .../test_modeling_bigbird_pegasus.py | 9 +- tests/models/biogpt/test_modeling_biogpt.py | 2 +- .../blenderbot/test_modeling_blenderbot.py | 2 +- .../test_modeling_blenderbot_small.py | 2 +- .../models/blip/test_image_processing_blip.py | 8 +- tests/models/blip/test_modeling_blip.py | 18 +- tests/models/blip/test_modeling_blip_text.py | 2 + tests/models/blip_2/test_modeling_blip_2.py | 2 + tests/models/bloom/test_modeling_bloom.py | 2 +- tests/models/bloom/test_tokenization_bloom.py | 2 +- tests/models/byt5/test_tokenization_byt5.py | 6 +- .../camembert/test_tokenization_camembert.py | 2 +- tests/models/canine/test_modeling_canine.py | 6 +- .../models/canine/test_tokenization_canine.py | 13 +- .../test_image_processing_chinese_clip.py | 18 +- .../test_modeling_chinese_clip.py | 6 +- tests/models/clap/test_modeling_clap.py | 2 +- tests/models/clip/test_modeling_clip.py | 11 +- tests/models/clip/test_tokenization_clip.py | 3 +- tests/models/clipseg/test_modeling_clipseg.py | 13 +- tests/models/clvp/test_tokenization_clvp.py | 2 +- .../test_tokenization_code_llama.py | 15 +- .../codegen/test_tokenization_codegen.py | 4 +- .../models/cohere/test_tokenization_cohere.py | 2 +- .../test_modeling_conditional_detr.py | 2 +- .../models/convbert/test_modeling_convbert.py | 2 +- .../convnextv2/test_modeling_convnextv2.py | 4 +- tests/models/cpmant/test_modeling_cpmant.py | 2 +- .../data2vec/test_modeling_data2vec_audio.py | 11 +- .../data2vec/test_modeling_data2vec_vision.py | 6 +- tests/models/dbrx/test_modeling_dbrx.py | 8 +- .../test_tokenization_deberta_v2.py | 4 +- .../test_modeling_deformable_detr.py | 6 +- tests/models/deit/test_modeling_deit.py | 4 +- tests/models/detr/test_modeling_detr.py | 2 +- tests/models/dinat/test_modeling_dinat.py | 2 +- .../distilbert/test_modeling_distilbert.py | 2 +- .../models/donut/test_modeling_donut_swin.py | 2 +- .../electra/test_tokenization_electra.py | 2 +- tests/models/encodec/test_modeling_encodec.py | 32 +- .../test_modeling_encoder_decoder.py | 3 + tests/models/ernie/test_modeling_ernie.py | 3 +- tests/models/esm/test_modeling_esm.py | 4 +- tests/models/esm/test_modeling_esmfold.py | 40 +- tests/models/falcon/test_modeling_falcon.py | 4 +- ...test_tokenization_fastspeech2_conformer.py | 2 +- .../models/flaubert/test_modeling_flaubert.py | 2 
+- tests/models/flava/test_modeling_flava.py | 55 ++- tests/models/fnet/test_modeling_fnet.py | 1 + tests/models/fnet/test_tokenization_fnet.py | 4 +- tests/models/fsmt/test_modeling_fsmt.py | 14 +- tests/models/fsmt/test_tokenization_fsmt.py | 4 +- tests/models/fuyu/test_modeling_fuyu.py | 6 +- tests/models/gemma/test_modeling_gemma.py | 14 +- tests/models/gemma/test_tokenization_gemma.py | 19 +- tests/models/git/test_modeling_git.py | 2 + tests/models/glpn/test_modeling_glpn.py | 6 +- tests/models/gpt2/test_tokenization_gpt2.py | 7 +- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 12 +- .../test_tokenization_gpt_neox_japanese.py | 3 +- .../models/groupvit/test_modeling_groupvit.py | 6 +- .../herbert/test_tokenization_herbert.py | 2 +- tests/models/hubert/test_modeling_hubert.py | 24 +- tests/models/ibert/test_modeling_ibert.py | 2 +- .../idefics/test_image_processing_idefics.py | 8 +- tests/models/idefics/test_modeling_idefics.py | 16 +- .../models/idefics2/test_modeling_idefics2.py | 14 +- .../test_image_processing_imagegpt.py | 4 +- .../models/imagegpt/test_modeling_imagegpt.py | 10 +- .../models/informer/test_modeling_informer.py | 6 +- tests/models/jamba/test_modeling_jamba.py | 6 +- tests/models/jetmoe/test_modeling_jetmoe.py | 6 +- tests/models/kosmos2/test_modeling_kosmos2.py | 4 +- .../layoutlm/test_tokenization_layoutlm.py | 1 + .../test_image_processing_layoutlmv2.py | 2 +- .../layoutlmv2/test_modeling_layoutlmv2.py | 2 +- .../test_tokenization_layoutlmv2.py | 50 +- .../test_tokenization_layoutlmv3.py | 50 +- .../layoutxlm/test_tokenization_layoutxlm.py | 56 +-- tests/models/led/test_modeling_led.py | 2 +- tests/models/led/test_tokenization_led.py | 1 + tests/models/levit/test_modeling_levit.py | 4 +- tests/models/llama/test_modeling_llama.py | 4 +- tests/models/llama/test_tokenization_llama.py | 17 +- .../test_image_processing_llava_next.py | 4 +- .../longformer/test_modeling_longformer.py | 4 +- .../test_tokenization_longformer.py | 1 + tests/models/longt5/test_modeling_longt5.py | 8 +- tests/models/luke/test_tokenization_luke.py | 1 + tests/models/lxmert/test_modeling_lxmert.py | 6 +- .../models/lxmert/test_tokenization_lxmert.py | 2 +- tests/models/marian/test_modeling_marian.py | 9 +- .../markuplm/test_tokenization_markuplm.py | 46 +- .../mask2former/test_modeling_mask2former.py | 2 +- .../test_modeling_maskformer_swin.py | 4 +- tests/models/mbart/test_modeling_mbart.py | 4 +- tests/models/mbart/test_tokenization_mbart.py | 4 +- .../mbart50/test_tokenization_mbart50.py | 2 +- .../test_modeling_megatron_bert.py | 2 +- .../test_modeling_megatron_gpt2.py | 2 +- .../mgp_str/test_tokenization_mgp_str.py | 6 +- tests/models/mistral/test_modeling_mistral.py | 10 +- tests/models/mixtral/test_modeling_mixtral.py | 6 +- tests/models/mluke/test_tokenization_mluke.py | 1 + .../mobilebert/test_modeling_mobilebert.py | 2 +- .../test_tokenization_mobilebert.py | 2 +- tests/models/mpnet/test_modeling_mpnet.py | 2 +- tests/models/mpt/test_modeling_mpt.py | 2 +- tests/models/mra/test_modeling_mra.py | 4 +- tests/models/mt5/test_modeling_mt5.py | 4 +- .../models/musicgen/test_modeling_musicgen.py | 34 +- .../musicgen/test_processing_musicgen.py | 5 +- .../test_modeling_musicgen_melody.py | 29 +- tests/models/mvp/test_modeling_mvp.py | 2 +- tests/models/mvp/test_tokenization_mvp.py | 1 + tests/models/nllb/test_tokenization_nllb.py | 8 +- .../models/nllb_moe/test_modeling_nllb_moe.py | 4 +- .../models/nougat/test_tokenization_nougat.py | 8 +- tests/models/olmo/test_modeling_olmo.py | 4 +- 
.../test_image_processing_oneformer.py | 1 + .../oneformer/test_modeling_oneformer.py | 2 +- .../oneformer/test_processor_oneformer.py | 2 + .../models/openai/test_tokenization_openai.py | 2 +- tests/models/opt/test_modeling_opt.py | 2 +- .../owlv2/test_image_processing_owlv2.py | 2 +- tests/models/owlv2/test_modeling_owlv2.py | 4 +- tests/models/owlvit/test_modeling_owlvit.py | 4 +- .../paligemma/test_modeling_paligemma.py | 2 +- .../test_modeling_patchtsmixer.py | 2 +- tests/models/pegasus/test_modeling_pegasus.py | 2 +- .../pegasus/test_tokenization_pegasus.py | 8 - .../pegasus_x/test_modeling_pegasus_x.py | 2 +- .../perceiver/test_modeling_perceiver.py | 4 +- .../perceiver/test_tokenization_perceiver.py | 7 +- .../persimmon/test_modeling_persimmon.py | 2 +- .../test_image_processing_pix2struct.py | 8 +- .../pix2struct/test_modeling_pix2struct.py | 12 +- tests/models/plbart/test_modeling_plbart.py | 4 +- .../poolformer/test_modeling_poolformer.py | 6 +- .../test_feature_extraction_pop2piano.py | 6 +- .../pop2piano/test_tokenization_pop2piano.py | 6 +- .../prophetnet/test_modeling_prophetnet.py | 6 +- tests/models/pvt/test_modeling_pvt.py | 6 +- tests/models/pvt_v2/test_modeling_pvt_v2.py | 6 +- tests/models/qwen2/test_modeling_qwen2.py | 6 +- tests/models/qwen2/test_tokenization_qwen2.py | 8 +- .../qwen2_moe/test_modeling_qwen2_moe.py | 6 +- tests/models/rag/test_modeling_rag.py | 2 +- .../test_modeling_recurrent_gemma.py | 29 +- .../models/reformer/test_modeling_reformer.py | 22 +- .../reformer/test_tokenization_reformer.py | 4 +- .../roberta/test_tokenization_roberta.py | 1 + .../roformer/test_tokenization_roformer.py | 4 +- .../test_tokenization_seamless_m4t.py | 18 +- .../segformer/test_modeling_segformer.py | 6 +- tests/models/sew/test_modeling_sew.py | 17 +- tests/models/sew_d/test_modeling_sew_d.py | 17 +- .../siglip/test_image_processing_siglip.py | 2 +- tests/models/siglip/test_modeling_siglip.py | 4 +- .../models/siglip/test_tokenization_siglip.py | 4 +- .../test_modeling_speech_encoder_decoder.py | 6 +- .../test_modeling_speech_to_text.py | 16 +- .../models/speecht5/test_modeling_speecht5.py | 86 ++-- .../speecht5/test_tokenization_speecht5.py | 2 + .../starcoder2/test_modeling_starcoder2.py | 6 +- .../test_modeling_switch_transformers.py | 2 +- tests/models/t5/test_modeling_t5.py | 4 +- tests/models/t5/test_tokenization_t5.py | 2 +- .../test_modeling_table_transformer.py | 2 +- tests/models/tapas/test_modeling_tapas.py | 4 +- tests/models/tapas/test_tokenization_tapas.py | 32 +- .../test_modeling_time_series_transformer.py | 2 +- .../timesformer/test_modeling_timesformer.py | 2 +- .../test_modeling_timm_backbone.py | 36 +- tests/models/trocr/test_modeling_trocr.py | 10 +- tests/models/udop/test_modeling_udop.py | 2 +- tests/models/udop/test_tokenization_udop.py | 54 +-- tests/models/umt5/test_modeling_umt5.py | 4 +- .../unispeech/test_modeling_unispeech.py | 7 +- .../test_modeling_unispeech_sat.py | 22 +- .../models/videomae/test_modeling_videomae.py | 2 +- tests/models/vilt/test_modeling_vilt.py | 8 +- .../test_modeling_vision_encoder_decoder.py | 6 +- .../test_modeling_vision_text_dual_encoder.py | 2 +- tests/models/vitdet/test_modeling_vitdet.py | 8 +- tests/models/vits/test_modeling_vits.py | 10 +- tests/models/vits/test_tokenization_vits.py | 8 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 30 +- .../wav2vec2/test_tokenization_wav2vec2.py | 4 +- .../test_modeling_wav2vec2_conformer.py | 15 +- .../test_tokenization_wav2vec2_phoneme.py | 14 +- 
tests/models/wavlm/test_modeling_wavlm.py | 9 +- tests/models/whisper/test_modeling_whisper.py | 39 +- .../whisper/test_tokenization_whisper.py | 3 + tests/models/x_clip/test_modeling_x_clip.py | 6 +- tests/models/xglm/test_modeling_xglm.py | 2 +- tests/models/xglm/test_tokenization_xglm.py | 2 +- .../test_tokenization_xlm_roberta.py | 4 +- tests/models/xlnet/test_modeling_xlnet.py | 2 +- tests/models/yolos/test_modeling_yolos.py | 2 +- tests/models/yoso/test_modeling_yoso.py | 1 + .../test_pipelines_audio_classification.py | 2 +- ..._pipelines_automatic_speech_recognition.py | 8 +- tests/pipelines/test_pipelines_common.py | 2 +- .../test_pipelines_conversational.py | 439 ++++++++++++++++++ .../test_pipelines_depth_estimation.py | 4 +- ...t_pipelines_document_question_answering.py | 2 +- .../test_pipelines_feature_extraction.py | 10 +- tests/pipelines/test_pipelines_fill_mask.py | 2 +- ...test_pipelines_image_feature_extraction.py | 6 +- .../test_pipelines_image_segmentation.py | 2 +- .../test_pipelines_mask_generation.py | 4 +- .../test_pipelines_object_detection.py | 2 +- .../test_pipelines_text_generation.py | 2 +- .../test_pipelines_video_classification.py | 1 + ...est_pipelines_visual_question_answering.py | 2 +- ...ipelines_zero_shot_audio_classification.py | 4 +- ...ipelines_zero_shot_image_classification.py | 2 +- ...st_pipelines_zero_shot_object_detection.py | 4 +- tests/quantization/autoawq/test_awq.py | 2 +- tests/quantization/bnb/test_4bit.py | 2 +- tests/quantization/bnb/test_mixed_int8.py | 2 +- .../quanto_integration/test_quanto.py | 13 +- tests/test_cache_utils.py | 2 +- tests/test_image_processing_common.py | 8 +- tests/test_modeling_common.py | 240 ++++------ tests/test_modeling_utils.py | 14 +- tests/test_pipeline_mixin.py | 12 +- tests/test_tokenization_common.py | 88 ++-- tests/test_tokenization_utils.py | 2 +- tests/tokenization/test_tokenization_fast.py | 10 +- tests/trainer/test_trainer.py | 4 +- tests/utils/test_doc_samples.py | 2 +- tests/utils/test_model_output.py | 4 +- 254 files changed, 1721 insertions(+), 1298 deletions(-) create mode 100644 tests/pipelines/test_pipelines_conversational.py diff --git a/docs/source/de/testing.md b/docs/source/de/testing.md index 1d68c11c3ba..100151e58c3 100644 --- a/docs/source/de/testing.md +++ b/docs/source/de/testing.md @@ -185,16 +185,16 @@ pytest -k "test and ada" tests/test_optimization.py Manchmal müssen Sie `accelerate` Tests für Ihre Modelle ausführen. Dazu fügen Sie einfach `-m accelerate_tests` zu Ihrem Befehl hinzu, wenn Sie diese Tests bei einem `OPT`-Lauf ausführen möchten: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Dokumentationstests ausführen +### Dokumentationstests ausführen -Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. -Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: +Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. 
+Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: -```python +```python r""" Returns: @@ -217,8 +217,8 @@ Example: ``` -Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: -```bash +Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: +```bash pytest --doctest-modules ``` Wenn die Datei eine Markdown-Erweiterung hat, sollten Sie das Argument `--doctest-glob="*.md"` hinzufügen. @@ -862,7 +862,7 @@ Code, der fehlerhaft ist, einen schlechten Zustand verursacht, der sich auf ande - Hier sehen Sie, wie Sie einen ganzen Test bedingungslos überspringen können: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index 4649059872a..e26411f69e1 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -184,16 +184,16 @@ pytest -k "test and ada" tests/test_optimization.py Sometimes you need to run `accelerate` tests on your models. For that you can just add `-m accelerate_tests` to your command, if let's say you want to run these tests on `OPT` run: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Run documentation tests +### Run documentation tests -In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. -As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): +In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. +As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): -```python +```python r""" Returns: @@ -216,8 +216,8 @@ Example: ``` -Just run the following line to automatically test every docstring example in the desired file: -```bash +Just run the following line to automatically test every docstring example in the desired file: +```bash pytest --doctest-modules ``` If the file has a markdown extention, you should add the `--doctest-glob="*.md"` argument. 
@@ -881,7 +881,7 @@ code that's buggy causes some bad state that will affect other tests, do not use - Here is how to skip whole test unconditionally: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` diff --git a/docs/source/ja/testing.md b/docs/source/ja/testing.md index 00a51f13811..8831d48a3bd 100644 --- a/docs/source/ja/testing.md +++ b/docs/source/ja/testing.md @@ -171,16 +171,16 @@ pytest -k "test and ada" tests/test_optimization.py 時々、モデルに対して `accelerate` テストを実行する必要があります。たとえば、`OPT` 実行に対してこれらのテストを実行したい場合、コマンドに `-m accelerate_tests` を追加するだけで済みます: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Run documentation tests +### Run documentation tests ドキュメンテーションの例が正しいかどうかをテストするには、`doctests` が合格しているかを確認する必要があります。 例として、[`WhisperModel.forward` のドックストリング](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035)を使用しましょう。 -```python +```python r""" Returns: @@ -205,7 +205,7 @@ Example: 指定したファイル内のすべてのドックストリング例を自動的にテストするために、以下の行を実行してください: -```bash +```bash pytest --doctest-modules ``` @@ -809,7 +809,7 @@ with ExtendSysPath(f"{bindir}/.."): ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` @@ -1211,4 +1211,3 @@ cmd_that_may_fail || true - [Github Actions:](https://github.com/actions/toolkit/issues/399) - [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) - diff --git a/docs/source/ko/testing.md b/docs/source/ko/testing.md index 390a1c19baa..fd3f548eeb8 100644 --- a/docs/source/ko/testing.md +++ b/docs/source/ko/testing.md @@ -26,19 +26,19 @@ rendered properly in your Markdown viewer. ## Transformers 테스트 방법[[how-transformers-are-tested]] -1. PR이 제출되면 9개의 CircleCi 작업으로 테스트가 진행됩니다. 해당 PR에 대해 새로운 커밋이 생성될 때마다 테스트는 다시 진행됩니다. 이 작업들은 - 이 [config 파일](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml)에 정의되어 있으므로 필요하다면 +1. PR이 제출되면 9개의 CircleCi 작업으로 테스트가 진행됩니다. 해당 PR에 대해 새로운 커밋이 생성될 때마다 테스트는 다시 진행됩니다. 이 작업들은 + 이 [config 파일](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml)에 정의되어 있으므로 필요하다면 사용자의 로컬 환경에서 동일하게 재현해 볼 수 있습니다. 이 CI 작업은 `@slow` 테스트를 실행하지 않습니다. 2. [github actions](https://github.com/huggingface/transformers/actions)에 의해 실행되는 작업은 3개입니다: - - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): torch hub integration이 작동하는지 확인합니다. - - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): `main` 브랜치에서 커밋이 업데이트된 경우에만 GPU를 이용한 빠른 테스트를 실행합니다. - 이는 `src`, `tests`, `.github` 폴더 중 하나에 코드가 업데이트된 경우에만 실행됩니다. + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): `main` 브랜치에서 커밋이 업데이트된 경우에만 GPU를 이용한 빠른 테스트를 실행합니다. + 이는 `src`, `tests`, `.github` 폴더 중 하나에 코드가 업데이트된 경우에만 실행됩니다. (model card, notebook, 기타 등등을 추가한 경우 실행되지 않도록 하기 위해서입니다) - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): `tests` 및 `examples`에서 @@ -61,7 +61,7 @@ RUN_SLOW=1 pytest examples/ ### 실행할 테스트 선택[[choosing-which-tests-to-run]] -이 문서는 테스트를 실행하는 다양한 방법에 대해 자세히 설명합니다. 
+이 문서는 테스트를 실행하는 다양한 방법에 대해 자세히 설명합니다. 모든 내용을 읽은 후에도, 더 자세한 내용이 필요하다면 [여기](https://docs.pytest.org/en/latest/usage.html)에서 확인할 수 있습니다. 다음은 가장 유용한 테스트 실행 방법 몇 가지입니다. @@ -186,7 +186,7 @@ pytest -k "test and ada" tests/test_optimization.py 모델에서 `accelerate` 테스트를 실행해야 할 때가 있습니다. 이를 위해서는 명령어에 `-m accelerate_tests`를 추가하면 됩니다. 예를 들어, `OPT`에서 이러한 테스트를 실행하려면 다음과 같습니다: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` ### 문서 테스트 실행[[run-documentation-tests]] @@ -194,7 +194,7 @@ RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py 예시 문서가 올바른지 테스트하려면 `doctests`가 통과하는지 확인해야 합니다. 예를 들어, [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035)를 사용해 봅시다: -```python +```python r""" Returns: @@ -218,7 +218,7 @@ Example: ``` 원하는 파일의 모든 docstring 예제를 자동으로 테스트하려면 다음 명령을 실행하면 됩니다: -```bash +```bash pytest --doctest-modules ``` 파일의 확장자가 markdown인 경우 `--doctest-glob="*.md"` 인수를 추가해야 합니다. @@ -240,9 +240,9 @@ pytest --picked ### 소스 수정 시 실패한 테스트 자동 재실행[[automatically-rerun-failed-tests-on-source-modification]] -[pytest-xdist](https://github.com/pytest-dev/pytest-xdist)는 모든 실패한 테스트를 감지하고, +[pytest-xdist](https://github.com/pytest-dev/pytest-xdist)는 모든 실패한 테스트를 감지하고, 파일을 수정한 후에 파일을 계속 재실행하여 테스트가 성공할 때까지 기다리는 매우 유용한 기능을 제공합니다. -따라서 수정한 내용을 확인한 후 pytest를 다시 시작할 필요가 없습니다. +따라서 수정한 내용을 확인한 후 pytest를 다시 시작할 필요가 없습니다. 모든 테스트가 통과될 때까지 이 과정을 반복한 후 다시 전체 실행이 이루어집니다. ```bash @@ -252,7 +252,7 @@ pip install pytest-xdist 재귀적 모드의 사용: `pytest -f` 또는 `pytest --looponfail` 파일의 변경 사항은 `looponfailroots` 루트 디렉터리와 해당 내용을 (재귀적으로) 확인하여 감지됩니다. -이 값의 기본값이 작동하지 않는 경우, +이 값의 기본값이 작동하지 않는 경우, `setup.cfg`의 설정 옵션을 변경하여 프로젝트에서 변경할 수 있습니다: ```ini @@ -275,7 +275,7 @@ looponfailroots = transformers tests ### 특정 테스트 모듈 건너뛰기[[skip-a-test-module]] -모든 테스트 모듈을 실행하되 특정 모듈을 제외하려면, 실행할 테스트 목록을 명시적으로 지정할 수 있습니다. +모든 테스트 모듈을 실행하되 특정 모듈을 제외하려면, 실행할 테스트 목록을 명시적으로 지정할 수 있습니다. 예를 들어, `test_modeling_*.py` 테스트를 제외한 모든 테스트를 실행하려면 다음을 사용할 수 있습니다: ```bash @@ -292,19 +292,19 @@ pytest --cache-clear tests ### 테스트를 병렬로 실행[[running-tests-in-parallel]] -이전에 언급한 것처럼 `make test`는 테스트를 병렬로 실행하기 위해 +이전에 언급한 것처럼 `make test`는 테스트를 병렬로 실행하기 위해 `pytest-xdist` 플러그인(`-n X` 인수, 예를 들어 `-n 2`를 사용하여 2개의 병렬 작업 실행)을 통해 실행됩니다. -`pytest-xdist`의 `--dist=` 옵션을 사용하여 테스트를 어떻게 그룹화할지 제어할 수 있습니다. +`pytest-xdist`의 `--dist=` 옵션을 사용하여 테스트를 어떻게 그룹화할지 제어할 수 있습니다. `--dist=loadfile`은 하나의 파일에 있는 테스트를 동일한 프로세스로 그룹화합니다. 실행된 테스트의 순서가 다르고 예측할 수 없기 때문에, `pytest-xdist`로 테스트 스위트를 실행하면 실패가 발생할 수 있습니다 (검출되지 않은 결합된 테스트가 있는 경우). -이 경우 [pytest-replay](https://github.com/ESSS/pytest-replay)를 사용하면 동일한 순서로 테스트를 다시 실행해서 +이 경우 [pytest-replay](https://github.com/ESSS/pytest-replay)를 사용하면 동일한 순서로 테스트를 다시 실행해서 실패하는 시퀀스를 최소화하는 데에 도움이 됩니다. ### 테스트 순서와 반복[[test-order-and-repetition]] -잠재적인 종속성 및 상태 관련 버그(tear down)를 감지하기 위해 +잠재적인 종속성 및 상태 관련 버그(tear down)를 감지하기 위해 테스트를 여러 번, 연속으로, 무작위로 또는 세트로 반복하는 것이 좋습니다. 그리고 직접적인 여러 번의 반복은 DL의 무작위성에 의해 발견되는 일부 문제를 감지하는 데에도 유용합니다. @@ -341,10 +341,10 @@ pytest --flake-finder --flake-runs=5 tests/test_failing_test.py pip install pytest-random-order ``` -중요: `pytest-random-order`가 설치되면 테스트가 자동으로 임의의 순서로 섞입니다. +중요: `pytest-random-order`가 설치되면 테스트가 자동으로 임의의 순서로 섞입니다. 구성 변경이나 커맨드 라인 옵션이 필요하지 않습니다. -앞서 설명한 것처럼 이를 통해 한 테스트의 상태가 다른 테스트의 상태에 영향을 미치는 결합된 테스트를 감지할 수 있습니다. +앞서 설명한 것처럼 이를 통해 한 테스트의 상태가 다른 테스트의 상태에 영향을 미치는 결합된 테스트를 감지할 수 있습니다. 
`pytest-random-order`가 설치되면 해당 세션에서 사용된 랜덤 시드가 출력되며 예를 들어 다음과 같습니다: ```bash @@ -364,7 +364,7 @@ Using --random-order-seed=573663 ``` 정확히 동일한 테스트 목록(또는 목록이 없음)을 사용하는 경우에만 정확한 순서를 재현합니다. -목록을 수동으로 좁히기 시작하면 더 이상 시드에 의존할 수 없고 실패했던 정확한 순서로 수동으로 목록을 나열해야합니다. 그리고 `--random-order-bucket=none`을 사용하여 pytest에게 순서를 임의로 설정하지 않도록 알려야 합니다. +목록을 수동으로 좁히기 시작하면 더 이상 시드에 의존할 수 없고 실패했던 정확한 순서로 수동으로 목록을 나열해야합니다. 그리고 `--random-order-bucket=none`을 사용하여 pytest에게 순서를 임의로 설정하지 않도록 알려야 합니다. 예를 들어 다음과 같습니다: ```bash @@ -377,19 +377,19 @@ pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.p pytest --random-order-bucket=none ``` -기본적으로 `--random-order-bucket=module`이 내재되어 있으므로, 모듈 수준에서 파일을 섞습니다. +기본적으로 `--random-order-bucket=module`이 내재되어 있으므로, 모듈 수준에서 파일을 섞습니다. 또한 `class`, `package`, `global` 및 `none` 수준에서도 섞을 수 있습니다. 자세한 내용은 해당 [문서](https://github.com/jbasko/pytest-random-order)를 참조하세요. 또 다른 무작위화의 대안은 [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly)입니다. -이 모듈은 매우 유사한 기능/인터페이스를 가지고 있지만, `pytest-random-order`에 있는 버킷 모드를 사용할 수는 없습니다. +이 모듈은 매우 유사한 기능/인터페이스를 가지고 있지만, `pytest-random-order`에 있는 버킷 모드를 사용할 수는 없습니다. 설치 후에는 자동으로 적용되는 문제도 동일하게 가집니다. ### 외관과 느낌을 변경[[look-and-feel-variations] #### pytest-sugar 사용[[pytest-sugar]] -[pytest-sugar](https://github.com/Frozenball/pytest-sugar)는 테스트가 보여지는 형태를 개선하고, +[pytest-sugar](https://github.com/Frozenball/pytest-sugar)는 테스트가 보여지는 형태를 개선하고, 진행 상황 바를 추가하며, 실패한 테스트와 검증을 즉시 표시하는 플러그인입니다. 설치하면 자동으로 활성화됩니다. ```bash @@ -416,7 +416,7 @@ pytest --pspec tests/test_optimization.py #### 실패한 테스트 즉시 표시[[instantly-shows-failed-tests]] -[pytest-instafail](https://github.com/pytest-dev/pytest-instafail)은 테스트 세션의 끝까지 기다리지 않고 +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail)은 테스트 세션의 끝까지 기다리지 않고 실패 및 오류를 즉시 표시합니다. ```bash @@ -435,7 +435,7 @@ GPU가 활성화된 환경에서, CPU 전용 모드로 테스트하려면 `CUDA_ CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py ``` -또는 다중 GPU가 있는 경우 `pytest`에서 사용할 GPU를 지정할 수도 있습니다. +또는 다중 GPU가 있는 경우 `pytest`에서 사용할 GPU를 지정할 수도 있습니다. 예를 들어, GPU `0` 및 `1`이 있는 경우 다음을 실행할 수 있습니다: ```bash @@ -444,7 +444,7 @@ CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py 이렇게 하면 다른 GPU에서 다른 작업을 실행하려는 경우 유용합니다. -일부 테스트는 반드시 CPU 전용으로 실행해야 하며, 일부는 CPU 또는 GPU 또는 TPU에서 실행해야 하고, 일부는 여러 GPU에서 실행해야 합니다. +일부 테스트는 반드시 CPU 전용으로 실행해야 하며, 일부는 CPU 또는 GPU 또는 TPU에서 실행해야 하고, 일부는 여러 GPU에서 실행해야 합니다. 다음 스킵 데코레이터는 테스트의 요구 사항을 CPU/GPU/TPU별로 설정하는 데 사용됩니다: - `require_torch` - 이 테스트는 torch에서만 실행됩니다. @@ -480,7 +480,7 @@ def test_example_with_multi_gpu(): def test_tf_thing_with_tensorflow(): ``` -이러한 데코레이터는 중첩될 수 있습니다. +이러한 데코레이터는 중첩될 수 있습니다. 예를 들어, 느린 테스트로 진행되고 pytorch에서 적어도 하나의 GPU가 필요한 경우 다음과 같이 설정할 수 있습니다: ```python no-style @@ -489,7 +489,7 @@ def test_tf_thing_with_tensorflow(): def test_example_slow_on_gpu(): ``` -`@parametrized`와 같은 일부 데코레이터는 테스트 이름을 다시 작성하기 때문에 `@require_*` 스킵 데코레이터는 올바르게 작동하려면 항상 맨 마지막에 나열되어야 합니다. +`@parametrized`와 같은 일부 데코레이터는 테스트 이름을 다시 작성하기 때문에 `@require_*` 스킵 데코레이터는 올바르게 작동하려면 항상 맨 마지막에 나열되어야 합니다. 다음은 올바른 사용 예입니다: ```python no-style @@ -498,7 +498,7 @@ def test_example_slow_on_gpu(): def test_integration_foo(): ``` -`@pytest.mark.parametrize`에는 이러한 순서 문제는 없으므로 처음 혹은 마지막에 위치시킬 수 있고 이러한 경우에도 잘 작동할 것입니다. +`@pytest.mark.parametrize`에는 이러한 순서 문제는 없으므로 처음 혹은 마지막에 위치시킬 수 있고 이러한 경우에도 잘 작동할 것입니다. 하지만 unittest가 아닌 경우에만 작동합니다. 테스트 내부에서 다음을 사용할 수 있습니다: @@ -513,7 +513,7 @@ n_gpu = get_gpu_count() #torch와 tf와 함께 작동 ### 분산 훈련[[distributed-training]] -`pytest`는 분산 훈련을 직접적으로 다루지 못합니다. +`pytest`는 분산 훈련을 직접적으로 다루지 못합니다. 
이를 시도하면 하위 프로세스가 올바른 작업을 수행하지 않고 `pytest`라고 생각하기에 테스트 스위트를 반복해서 실행하게 됩니다. 그러나 일반 프로세스를 생성한 다음 여러 워커를 생성하고 IO 파이프를 관리하도록 하면 동작합니다. @@ -532,7 +532,7 @@ CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py ### 출력 캡처[[output-capture]] -테스트 실행 중 `stdout` 및 `stderr`로 전송된 모든 출력이 캡처됩니다. +테스트 실행 중 `stdout` 및 `stderr`로 전송된 모든 출력이 캡처됩니다. 테스트나 설정 메소드가 실패하면 캡처된 출력은 일반적으로 실패 추적 정보와 함께 표시됩니다. 출력 캡처를 비활성화하고 `stdout` 및 `stderr`를 정상적으로 받으려면 `-s` 또는 `--capture=no`를 사용하세요: @@ -563,7 +563,7 @@ pytest --color=no tests/utils/test_logging.py pytest --pastebin=failed tests/utils/test_logging.py ``` -이렇게 하면 각 실패에 대한 URL을 제공하는 remote Paste service에 테스트 실행 정보를 제출합니다. +이렇게 하면 각 실패에 대한 URL을 제공하는 remote Paste service에 테스트 실행 정보를 제출합니다. 일반적인 테스트를 선택할 수도 있고 혹은 특정 실패만 보내려면 `-x`와 같이 추가할 수도 있습니다. 전체 테스트 세션 로그에 대한 URL을 생성합니다: @@ -574,17 +574,17 @@ pytest --pastebin=all tests/utils/test_logging.py ## 테스트 작성[[writing-tests]] -🤗 transformers 테스트는 대부분 `unittest`를 기반으로 하지만, +🤗 transformers 테스트는 대부분 `unittest`를 기반으로 하지만, `pytest`에서 실행되므로 대부분의 경우 두 시스템의 기능을 사용할 수 있습니다. -지원되는 기능에 대해 [여기](https://docs.pytest.org/en/stable/unittest.html)에서 확인할 수 있지만, +지원되는 기능에 대해 [여기](https://docs.pytest.org/en/stable/unittest.html)에서 확인할 수 있지만, 기억해야 할 중요한 점은 대부분의 `pytest` fixture가 작동하지 않는다는 것입니다. 파라미터화도 작동하지 않지만, 우리는 비슷한 방식으로 작동하는 `parameterized` 모듈을 사용합니다. ### 매개변수화[[parametrization]] -동일한 테스트를 다른 인수로 여러 번 실행해야 하는 경우가 종종 있습니다. +동일한 테스트를 다른 인수로 여러 번 실행해야 하는 경우가 종종 있습니다. 테스트 내에서 이 작업을 수행할 수 있지만, 그렇게 하면 하나의 인수 세트에 대해 테스트를 실행할 수 없습니다. ```python @@ -605,7 +605,7 @@ class TestMathUnitTest(unittest.TestCase): assert_equal(math.floor(input), expected) ``` -이제 기본적으로 이 테스트는 `test_floor`의 마지막 3개 인수가 +이제 기본적으로 이 테스트는 `test_floor`의 마지막 3개 인수가 매개변수 목록의 해당 인수에 할당되는 것으로 3번 실행될 것입니다. 그리고 `negative` 및 `integer` 매개변수 집합만 실행하려면 다음과 같이 실행할 수 있습니다: @@ -620,7 +620,7 @@ pytest -k "negative and integer" tests/test_mytest.py pytest -k "not negative" tests/test_mytest.py ``` -앞에서 언급한 `-k` 필터를 사용하는 것 외에도, +앞에서 언급한 `-k` 필터를 사용하는 것 외에도, 각 서브 테스트의 정확한 이름을 확인한 후에 일부 혹은 전체 서브 테스트를 실행할 수 있습니다. ```bash @@ -641,10 +641,10 @@ test_this1.py::TestMathUnitTest::test_floor_2_large_fraction pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer ``` -`transformers`의 개발자 종속성에 이미 있는 [parameterized](https://pypi.org/project/parameterized/) 모듈은 +`transformers`의 개발자 종속성에 이미 있는 [parameterized](https://pypi.org/project/parameterized/) 모듈은 `unittests`와 `pytest` 테스트 모두에서 작동합니다. -그러나 테스트가 `unittest`가 아닌 경우 `pytest.mark.parametrize`를 사용할 수 있습니다(이미 있는 일부 테스트에서 사용되는 경우도 있습니다. +그러나 테스트가 `unittest`가 아닌 경우 `pytest.mark.parametrize`를 사용할 수 있습니다(이미 있는 일부 테스트에서 사용되는 경우도 있습니다. 주로 `examples` 하위에 있습니다). 다음은 `pytest`의 `parametrize` 마커를 사용한 동일한 예입니다: @@ -666,8 +666,8 @@ def test_floor(name, input, expected): assert_equal(math.floor(input), expected) ``` -`parameterized`와 마찬가지로 `pytest.mark.parametrize`를 사용하면 -`-k` 필터가 작동하지 않는 경우에도 실행할 서브 테스트를 정확하게 지정할 수 있습니다. +`parameterized`와 마찬가지로 `pytest.mark.parametrize`를 사용하면 +`-k` 필터가 작동하지 않는 경우에도 실행할 서브 테스트를 정확하게 지정할 수 있습니다. 단, 이 매개변수화 함수는 서브 테스트의 이름 집합을 약간 다르게 생성합니다. 다음과 같은 모습입니다: ```bash @@ -694,7 +694,7 @@ pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[i ### 파일 및 디렉터리[[files-and-directories]] -테스트에서 종종 현재 테스트 파일과 관련된 상대적인 위치를 알아야 하는 경우가 있습니다. +테스트에서 종종 현재 테스트 파일과 관련된 상대적인 위치를 알아야 하는 경우가 있습니다. 테스트가 여러 디렉터리에서 호출되거나 깊이가 다른 하위 디렉터리에 있을 수 있기 때문에 그 위치를 아는 것은 간단하지 않습니다. 
`transformers.test_utils.TestCasePlus`라는 헬퍼 클래스는 모든 기본 경로를 처리하고 간단한 액세서를 제공하여 이 문제를 해결합니다: @@ -717,7 +717,7 @@ pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[i - `repo_root_dir_str` - `src_dir_str` -위의 내용을 사용하려면 테스트가 'transformers.test_utils.TestCasePlus'의 서브클래스에 있는지 확인해야 합니다. +위의 내용을 사용하려면 테스트가 'transformers.test_utils.TestCasePlus'의 서브클래스에 있는지 확인해야 합니다. 예를 들어 다음과 같습니다: ```python @@ -729,7 +729,7 @@ class PathExampleTest(TestCasePlus): data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" ``` -만약 `pathlib`를 통해 경로를 조작할 필요가 없거나 경로를 문자열로만 필요로 하는 경우에는 `pathlib` 객체에 `str()`을 호출하거나 `_str`로 끝나는 접근자를 사용할 수 있습니다. +만약 `pathlib`를 통해 경로를 조작할 필요가 없거나 경로를 문자열로만 필요로 하는 경우에는 `pathlib` 객체에 `str()`을 호출하거나 `_str`로 끝나는 접근자를 사용할 수 있습니다. 예를 들어 다음과 같습니다: ```python @@ -743,14 +743,14 @@ class PathExampleTest(TestCasePlus): ### 임시 파일 및 디렉터리[[temporary-files-and-directories]] -고유한 임시 파일 및 디렉터리를 사용하는 것은 병렬 테스트 실행에 있어 필수적입니다. -이렇게 함으로써 테스트들이 서로의 데이터를 덮어쓰지 않게 할 수 있습니다. 또한 우리는 생성된 테스트의 종료 단계에서 이러한 임시 파일 및 디렉터리를 제거하고 싶습니다. +고유한 임시 파일 및 디렉터리를 사용하는 것은 병렬 테스트 실행에 있어 필수적입니다. +이렇게 함으로써 테스트들이 서로의 데이터를 덮어쓰지 않게 할 수 있습니다. 또한 우리는 생성된 테스트의 종료 단계에서 이러한 임시 파일 및 디렉터리를 제거하고 싶습니다. 따라서 이러한 요구 사항을 충족시켜주는 `tempfile`과 같은 패키지를 사용하는 것이 중요합니다. -그러나 테스트를 디버깅할 때는 임시 파일이나 디렉터리에 들어가는 내용을 확인할 수 있어야 하며, +그러나 테스트를 디버깅할 때는 임시 파일이나 디렉터리에 들어가는 내용을 확인할 수 있어야 하며, 재실행되는 각 테스트마다 임시 파일이나 디렉터리의 경로에 대해 무작위 값이 아닌 정확한 값을 알고 싶을 것입니다. -`transformers.test_utils.TestCasePlus`라는 도우미 클래스는 이러한 목적에 가장 적합합니다. +`transformers.test_utils.TestCasePlus`라는 도우미 클래스는 이러한 목적에 가장 적합합니다. 이 클래스는 `unittest.TestCase`의 하위 클래스이므로, 우리는 이것을 테스트 모듈에서 쉽게 상속할 수 있습니다. 다음은 해당 클래스를 사용하는 예시입니다: @@ -773,7 +773,7 @@ def test_whatever(self): tmp_dir = self.get_auto_remove_tmp_dir() ``` -`tmp_dir`에는 생성된 임시 디렉터리의 경로가 포함됩니다. +`tmp_dir`에는 생성된 임시 디렉터리의 경로가 포함됩니다. 이는 테스트의 종료 단계에서 자동으로 제거됩니다. - 선택한 경로로 임시 디렉터리 생성 후에 테스트 시작 전에 비어 있는 상태인지 확인하고, 테스트 후에는 비우지 마세요. @@ -783,10 +783,10 @@ def test_whatever(self): tmp_dir = self.get_auto_remove_tmp_dir("./xxx") ``` -이것은 디버깅할 때 특정 디렉터리를 모니터링하고, +이것은 디버깅할 때 특정 디렉터리를 모니터링하고, 그 디렉터리에 이전에 실행된 테스트가 데이터를 남기지 않도록 하는 데에 유용합니다. -- `before` 및 `after` 인수를 직접 오버라이딩하여 기본 동작을 변경할 수 있으며 +- `before` 및 `after` 인수를 직접 오버라이딩하여 기본 동작을 변경할 수 있으며 다음 중 하나의 동작으로 이어집니다: - `before=True`: 테스트 시작 시 임시 디렉터리가 항상 지워집니다. @@ -804,7 +804,7 @@ def test_whatever(self): -각 테스트는 여러 개의 임시 디렉터리를 등록할 수 있으며, +각 테스트는 여러 개의 임시 디렉터리를 등록할 수 있으며, 별도로 요청하지 않는 한 모두 자동으로 제거됩니다. @@ -826,17 +826,17 @@ with ExtendSysPath(f"{bindir}/.."): ### 테스트 건너뛰기[[skipping-tests]] -이것은 버그가 발견되어 새로운 테스트가 작성되었지만 아직 그 버그가 수정되지 않은 경우에 유용합니다. +이것은 버그가 발견되어 새로운 테스트가 작성되었지만 아직 그 버그가 수정되지 않은 경우에 유용합니다. 이 테스트를 주 저장소에 커밋하려면 `make test` 중에 건너뛰도록 해야 합니다. 방법: -- **skip**은 테스트가 일부 조건이 충족될 경우에만 통과될 것으로 예상되고, 그렇지 않으면 pytest가 전체 테스트를 건너뛰어야 함을 의미합니다. -일반적인 예로는 Windows가 아닌 플랫폼에서 Windows 전용 테스트를 건너뛰거나 +- **skip**은 테스트가 일부 조건이 충족될 경우에만 통과될 것으로 예상되고, 그렇지 않으면 pytest가 전체 테스트를 건너뛰어야 함을 의미합니다. +일반적인 예로는 Windows가 아닌 플랫폼에서 Windows 전용 테스트를 건너뛰거나 외부 리소스(예를 들어 데이터베이스)에 의존하는 테스트를 건너뛰는 것이 있습니다. -- **xfail**은 테스트가 특정한 이유로 인해 실패할 것으로 예상하는 것을 의미합니다. -일반적인 예로는 아직 구현되지 않은 기능이나 아직 수정되지 않은 버그의 테스트가 있습니다. +- **xfail**은 테스트가 특정한 이유로 인해 실패할 것으로 예상하는 것을 의미합니다. +일반적인 예로는 아직 구현되지 않은 기능이나 아직 수정되지 않은 버그의 테스트가 있습니다. `xfail`로 표시된 테스트가 예상대로 실패하지 않고 통과된 경우, 이것은 xpass이며 테스트 결과 요약에 기록됩니다. 두 가지 중요한 차이점 중 하나는 `skip`은 테스트를 실행하지 않지만 `xfail`은 실행한다는 것입니다. 
@@ -847,7 +847,7 @@ with ExtendSysPath(f"{bindir}/.."): - 전체 테스트를 무조건 건너뛰려면 다음과 같이 할 수 있습니다: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` @@ -920,7 +920,7 @@ class TestClass(): ### 느린 테스트[[slow-tests]] -테스트 라이브러리는 지속적으로 확장되고 있으며, 일부 테스트는 실행하는 데 몇 분이 걸립니다. +테스트 라이브러리는 지속적으로 확장되고 있으며, 일부 테스트는 실행하는 데 몇 분이 걸립니다. 그리고 우리에게는 테스트 스위트가 CI를 통해 완료되기까지 한 시간을 기다릴 여유가 없습니다. 따라서 필수 테스트를 위한 일부 예외를 제외하고 느린 테스트는 다음과 같이 표시해야 합니다. @@ -936,7 +936,7 @@ def test_integration_foo(): RUN_SLOW=1 pytest tests ``` -`@parameterized`와 같은 몇 가지 데코레이터는 테스트 이름을 다시 작성합니다. +`@parameterized`와 같은 몇 가지 데코레이터는 테스트 이름을 다시 작성합니다. 그러므로 `@slow`와 나머지 건너뛰기 데코레이터 `@require_*`가 올바르게 작동되려면 마지막에 나열되어야 합니다. 다음은 올바른 사용 예입니다. ```python no-style @@ -945,25 +945,25 @@ RUN_SLOW=1 pytest tests def test_integration_foo(): ``` -이 문서의 초반부에 설명된 것처럼 느린 테스트는 PR의 CI 확인이 아닌 예약된 일정 기반으로 실행됩니다. +이 문서의 초반부에 설명된 것처럼 느린 테스트는 PR의 CI 확인이 아닌 예약된 일정 기반으로 실행됩니다. 따라서 PR 제출 중에 일부 문제를 놓친 채로 병합될 수 있습니다. -이러한 문제들은 다음번의 예정된 CI 작업 중에 감지됩니다. +이러한 문제들은 다음번의 예정된 CI 작업 중에 감지됩니다. 하지만 PR을 제출하기 전에 자신의 컴퓨터에서 느린 테스트를 실행하는 것 또한 중요합니다. 느린 테스트로 표시해야 하는지 여부를 결정하는 대략적인 결정 기준은 다음과 같습니다. -만약 테스트가 라이브러리의 내부 구성 요소 중 하나에 집중되어 있다면(예: 모델링 파일, 토큰화 파일, 파이프라인), +만약 테스트가 라이브러리의 내부 구성 요소 중 하나에 집중되어 있다면(예: 모델링 파일, 토큰화 파일, 파이프라인), 해당 테스트를 느린 테스트 스위트에서 실행해야 합니다. -만약 라이브러리의 다른 측면(예: 문서 또는 예제)에 집중되어 있다면, +만약 라이브러리의 다른 측면(예: 문서 또는 예제)에 집중되어 있다면, 해당 테스트를 느린 테스트 스위트에서 실행해야 합니다. 그리고 이 접근 방식을 보완하기 위해 예외를 만들어야 합니다. -- 무거운 가중치 세트나 50MB보다 큰 데이터셋을 다운로드해야 하는 모든 테스트(예: 모델 통합 테스트, 토크나이저 통합 테스트, 파이프라인 통합 테스트)를 +- 무거운 가중치 세트나 50MB보다 큰 데이터셋을 다운로드해야 하는 모든 테스트(예: 모델 통합 테스트, 토크나이저 통합 테스트, 파이프라인 통합 테스트)를 느린 테스트로 설정해야 합니다. - 새로운 모델을 추가하는 경우 통합 테스트용으로 무작위 가중치로 작은 버전을 만들어 허브에 업로드해야 합니다. + 새로운 모델을 추가하는 경우 통합 테스트용으로 무작위 가중치로 작은 버전을 만들어 허브에 업로드해야 합니다. 이 내용은 아래 단락에서 설명됩니다. - 특별히 빠르게 실행되도록 최적화되지 않은 학습을 수행해야 하는 테스트는 느린 테스트로 설정해야 합니다. -- 느리지 않아야 할 테스트 중 일부가 극도로 느린 경우 - 예외를 도입하고 이를 `@slow`로 설정할 수 있습니다. +- 느리지 않아야 할 테스트 중 일부가 극도로 느린 경우 + 예외를 도입하고 이를 `@slow`로 설정할 수 있습니다. 대용량 파일을 디스크에 저장하고 불러오는 자동 모델링 테스트는 `@slow`으로 표시된 테스트의 좋은 예입니다. - CI에서 1초 이내에 테스트가 완료되는 경우(다운로드 포함)에는 느린 테스트가 아니어야 합니다. @@ -976,22 +976,22 @@ def test_integration_foo(): grep tiny tests examples ``` -다음은 작은 모델[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de)을 만든 -[script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) 예시입니다. +다음은 작은 모델[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de)을 만든 +[script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) 예시입니다. 특정 모델의 아키텍처에 맞게 쉽게 조정할 수 있습니다. -예를 들어 대용량 모델을 다운로드하는 경우 런타임을 잘못 측정하기 쉽지만, -로컬에서 테스트하면 다운로드한 파일이 캐시되어 다운로드 시간이 측정되지 않습니다. +예를 들어 대용량 모델을 다운로드하는 경우 런타임을 잘못 측정하기 쉽지만, +로컬에서 테스트하면 다운로드한 파일이 캐시되어 다운로드 시간이 측정되지 않습니다. 대신 CI 로그의 실행 속도 보고서를 확인하세요(`pytest --durations=0 tests`의 출력). -이 보고서는 느린 이상값으로 표시되지 않거나 빠르게 다시 작성해야 하는 느린 이상값을 찾는 데도 유용합니다. +이 보고서는 느린 이상값으로 표시되지 않거나 빠르게 다시 작성해야 하는 느린 이상값을 찾는 데도 유용합니다. CI에서 테스트 스위트가 느려지기 시작하면 이 보고서의 맨 위 목록에 가장 느린 테스트가 표시됩니다. ### stdout/stderr 출력 테스트[[testing-the-stdout/stderr-output]] -`stdout` 및/또는 `stderr`로 쓰는 함수를 테스트하려면 `pytest`의 [capsys 시스템](https://docs.pytest.org/en/latest/capture.html)을 사용하여 해당 스트림에 액세스할 수 있습니다. +`stdout` 및/또는 `stderr`로 쓰는 함수를 테스트하려면 `pytest`의 [capsys 시스템](https://docs.pytest.org/en/latest/capture.html)을 사용하여 해당 스트림에 액세스할 수 있습니다. 다음과 같이 수행할 수 있습니다. 
```python @@ -1019,7 +1019,7 @@ def test_result_and_stdout(capsys): assert msg in err ``` -그리고, 물론 대부분의 경우에는 `stderr`는 예외의 일부로 제공됩니다. +그리고, 물론 대부분의 경우에는 `stderr`는 예외의 일부로 제공됩니다. 그러므로 해당 경우에는 try/except를 사용해야 합니다. ```python @@ -1061,11 +1061,11 @@ def test_result_and_stdout(): ``` `stdout` 캡처에 관련된 중요한 문제 중 하나는 보통 `print`에서 이전에 인쇄된 내용을 재설정하는 `\r` 문자가 포함될 수 있다는 것입니다. -`pytest`에서는 문제가 없지만 `pytest -s`에서는 이러한 문자가 버퍼에 포함되므로 +`pytest`에서는 문제가 없지만 `pytest -s`에서는 이러한 문자가 버퍼에 포함되므로 `-s`가 있거나 없는 상태에서 태스트를 수행할 수 있으려면 캡처된 출력에 대해 추가적인 정리가 필요합니다. 이 경우에는 `re.sub(r'~.*\r', '', buf, 0, re.M)`을 사용할 수 있습니다. -하지만 도우미 컨텍스트 관리자 래퍼를 사용하면 +하지만 도우미 컨텍스트 관리자 래퍼를 사용하면 출력에 `\r`이 포함되어 있는지의 여부에 관계없이 모든 것을 자동으로 처리하므로 편리합니다. ```python @@ -1108,7 +1108,7 @@ with CaptureStd() as cs: print(cs.err, cs.out) ``` -또한, 테스트의 디버깅을 지원하기 위해 +또한, 테스트의 디버깅을 지원하기 위해 이러한 컨텍스트 관리자는 기본적으로 컨텍스트에서 종료할 때 캡처된 스트림을 자동으로 다시 실행합니다. @@ -1130,7 +1130,7 @@ assert cl.out, msg + "\n" ### 환경 변수를 이용하여 테스트[[testing-with-environment-variables]] -특정 테스트의 환경 변수 영향을 검증하려면 +특정 테스트의 환경 변수 영향을 검증하려면 `transformers.testing_utils.mockenv`라는 도우미 데코레이터를 사용할 수 있습니다. ```python @@ -1143,7 +1143,7 @@ class HfArgumentParserTest(unittest.TestCase): env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) ``` -일부 경우에는 외부 프로그램을 호출해야할 수도 있는데, 이 때에는 여러 개의 로컬 경로를 포함하는 `os.environ`에서 `PYTHONPATH`의 설정이 필요합니다. +일부 경우에는 외부 프로그램을 호출해야할 수도 있는데, 이 때에는 여러 개의 로컬 경로를 포함하는 `os.environ`에서 `PYTHONPATH`의 설정이 필요합니다. 헬퍼 클래스 `transformers.test_utils.TestCasePlus`가 도움이 됩니다: ```python @@ -1156,8 +1156,8 @@ class EnvExampleTest(TestCasePlus): # 이제 `env`를 사용하여 외부 프로그램 호출 ``` -테스트 파일이 `tests` 테스트 스위트 또는 `examples`에 있는지에 따라 -`env[PYTHONPATH]`가 두 디렉터리 중 하나를 포함하도록 설정되며, +테스트 파일이 `tests` 테스트 스위트 또는 `examples`에 있는지에 따라 +`env[PYTHONPATH]`가 두 디렉터리 중 하나를 포함하도록 설정되며, 현재 저장소에 대해 테스트가 수행되도록 `src` 디렉터리도 포함됩니다. 테스트 호출 이전에 설정된 경우에는 `env[PYTHONPATH]`를 그대로 사용합니다. @@ -1166,7 +1166,7 @@ class EnvExampleTest(TestCasePlus): ### 재현 가능한 결과 얻기[[getting-reproducible-results]] -일부 상황에서 테스트에서 임의성을 제거하여 동일하게 재현 가능한 결과를 얻고 싶을 수 있습니다. +일부 상황에서 테스트에서 임의성을 제거하여 동일하게 재현 가능한 결과를 얻고 싶을 수 있습니다. 이를 위해서는 다음과 같이 시드를 고정해야 합니다. ```python @@ -1207,11 +1207,11 @@ pytest tests/utils/test_logging.py -W error::UserWarning --pdb 셀프 푸시 워크플로우 CI 작업을 트리거하려면, 다음을 수행해야 합니다. 1. `transformers` 원본에서 새 브랜치를 만듭니다(포크가 아닙니다!). -2. 브랜치 이름은 `ci_` 또는 `ci-`로 시작해야 합니다(`main`도 트리거하지만 `main`에서는 PR을 할 수 없습니다). - 또한 특정 경로에 대해서만 트리거되므로 이 문서가 작성된 후에 변경된 내용은 +2. 브랜치 이름은 `ci_` 또는 `ci-`로 시작해야 합니다(`main`도 트리거하지만 `main`에서는 PR을 할 수 없습니다). + 또한 특정 경로에 대해서만 트리거되므로 이 문서가 작성된 후에 변경된 내용은 [여기](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml)의 *push:*에서 확인할 수 있습니다. 3. 이 브랜치에서 PR을 생성합니다 -4. 그런 다음 [여기](https://github.com/huggingface/transformers/actions/workflows/self-push.yml)에서 작업이 나타나는지 확인할 수 있습니다. +4. 그런 다음 [여기](https://github.com/huggingface/transformers/actions/workflows/self-push.yml)에서 작업이 나타나는지 확인할 수 있습니다. 백로그가 있는 경우, 바로 실행되지 않을 수도 있습니다. @@ -1219,13 +1219,13 @@ pytest tests/utils/test_logging.py -W error::UserWarning --pdb ## 실험적인 CI 기능 테스트[[testing-Experimental-CI-Features]] -CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 때문에 잠재적으로 문제가 발생할 수 있습니다. +CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 때문에 잠재적으로 문제가 발생할 수 있습니다. 따라서 새로운 CI 기능을 추가하는 경우 다음과 같이 수행해야 합니다. 1. 테스트해야 할 내용을 테스트하는 새로운 전용 작업을 생성합니다. 2. 새로운 작업은 항상 성공해야만 녹색 ✓를 받을 수 있습니다(아래에 자세한 내용이 있습니다). -3. 다양한 PR 유형에 대한 확인을 위해 - (사용자 포크 브랜치, 포크되지 않은 브랜치, github.com UI 직접 파일 편집에서 생성된 브랜치, 강제 푸시 등 PR의 유형은 아주 다양합니다.) +3. 
다양한 PR 유형에 대한 확인을 위해 + (사용자 포크 브랜치, 포크되지 않은 브랜치, github.com UI 직접 파일 편집에서 생성된 브랜치, 강제 푸시 등 PR의 유형은 아주 다양합니다.) 며칠 동안 실험 작업의 로그를 모니터링하면서 실행해봅니다. (의도적으로 항상 녹색을 표시하므로 작업 전체가 녹색은 아니라는 점에 유의합니다.) 4. 모든 것이 안정적인지 확인한 후, 새로운 변경 사항을 기존 작업에 병합합니다. @@ -1234,7 +1234,7 @@ CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 그러나 새로운 CI 기능이 개발 중인 동안, 항상 성공하도록 할 수 있는 방법은 무엇일까요? -TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작업을 성공한 것으로 보고하지만, +TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작업을 성공한 것으로 보고하지만, 현재 우리가 사용하는 CircleCI와 Github Actions는 이를 지원하지 않습니다. 따라서 다음과 같은 해결책을 사용할 수 있습니다. @@ -1264,12 +1264,12 @@ TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작 cmd_that_may_fail || true ``` -결과에 만족한 후에는 물론, 실험적인 단계 또는 작업을 일반 작업의 나머지 부분과 통합하면서 -`set +euo pipefail` 또는 기타 추가한 요소를 제거하여 +결과에 만족한 후에는 물론, 실험적인 단계 또는 작업을 일반 작업의 나머지 부분과 통합하면서 +`set +euo pipefail` 또는 기타 추가한 요소를 제거하여 실험 작업이 일반 CI 작동에 방해되지 않도록 해야 합니다. -이 전반적인 과정은 실험 단계가 PR의 전반적인 상태에 영향을 주지 않고 실패하도록 -`allow-failure`와 같은 기능을 설정할 수 있다면 훨씬 더 쉬웠을 것입니다. +이 전반적인 과정은 실험 단계가 PR의 전반적인 상태에 영향을 주지 않고 실패하도록 +`allow-failure`와 같은 기능을 설정할 수 있다면 훨씬 더 쉬웠을 것입니다. 그러나 앞에서 언급한 바와 같이 CircleCI와 Github Actions는 현재 이러한 기능들 지원하지 않습니다. 이 기능의 지원을 위한 투표에 참여하고 CI 관련 스레드들에서 이러한 상황을 확인할 수도 있습니다. diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8dda057f1b9..60ff7815a97 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -226,7 +226,7 @@ def is_pt_tf_cross_test(test_case): """ if not _run_pt_tf_cross_tests or not is_torch_available() or not is_tf_available(): - return unittest.skip("test is PT+TF test")(test_case) + return unittest.skip(reason="test is PT+TF test")(test_case) else: try: import pytest # We don't need a hard dependency on pytest in the main library @@ -245,7 +245,7 @@ def is_pt_flax_cross_test(test_case): """ if not _run_pt_flax_cross_tests or not is_torch_available() or not is_flax_available(): - return unittest.skip("test is PT+FLAX test")(test_case) + return unittest.skip(reason="test is PT+FLAX test")(test_case) else: try: import pytest # We don't need a hard dependency on pytest in the main library @@ -262,7 +262,7 @@ def is_staging_test(test_case): Those tests will run using the staging environment of huggingface.co instead of the real model hub. """ if not _run_staging: - return unittest.skip("test is staging test")(test_case) + return unittest.skip(reason="test is staging test")(test_case) else: try: import pytest # We don't need a hard dependency on pytest in the main library @@ -278,7 +278,7 @@ def is_pipeline_test(test_case): skipped. """ if not _run_pipeline_tests: - return unittest.skip("test is pipeline test")(test_case) + return unittest.skip(reason="test is pipeline test")(test_case) else: try: import pytest # We don't need a hard dependency on pytest in the main library @@ -293,7 +293,7 @@ def is_agent_test(test_case): Decorator marking a test as an agent test. If RUN_TOOL_TESTS is set to a falsy value, those tests will be skipped. """ if not _run_agent_tests: - return unittest.skip("test is an agent test")(test_case) + return unittest.skip(reason="test is an agent test")(test_case) else: try: import pytest # We don't need a hard dependency on pytest in the main library @@ -321,7 +321,7 @@ def tooslow(test_case): these will not be tested by the CI. 
""" - return unittest.skip("test is too slow")(test_case) + return unittest.skip(reason="test is too slow")(test_case) def custom_tokenizers(test_case): @@ -709,7 +709,7 @@ def require_torch_multi_gpu(test_case): To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests -k "multi_gpu" """ if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) import torch @@ -723,7 +723,7 @@ def require_torch_multi_accelerator(test_case): multi_accelerator: $ pytest -sv ./tests -k "multi_accelerator" """ if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) return unittest.skipUnless(backend_device_count(torch_device) > 1, "test requires multiple accelerators")( test_case @@ -735,7 +735,7 @@ def require_torch_non_multi_gpu(test_case): Decorator marking a test that requires 0 or 1 GPU setup (in PyTorch). """ if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) import torch @@ -747,7 +747,7 @@ def require_torch_non_multi_accelerator(test_case): Decorator marking a test that requires 0 or 1 accelerator setup (in PyTorch). """ if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) return unittest.skipUnless(backend_device_count(torch_device) < 2, "test requires 0 or 1 accelerator")(test_case) @@ -757,7 +757,7 @@ def require_torch_up_to_2_gpus(test_case): Decorator marking a test that requires 0 or 1 or 2 GPU setup (in PyTorch). """ if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) import torch @@ -769,7 +769,7 @@ def require_torch_up_to_2_accelerators(test_case): Decorator marking a test that requires 0 or 1 or 2 accelerator setup (in PyTorch). 
""" if not is_torch_available(): - return unittest.skip("test requires PyTorch")(test_case) + return unittest.skip(reason="test requires PyTorch")(test_case) return unittest.skipUnless(backend_device_count(torch_device) < 3, "test requires 0 or 1 or 2 accelerators") (test_case) @@ -806,7 +806,7 @@ def require_torch_multi_npu(test_case): To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu" """ if not is_torch_npu_available(): - return unittest.skip("test requires PyTorch NPU")(test_case) + return unittest.skip(reason="test requires PyTorch NPU")(test_case) return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case) @@ -830,7 +830,7 @@ def require_torch_multi_xpu(test_case): To run *only* the multi_xpu tests, assuming all test names contain multi_xpu: $ pytest -sv ./tests -k "multi_xpu" """ if not is_torch_xpu_available(): - return unittest.skip("test requires PyTorch XPU")(test_case) + return unittest.skip(reason="test requires PyTorch XPU")(test_case) return unittest.skipUnless(torch.xpu.device_count() > 1, "test requires multiple XPUs")(test_case) @@ -1078,7 +1078,7 @@ def require_bitsandbytes(test_case): except ImportError: return test_case else: - return unittest.skip("test requires bitsandbytes and torch")(test_case) + return unittest.skip(reason="test requires bitsandbytes and torch")(test_case) def require_optimum(test_case): diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 58425fecbcb..2eb3a40b99d 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -108,13 +108,13 @@ def require_deepspeed_aio(test_case): Decorator marking a test that requires deepspeed aio (nvme) """ if not is_deepspeed_available(): - return unittest.skip("test requires deepspeed")(test_case) + return unittest.skip(reason="test requires deepspeed")(test_case) import deepspeed from deepspeed.ops.aio import AsyncIOBuilder if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: - return unittest.skip("test requires deepspeed async-io")(test_case) + return unittest.skip(reason="test requires deepspeed async-io")(test_case) else: return test_case @@ -643,7 +643,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T # print(trainer.model.b.item()) # need to investigate at some point if (stage == ZERO3 and dtype == FP16) or (dtype == BF16): - return + self.skipTest(reason="When using zero3/fp16 or any/bf16 the optimizer seems run oddly") # it's enough that train didn't fail for this test, but we must check that # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) @@ -795,7 +795,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T # ToDo: Currently, hf_optim + hf_scheduler resumes with the correct states and # also has same losses for few steps but then slowly diverges. Need to figure it out. 
if optim == HF_OPTIM and scheduler == HF_SCHEDULER: - return + self.skipTest(reason="hf_optim + hf_scheduler resumes with the correct states but slowly diverges") output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) ds_config_dict = self.get_config_dict(stage) @@ -1113,7 +1113,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): @require_torch_multi_accelerator def test_inference(self, dtype): if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device): - self.skipTest("test requires bfloat16 hardware support") + self.skipTest(reason="test requires bfloat16 hardware support") # this is just inference, so no optimizer should be loaded # it only works for z3 (makes no sense with z1-z2) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index a35ea1a8e7e..9bf34c36692 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -80,7 +80,7 @@ class TestTrainerExt(TestCasePlus): logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history if not do_eval: - return + self.skipTest(reason="do_eval is False") eval_metrics = [log for log in logs if "eval_loss" in log.keys()] diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index f61adbbd906..3293cc279d0 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -463,9 +463,9 @@ class GenerationTesterMixin: config, input_ids, attention_mask = self._get_input_ids_and_config() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - self.skipTest("Won't fix: model with non-standard dictionary output shapes") + self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") config.use_cache = True config.is_decoder = True @@ -625,9 +625,9 @@ class GenerationTesterMixin: config, input_ids, attention_mask = self._get_input_ids_and_config() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - self.skipTest("Won't fix: model with non-standard dictionary output shapes") + self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") model = model_class(config).to(torch_device).eval() logits_process_kwargs, _ = self._get_logits_processor_and_warper_kwargs( @@ -667,7 +667,7 @@ class GenerationTesterMixin: def test_model_parallel_beam_search(self): for model_class in self.all_generative_model_classes: if "xpu" in torch_device: - return unittest.skip("device_map='auto' does not work with XPU devices") + return unittest.skip(reason="device_map='auto' does not work with XPU devices") if model_class._no_split_modules is None: continue @@ -765,7 +765,7 @@ class GenerationTesterMixin: # if no bos token id => cannot generate from None if config.bos_token_id is None: - return + self.skipTest(reason="bos_token_id is None") # hack in case they are equal, otherwise the attn mask will be [0] if config.bos_token_id == config.pad_token_id: @@ -982,17 +982,17 @@ class GenerationTesterMixin: def test_contrastive_generate(self): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + 
self.skipTest(reason="Stateful models don't support contrastive search generation") # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1009,17 +1009,17 @@ class GenerationTesterMixin: def test_contrastive_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + self.skipTest(reason="Stateful models don't support contrastive search generation") # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1045,18 +1045,18 @@ class GenerationTesterMixin: # Check that choosing 'low_memory' does not change the model output for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + self.skipTest(reason="Stateful models don't support contrastive search generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer", "speech2text"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]): - self.skipTest("TODO: fix me") + self.skipTest(reason="TODO: fix me") config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1087,9 +1087,9 @@ class GenerationTesterMixin: # Check that choosing 'low_memory' does not change the model output for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("May fix in the future: need custom cache handling") + self.skipTest(reason="May fix in the future: need custom cache handling") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1102,7 +1102,7 @@ class GenerationTesterMixin: "jamba", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") config, input_ids, _ = self._get_input_ids_and_config(batch_size=2) # batch_size=1 is ok, but batch_size>1 will cause non-identical output @@ -1135,9 +1135,9 @@ class GenerationTesterMixin: for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1151,14 +1151,14 @@ class GenerationTesterMixin: "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1206,9 +1206,9 @@ class GenerationTesterMixin: for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1222,14 +1222,14 @@ class GenerationTesterMixin: "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1268,9 +1268,9 @@ class GenerationTesterMixin: # different shapes, see https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535). for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1284,14 +1284,14 @@ class GenerationTesterMixin: "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1436,7 +1436,7 @@ class GenerationTesterMixin: # If it doesn't support cache, pass the test if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") model = model_class(config).to(torch_device) if "use_cache" not in inputs: @@ -1445,7 +1445,7 @@ class GenerationTesterMixin: # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) if "past_key_values" not in outputs: - self.skipTest("This model doesn't return `past_key_values`") + self.skipTest(reason="This model doesn't return `past_key_values`") num_hidden_layers = ( getattr(config, "decoder_layers", None) @@ -1553,14 +1553,14 @@ class GenerationTesterMixin: # Tests that we can continue generating from past key values, returned from a previous `generate` call for model_class in self.all_generative_model_classes: if any(model_name in model_class.__name__.lower() for model_name in ["imagegpt"]): - self.skipTest("Won't fix: old model with unique inputs/caches/other") + self.skipTest(reason="Won't fix: old model with unique inputs/caches/other") if any(model_name in model_class.__name__.lower() for model_name in ["umt5"]): - self.skipTest("TODO: needs modeling or test input preparation fixes for compatibility") + self.skipTest(reason="TODO: needs modeling or test input preparation fixes for compatibility") config, inputs = self.model_tester.prepare_config_and_inputs_for_common() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") # Let's make it always: # 1. use cache (for obvious reasons) @@ -1582,7 +1582,7 @@ class GenerationTesterMixin: # If "past_key_values" is not returned, skip the test (e.g. 
RWKV uses a different cache name and format) outputs = model(**inputs) if "past_key_values" not in outputs: - self.skipTest("This model doesn't return `past_key_values`") + self.skipTest(reason="This model doesn't return `past_key_values`") # Traditional way of generating text, with `return_dict_in_generate` to return the past key values outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True) @@ -1632,7 +1632,7 @@ class GenerationTesterMixin: # 👉 tests with and without sampling so we can cover the most common use cases. for model_class in self.all_generative_model_classes: if not model_class._supports_cache_class: - self.skipTest("This model does not support the new cache format") + self.skipTest(reason="This model does not support the new cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() config.use_cache = True @@ -1689,7 +1689,7 @@ class GenerationTesterMixin: def test_generate_with_quant_cache(self): for model_class in self.all_generative_model_classes: if not model_class._supports_quantized_cache: - self.skipTest("This model does not support the quantized cache format") + self.skipTest(reason="This model does not support the quantized cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() config.use_cache = True diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index e3f39257a68..beb910b9d15 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -67,7 +67,7 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 50a29eb550c..35000db677d 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -23,7 +23,6 @@ import requests from transformers import AlignConfig, AlignProcessor, AlignTextConfig, AlignVisionConfig from transformers.testing_utils import ( - is_flax_available, require_torch, require_vision, slow, @@ -56,10 +55,6 @@ if is_vision_available(): from PIL import Image -if is_flax_available(): - pass - - class AlignVisionModelTester: def __init__( self, @@ -215,9 +210,11 @@ class AlignVisionModelTest(ModelTesterMixin, unittest.TestCase): check_hidden_states_output(inputs_dict, config, model_class) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -355,9 +352,11 @@ class AlignTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -518,7 +517,7 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/altclip/test_modeling_altclip.py 
b/tests/models/altclip/test_modeling_altclip.py index 4f258e4ddb2..83b6d60595d 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -178,9 +178,11 @@ class AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -309,7 +311,7 @@ class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False # TODO (@SunMarc): Fix me - @unittest.skip("It's broken.") + @unittest.skip(reason="It's broken.") def test_resize_tokens_embeddings(self): super().test_resize_tokens_embeddings() @@ -324,9 +326,11 @@ class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -487,7 +491,7 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index 47e0beaeae8..9bb8ef33d75 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -754,7 +754,7 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): with torch.no_grad(): model(**inputs)[0] - @unittest.skip("FineModel relies on codebook idx and does not return same logits") + @unittest.skip(reason="FineModel relies on codebook idx and does not return same logits") def test_inputs_embeds_matches_input_ids(self): pass @@ -826,7 +826,7 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): # resizing tokens_embeddings of a ModuleList original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -877,7 +877,7 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): # resizing tokens_embeddings of a ModuleList original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is False") original_config.tie_word_embeddings = False @@ -931,7 +931,7 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): def test_flash_attn_2_inference_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support flash_attention_2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -988,7 +988,7 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): def test_flash_attn_2_inference_equivalence_right_padding(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support 
flash_attention_2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index ba9e112c186..a65ec043de8 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1515,9 +1515,10 @@ class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, un config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return + @unittest.skip def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index f3a63d6d417..274312983f1 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -147,6 +147,7 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item()) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index b2b0c7b058d..c76435958c6 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -75,7 +75,7 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 0e3e3e32d27..ac64f0fd3b0 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -301,7 +301,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -325,7 +325,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 8b2dbc3634b..6ae9f6c279d 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -614,7 +614,7 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin for model_class in self.all_model_classes: # BertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == BertForMultipleChoice: - return + self.skipTest(reason="BertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index 5cebf58029f..747b0cf2a73 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -79,7 +79,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index dd22eb4a6d2..7a7ad5071df 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -716,7 +716,7 @@ class BigBirdModelIntegrationTest(unittest.TestCase): """ if not self.test_attention_probs: - return + self.skipTest(reason="test_attention_probs is set to False") model = BigBirdModel.from_pretrained( "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index 863d30e8499..25f8de17700 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -63,7 +63,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 5e4ce9f0bbf..357b91a41e5 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -335,14 +335,15 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT def test_generate_without_input_ids(self): if self.model_tester.attention_type == "block_sparse": - # this test can never pass for BigBird-block-sparse attention since input_ids must be multiple of block_size - return + self.skipTest( + "Cannot pass for BigBird-block-sparse attention since input_ids must be multiple of block_size" + ) super().test_generate_without_input_ids() def test_retain_grad_hidden_states_attentions(self): if self.model_tester.attention_type == "block_sparse": # this test can't pass since attention matrix (which is getting returned) can't have gradients (& just 0 at many locations) - return + self.skipTest(reason="Cannot pass since returned attention matrix can't have gradients") super().test_retain_grad_hidden_states_attentions() # BigBirdPegasusForSequenceClassification does not support inputs_embeds @@ -811,6 +812,6 @@ class BigBirdPegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTeste config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip("Decoder cannot retain gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep
gradients return diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index 51e836a1f90..1ccb2b54cc9 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -414,7 +414,7 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @unittest.skip("The `input_embeds` when fed don't produce the same results.") + @unittest.skip(reason="The `input_embeds` when fed don't produce the same results.") def test_beam_sample_generate(self): pass diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index 6c6a0185f39..fa0797cbeed 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -565,6 +565,6 @@ class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMix config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 6f28b5959c7..6be86a66b98 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -564,6 +564,6 @@ class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTest config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/blip/test_image_processing_blip.py b/tests/models/blip/test_image_processing_blip.py index 905e1dad55e..9be86359a1c 100644 --- a/tests/models/blip/test_image_processing_blip.py +++ b/tests/models/blip/test_image_processing_blip.py @@ -130,18 +130,18 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes self.assertTrue(hasattr(image_processor, "image_std")) self.assertTrue(hasattr(image_processor, "do_convert_rgb")) - @unittest.skip("BlipImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="BlipImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_numpy(self): return super().test_call_numpy() - @unittest.skip("BlipImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="BlipImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_pytorch(self): return super().test_call_torch() - @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip(reason="BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_pil(self): pass - @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip(reason="BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_numpy_4_channels(self): pass 
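The hunks above and below all apply the same two conventions: guard clauses that used to silently `return` now call `self.skipTest(reason=...)`, and `@unittest.skip` decorators pass their message through the `reason` keyword. A minimal sketch, assuming a toy test class (the names `ToyTokenizerTest` and `test_rust_tokenizer` here are illustrative, not from the patch), shows why this matters for reporting:

import unittest


class ToyTokenizerTest(unittest.TestCase):
    # flag toggled per tokenizer, standing in for the flags used by the real test mixins
    test_rust_tokenizer = False

    def test_silently_passes(self):
        if not self.test_rust_tokenizer:
            return  # old pattern: the runner reports PASSED even though nothing was checked
        self.fail("real assertions would go here")

    def test_properly_skipped(self):
        if not self.test_rust_tokenizer:
            # new pattern: raises unittest.SkipTest, so the runner reports SKIPPED plus the reason
            self.skipTest(reason="test_rust_tokenizer is set to False")
        self.fail("real assertions would go here")

    @unittest.skip(reason="decorator form, used when the skip is unconditional")
    def test_skipped_by_decorator(self):
        pass


if __name__ == "__main__":
    unittest.main(verbosity=2)

Run with `python -m unittest -v` (or pytest with `-rs`) and the second and third tests are listed as skipped together with their reasons, whereas the first one counts as a pass; that visibility is what the conversion standardizes across the test suite.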
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index db71336e346..2f8ee3229ff 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -193,9 +193,11 @@ class BlipVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -335,9 +337,11 @@ class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -491,7 +495,7 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -932,7 +936,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -951,7 +955,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1008,7 +1012,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -1160,7 +1164,7 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1179,7 +1183,7 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1224,7 +1228,7 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 
ea6e138b7e7..85ab462a0d5 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -141,9 +141,11 @@ class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 9e295325b3f..28ed3a79cae 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -187,9 +187,11 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index d0ee36dc3ca..0952cfee3b7 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -389,7 +389,7 @@ class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bloom_weight_initialization(*config_and_inputs) - @unittest.skip("Bloom has a non-standard KV cache format.") + @unittest.skip(reason="Bloom has a non-standard KV cache format.") def test_past_key_values_format(self): pass diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 7a62f8f1a7f..a477d2538c7 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -43,7 +43,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): kwargs.update(self.special_tokens_map) return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - @unittest.skip("This needs a slow tokenizer. Bloom does not have one!") + @unittest.skip(reason="This needs a slow tokenizer. 
Bloom does not have one!") def test_encode_decode_with_spaces(self): return diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 0033021ea72..c3075beb506 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -300,15 +300,15 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(tokenizer.decode([255]) == "") - # tokenizer does not have vocabulary + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") def test_get_vocab(self): pass - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") def test_pretokenized_inputs(self): pass - # tests all ids in vocab => vocab doesn't exist so unnecessary to test + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") def test_conversion_reversible(self): pass diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index 340d5fc4561..1ff43e359d5 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -94,7 +94,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 3e7b27638c2..efc70dff499 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -441,7 +441,7 @@ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_headmasking(self): if not self.test_head_masking: - return + self.skipTest(reason="test_head_masking is set to False") global_rng.seed(42) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -496,7 +496,7 @@ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): check_attentions_validity(outputs.attentions) - @unittest.skip("CANINE does not have a get_input_embeddings() method.") + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") def test_inputs_embeds(self): # ViT does not use inputs_embeds pass @@ -505,7 +505,7 @@ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("CANINE does not have a get_input_embeddings() method.") + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index d34ac324eac..e7e19c63ce9 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -303,31 +303,32 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [additional_special_token]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [additional_special_token_id]) - # tokenizer has a fixed vocab_size (namely all possible unicode code points) + 
@unittest.skip(reason="tokenizer has a fixed vocab_size (namely all possible unicode code points)") def test_add_tokens_tokenizer(self): pass # CanineTokenizer does not support do_lower_case = True, as each character has its own Unicode code point # ("b" and "B" for example have different Unicode code points) + @unittest.skip(reason="CanineTokenizer does not support do_lower_case = True") def test_added_tokens_do_lower_case(self): pass - # CanineModel does not support the get_input_embeddings nor the get_vocab method + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") def test_np_encode_plus_sent_to_model(self): pass - # CanineModel does not support the get_input_embeddings nor the get_vocab method + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") def test_torch_encode_plus_sent_to_model(self): pass - # tokenizer does not have vocabulary + @unittest.skip(reason="CanineTokenizer does not have vocabulary") def test_get_vocab(self): pass - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + @unittest.skip(reason="inputs cannot be pretokenized since ids depend on whole input string") def test_pretokenized_inputs(self): pass - # tests all ids in vocab => vocab doesn't exist so unnecessary to test + @unittest.skip(reason="CanineTokenizer does not have vocabulary") def test_conversion_reversible(self): pass diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py index 94e41e8eaa0..168f84e9842 100644 --- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -17,7 +17,7 @@ import unittest from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -26,10 +26,6 @@ if is_vision_available(): from transformers import ChineseCLIPImageProcessor -if is_torch_available(): - pass - - class ChineseCLIPImageProcessingTester(unittest.TestCase): def __init__( self, @@ -125,7 +121,9 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase self.assertEqual(image_processor.size, {"shortest_edge": 42}) self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass @@ -155,14 +153,16 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_numpy(self): return super().test_call_numpy() - @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_pytorch(self): return 
super().test_call_torch() - @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 17d8ddcb1c4..7046f28b5f9 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -388,9 +388,11 @@ class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): model = ChineseCLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -466,9 +468,11 @@ class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -621,7 +625,7 @@ class ChineseCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 7cb558b97a9..8e3392133f1 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -562,7 +562,7 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 5221274ffae..78a5fb6f9ad 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -220,9 +220,11 @@ class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -381,9 +383,11 @@ class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -535,7 +539,7 @@ class CLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -636,7 +640,7 @@ class CLIPModelTest(ModelTesterMixin, PipelineTesterMixin, 
unittest.TestCase): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -692,8 +696,7 @@ class CLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index 5885f8933c1..c24f554a078 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -178,7 +178,6 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): super().test_tokenization_python_rust_equals() - # overwrite common test + @unittest.skip(reason="CLIP always lower cases letters") def test_added_tokens_do_lower_case(self): - # CLIP always lower cases letters pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index e9bfd2201e8..a6f286c4c6b 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -194,9 +194,11 @@ class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -331,9 +333,11 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -540,7 +544,7 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -641,7 +645,7 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -697,8 +701,7 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -744,7 +747,7 @@ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="Training test is skipped as the model was not trained") for model_class in self.all_model_classes: config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py index 7bb522f4144..71ea9c08c83 100644 --- a/tests/models/clvp/test_tokenization_clvp.py +++ b/tests/models/clvp/test_tokenization_clvp.py @@ -102,7 +102,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index fd4b38a17ec..ee07c54c16a 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -26,7 +26,6 @@ from transformers import ( AddedToken, CodeLlamaTokenizer, CodeLlamaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -44,10 +43,6 @@ from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -220,7 +215,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -240,7 +235,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -251,7 +246,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -306,11 +301,11 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): pickled_tokenizer = pickle.dumps(tokenizer) pickle.loads(pickled_tokenizer) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index e7945089c07..4832bf1962e 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py 
@@ -99,7 +99,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) @@ -127,6 +127,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + @unittest.skip def test_pretokenized_inputs(self, *args, **kwargs): # It's very difficult to mix/test pretokenization with byte-level # And get both CodeGen and Roberta to work at the same time (mostly an issue of adding a space before the string) @@ -262,6 +263,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): pass diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index d1caf041cd9..a8ab85fe3b8 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -51,7 +51,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_torch_encode_plus_sent_to_model(self): super().test_torch_encode_plus_sent_to_model() - @unittest.skip("This needs a slow tokenizer. Cohere does not have one!") + @unittest.skip(reason="This needs a slow tokenizer. Cohere does not have one!") def test_encode_decode_with_spaces(self): return diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 18f85a71e07..2e2973679e9 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -263,8 +263,8 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline pass @slow + @unittest.skip(reason="TODO Niels: fix me!") def test_model_outputs_equivalence(self): - # TODO Niels: fix me! pass def test_attention_outputs(self): diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index 0866f767987..84b50f57290 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -433,7 +433,7 @@ class ConvBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase for model_class in self.all_model_classes: # ConvBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == ConvBertForMultipleChoice: - return + self.skipTest(reason="ConvBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index ba3878ba51e..e5bb8e3d190 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -216,7 +216,7 @@ class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not set to test training") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() @@ -237,7 +237,7 @@ class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not set to test training") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index 64ee96b1e8e..404280428ef 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -154,7 +154,7 @@ class CpmAntModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): self.config_tester.run_common_tests() def test_inputs_embeds(self): - unittest.skip("CPMAnt doesn't support input_embeds.")(self.test_inputs_embeds) + unittest.skip(reason="CPMAnt doesn't support input_embeds.")(self.test_inputs_embeds) def test_retain_grad_hidden_states_attentions(self): unittest.skip( diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 8e9fb0d82fd..8bb16760ce6 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -426,22 +426,19 @@ class Data2VecAudioModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Data2VecAudio has no inputs_embeds + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="`input_ids` is renamed to `input_values`") def test_forward_signature(self): pass - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Data2VecAudio has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index 8f8a1fad447..c729d88d614 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -196,8 +196,8 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te def test_config(self): self.config_tester.run_common_tests() + 
@unittest.skip(reason="Data2VecVision does not use inputs_embeds") def test_inputs_embeds(self): - # Data2VecVision does not use inputs_embeds pass @require_torch_multi_gpu @@ -226,7 +226,7 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -245,7 +245,7 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 31031c8f7af..06c82c949cb 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -350,21 +350,21 @@ class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin model = DbrxModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Dbrx models have weight tying disabled.") + @unittest.skip(reason="Dbrx models have weight tying disabled.") def test_tied_weights_keys(self): pass # Offload does not work with Dbrx models because of the forward of DbrxExperts where we chunk the experts. # The issue is that the offloaded weights of the mlp layer are still on meta device (w1_chunked, v1_chunked, w2_chunked) - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_cpu_offload(self): pass - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_disk_offload_safetensors(self): pass - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_disk_offload_bin(self): pass diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index 8a0085986a5..da59fa28292 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -79,11 +79,11 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(rust_tokens, tokens_target) - @unittest.skip("There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): pass - @unittest.skip("There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") def test_sentencepiece_tokenize_and_decode(self): pass diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index f648e28f1da..b77ffb6e777 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -606,15 +606,15 @@ class 
DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index daf13d207b0..1b4ca6e206a 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -274,7 +274,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -296,7 +296,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index be75423827a..d1e36e32824 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -263,8 +263,8 @@ class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin pass @slow + @unittest.skip(reason="TODO Niels: fix me!") def test_model_outputs_equivalence(self): - # TODO Niels: fix me! pass def test_attention_outputs(self): diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index dcebd82aa91..7cfb5846e07 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -256,7 +256,7 @@ class DinatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): self.assertTrue(x is None or isinstance(x, nn.Linear)) def test_attention_outputs(self): - self.skipTest("Dinat's attention operation is handled entirely by NATTEN.") + self.skipTest(reason="Dinat's attention operation is handled entirely by NATTEN.") def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): model = model_class(config) diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 6bd821859ea..cde65080d2d 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -281,7 +281,7 @@ class DistilBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa for model_class in self.all_model_classes: # BertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == DistilBertForMultipleChoice: - return + self.skipTest(reason="DistilBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index 5a47856afed..11c01c39fa6 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -168,8 +168,8 @@ class DonutSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="DonutSwin does not use inputs_embeds") def test_inputs_embeds(self): - # DonutSwin does not use inputs_embeds pass def test_model_get_set_embeddings(self): diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py index f3648e9863a..2a9c47b93c2 100644 --- a/tests/models/electra/test_tokenization_electra.py +++ b/tests/models/electra/test_tokenization_electra.py @@ -78,7 +78,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index f720327ec71..e4f66d85641 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -178,29 +178,35 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) expected_arg_names = ["input_values", "padding_mask", "bandwidth"] self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") def test_inputs_embeds(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") def test_model_get_set_embeddings(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_torchscript_output_attentions(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) def test_torchscript_output_hidden_state(self): pass def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init 
= _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -288,7 +294,9 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # (Even with this call, there are still memory leak by ~0.04MB) self.clear_torch_jit_class_registry() - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_attention_outputs(self): pass @@ -321,19 +329,21 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) hidden_states_with_chunk = model(**inputs)[0] self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) def test_hidden_states_output(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 63bb11ee57e..5e5263b6afb 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -1005,6 +1005,7 @@ class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): "google-bert/bert-base-cased", "openai-community/gpt2" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass @@ -1079,6 +1080,7 @@ class ProphetNetEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass @@ -1135,6 +1137,7 @@ class BartEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): "google-bert/bert-large-uncased", "facebook/bart-large" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index da19d08e466..232d9176034 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -577,9 +577,8 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # ErnieForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == ErnieForMultipleChoice: - return + self.skipTest(reason="ErnieForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 3171264e254..56a7e4d0c67 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -290,11 +290,11 @@ class EsmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_embeddings_untied(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_tokens_embeddings(self): pass diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 11306e73686..5c05efb03f2 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -184,7 +184,7 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("Does not support attention outputs") + @unittest.skip(reason="Does not support attention outputs") def test_attention_outputs(self): pass @@ -192,75 +192,77 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def test_correct_missing_keys(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_embeddings_untied(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_tokens_embeddings(self): pass - @unittest.skip("ESMFold does not support passing input embeds!") + @unittest.skip(reason="ESMFold does not support passing input embeds!") def test_inputs_embeds(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_integration(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_save_load_from_config_init(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_save_load_from_pretrained(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_headmasking(self): pass - @unittest.skip("ESMFold does not output hidden states in the normal way.") + @unittest.skip(reason="ESMFold does not output hidden states in the normal way.") def test_hidden_states_output(self): pass - @unittest.skip("ESMfold does not output hidden states in the normal way.") + @unittest.skip(reason="ESMfold does not output hidden states in the normal way.") def 
test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("ESMFold only has one output format.") + @unittest.skip(reason="ESMFold only has one output format.") def test_model_outputs_equivalence(self): pass - @unittest.skip("This test doesn't work for ESMFold and doesn't test core functionality") + @unittest.skip(reason="This test doesn't work for ESMFold and doesn't test core functionality") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("ESMFold does not support input chunking.") + @unittest.skip(reason="ESMFold does not support input chunking.") def test_feed_forward_chunking(self): pass - @unittest.skip("ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments.") + @unittest.skip( + reason="ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments." + ) def test_initialization(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_output_attentions(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_output_hidden_state(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_simple(self): pass - @unittest.skip("ESMFold doesn't support data parallel.") + @unittest.skip(reason="ESMFold doesn't support data parallel.") def test_multi_gpu_data_parallel_forward(self): pass diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 50e8fcdbb4b..2fb9e664c7b 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -381,7 +381,7 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # If it doesn't support cache, pass the test if not hasattr(config, "use_cache"): - return + self.skipTest(reason="Model does not support cache") model = model_class(config).to(torch_device) if "use_cache" not in inputs: @@ -390,7 +390,7 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix # If "past_key_values" is not returned, pass the test (e.g. 
RWKV uses a different cache name and format) if "past_key_values" not in outputs: - return + self.skipTest(reason="Model does not return past_key_values") num_hidden_layers = ( getattr(config, "decoder_layers", None) diff --git a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py index 119e35555a8..72acb83999b 100644 --- a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py @@ -174,7 +174,7 @@ class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase) def test_convert_tokens_to_string_format(self): pass - @unittest.skip("FastSpeech2Conformer tokenizer does not support pairs.") + @unittest.skip(reason="FastSpeech2Conformer tokenizer does not support pairs.") def test_maximum_encoding_length_pair_input(self): pass diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index de0fd88db46..17502dc2735 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -477,7 +477,7 @@ class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase for model_class in self.all_model_classes: # FlauBertForMultipleChoice behaves incorrectly in JIT environments. if model_class == FlaubertForMultipleChoice: - return + self.skipTest(reason="FlauBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 388e2f041f2..d8c8f385e9c 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -176,8 +176,8 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): def test_config(self): self.config_tester.run_common_tests() + @unittest.skip(reason="FLAVA does not use inputs_embeds") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass def test_model_get_set_embeddings(self): @@ -300,9 +300,11 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): check_hidden_states_output(inputs_dict, config, model_class) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -318,13 +320,13 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # skip this test as FlavaImageModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass # skip this test as FlavaImageModel has no base class and is # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -459,9 +461,11 @@ class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -477,17 +481,16 @@ class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): def
test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use inputs_embeds") def test_inputs_embeds(self): # FLAVA does not use inputs_embeds pass - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -619,13 +622,15 @@ class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): expected_arg_names = ["hidden_states"] self.assertListEqual(arg_names[:1], expected_arg_names) + @unittest.skip(reason="FLAVA does not have input embeddings") def test_model_get_set_embeddings(self): - # No embedding in multimodal model pass + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -641,17 +646,15 @@ class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use inputs_embeds") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaMultimodalModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaMultimodalModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -742,20 +745,23 @@ class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): def test_attention_outputs(self): pass + @unittest.skip(reason="No embedding in multimodal model") def test_model_get_set_embeddings(self): - # No embedding in multimodal model pass + @unittest.skip def test_training(self): pass + @unittest.skip def test_hidden_states_output(self): pass + @unittest.skip(reason="FlavaImageCodebook has no attentions") def test_retain_grad_hidden_states_attentions(self): - # no attentions pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -771,20 +777,19 @@ class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use inputs_embeds") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass + @unittest.skip def test_model_outputs_equivalence(self): pass - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageCodebook has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageCodebook has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -931,19 +936,19 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs =
self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) - # hidden_states are tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass - # input_embeds are tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_inputs_embeds(self): pass - # tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_retain_grad_hidden_states_attentions(self): pass - # FlavaModel does not have input/output embeddings + @unittest.skip(reason="FlavaModel does not have input/output embeddings") def test_model_get_set_embeddings(self): pass @@ -973,7 +978,7 @@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 8686c60ab69..b7acf3610c0 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -321,6 +321,7 @@ class FNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): return inputs_dict # Overriden Tests + @unittest.skip def test_attention_outputs(self): pass diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index a3492cf966c..16f2e4950ef 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -69,7 +69,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -194,7 +194,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index c3ad05e300f..af95e0dca89 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -263,7 +263,7 @@ class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() model = FSMTModel(config).to(torch_device) @@ -312,23 +312,23 @@ class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin 2, ) - @unittest.skip("can't be implemented for FSMT due to dual vocab.") + @unittest.skip(reason="can't be implemented for FSMT due to dual vocab.") def 
test_resize_tokens_embeddings(self): pass - @unittest.skip("Passing inputs_embeds not implemented for FSMT.") + @unittest.skip(reason="Passing inputs_embeds not implemented for FSMT.") def test_inputs_embeds(self): pass - @unittest.skip("Input ids is required for FSMT.") + @unittest.skip(reason="Input ids is required for FSMT.") def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("model weights aren't tied in FSMT.") + @unittest.skip(reason="model weights aren't tied in FSMT.") def test_tie_model_weights(self): pass - @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") + @unittest.skip(reason="TODO: Decoder embeddings cannot be resized at the moment") def test_resize_embeddings_untied(self): pass @@ -582,7 +582,7 @@ class TestSinusoidalPositionalEmbeddings(unittest.TestCase): # odd num_embeddings is allowed SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx).to(torch_device) - @unittest.skip("different from marian (needs more research)") + @unittest.skip(reason="different from marian (needs more research)") def test_positional_emb_weights_against_marian(self): desired_weights = torch.tensor( [ diff --git a/tests/models/fsmt/test_tokenization_fsmt.py b/tests/models/fsmt/test_tokenization_fsmt.py index 4be15cbee13..bac487767ba 100644 --- a/tests/models/fsmt/test_tokenization_fsmt.py +++ b/tests/models/fsmt/test_tokenization_fsmt.py @@ -160,10 +160,10 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): expected = ["us", "a", "is", "un", "i", "ted", "st", "ates", "of", "am", "er", "ica"] self.assertListEqual(tokens, expected) - @unittest.skip("FSMTConfig.__init__ requires non-optional args") + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") def test_torch_encode_plus_sent_to_model(self): pass - @unittest.skip("FSMTConfig.__init__ requires non-optional args") + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") def test_np_encode_plus_sent_to_model(self): pass diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index f65498af33b..6065251c5bb 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -295,17 +295,17 @@ class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): pass # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_disk_offload_bin(self): super().test_disk_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_disk_offload_safetensors(self): super().test_disk_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 445a48e45a9..bdf01277437 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -398,11 +398,11 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Gemma buffers include complex numbers, which breaks this 
test") + @unittest.skip(reason="Gemma buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Gemma uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Gemma uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -456,7 +456,7 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Gemma flash attention does not support right padding") + self.skipTest(reason="Gemma flash attention does not support right padding") @require_torch_sdpa @require_torch_gpu @@ -464,7 +464,7 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_sdpa_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_sdpa: - return + self.skipTest(reason="Model does not support SDPA") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -498,7 +498,7 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_flash_attn_2_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support Flash Attention 2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -749,7 +749,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) - @unittest.skip("The test will not fit our CI runners") + @unittest.skip(reason="The test will not fit our CI runners") @require_read_token def test_model_7b_fp32(self): model_id = "google/gemma-7b" @@ -877,7 +877,7 @@ class GemmaIntegrationTest(unittest.TestCase): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 # work as intended. 
See https://github.com/pytorch/pytorch/issues/121943 if version.parse(torch.__version__) < version.parse("2.3.0"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index c322e6174b4..4201e31e6f5 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -23,7 +23,6 @@ from transformers import ( AddedToken, GemmaTokenizer, GemmaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -43,10 +42,6 @@ from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -68,7 +63,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -88,7 +83,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -99,7 +94,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -147,15 +142,15 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): padding=False, ) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass - @unittest.skip("Skipping") + @unittest.skip(reason="Skipping") def test_torch_encode_plus_sent_to_model(self): pass @@ -227,7 +222,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.tokenizer.add_eos_token = False self.rust_tokenizer.add_eos_token = False - @unittest.skip("Not super important and always failing. Let's skip it") + @unittest.skip(reason="Not super important and always failing. 
Let's skip it") @slow def test_conversion(self): # This is excruciatingly slow since it has to recreate the entire merge diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index ee2cf4927a5..a9c94f54f1f 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -167,9 +167,11 @@ class GitVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 151162fb1bb..81e95ab244f 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -168,11 +168,11 @@ class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - @unittest.skip("GLPN does not use inputs_embeds") + @unittest.skip(reason="GLPN does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("GLPN does not have get_input_embeddings method and get_output_embeddings methods") + @unittest.skip(reason="GLPN does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -283,7 +283,7 @@ class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index 1e7c81e4be2..9d13822ac64 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -98,7 +98,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) @@ -126,6 +126,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + @unittest.skip def test_pretokenized_inputs(self, *args, **kwargs): # It's very difficult to mix/test pretokenization with byte-level # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string) @@ -247,7 +248,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(decode_s.startswith(bos_token)) self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) - # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): pass @@ -331,7 +332,7 @@ class OPTTokenizationTest(unittest.TestCase): # Same as above self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758]) - @unittest.skip("This test is failing because of a bug in the fast tokenizer") + @unittest.skip(reason="This test is failing 
because of a bug in the fast tokenizer") def test_users_can_modify_bos(self): tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True) diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 3d4dd27fa47..cb1545c7fb5 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -458,27 +458,27 @@ class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste def test_config(self): self.config_tester.run_common_tests() - @unittest.skip("MQA models does not support retain_grad") + @unittest.skip(reason="MQA models does not support retain_grad") def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("Contrastive search not supported due to non-standard caching mechanism") + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") def test_contrastive_generate(self): pass - @unittest.skip("Contrastive search not supported due to non-standard caching mechanism") + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") def test_contrastive_generate_dict_outputs_use_cache(self): pass - @unittest.skip("CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") + @unittest.skip(reason="CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") def test_cpu_offload(self): pass - @unittest.skip("Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") + @unittest.skip(reason="Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") def test_disk_offload(self): pass - @unittest.skip("BigCodeGPT has a non-standard KV cache format.") + @unittest.skip(reason="BigCodeGPT has a non-standard KV cache format.") def test_past_key_values_format(self): pass diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py index ec505da4a00..029c8b99d44 100644 --- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py @@ -128,10 +128,11 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): assert encoded_sentence == ids_1 assert encoded_pair == ids_1 + ids_2 + @unittest.skip def test_conversion_reversible(self): # Intentionally convert some words to accommodate character fluctuations unique to Japanese pass + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): - # tokenizer has no padding token pass diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 74c07b775bc..ce31bc44a61 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -262,9 +262,11 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase): ], ) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -458,9 +460,11 @@ class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def 
test_training_gradient_checkpointing(self): pass @@ -618,7 +622,7 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py index b8bbd777581..02b2c54a2f0 100644 --- a/tests/models/herbert/test_tokenization_herbert.py +++ b/tests/models/herbert/test_tokenization_herbert.py @@ -95,7 +95,7 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index b040c57082f..cd801be41d7 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -350,22 +350,21 @@ class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Hubert has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Hubert has input_values instead of input_ids") def test_forward_signature(self): pass # Hubert cannot resize token embeddings # since it has no tokens embeddings + @unittest.skip(reason="Hubert has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Hubert has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Hubert has no inputs_embeds") def test_model_get_set_embeddings(self): pass @@ -438,10 +437,10 @@ class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): # TODO: fix it - self.skipTest("torch 2.1 breaks torch fx tests for wav2vec2/hubert.") + self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.") if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch fx is not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -615,22 +614,19 @@ class HubertRobustModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Hubert has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Hubert has input_values instead of input_ids") def test_forward_signature(self): pass - # Hubert cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Hubert has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Hubert has no inputs_embeds - # and thus the `get_input_embeddings`
fn - # is not implemented + @unittest.skip(reason="Hubert has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 342d8175455..b9b5054d904 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -379,7 +379,7 @@ class IBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): with torch.no_grad(): model(**inputs)[0] - @unittest.skip("ibert overrides scaling to None if inputs_embeds") + @unittest.skip(reason="ibert overrides scaling to None if inputs_embeds") def test_inputs_embeds_matches_input_ids(self): pass diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py index cb2b294bf54..2f7a8993df5 100644 --- a/tests/models/idefics/test_image_processing_idefics.py +++ b/tests/models/idefics/test_image_processing_idefics.py @@ -191,18 +191,18 @@ class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): torch.testing.assert_close(pixel_values_transform_implied, pixel_values_transform_supplied, rtol=0.0, atol=0.0) - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_numpy(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_numpy_4_channels(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_pil(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_pytorch(self): pass diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 91a9f661660..0197ebcaff5 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -316,7 +316,7 @@ class IdeficsModelTester: @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest("Idefics has a hard requirement on SDPA, skipping this test") + self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @@ -422,13 +422,13 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: # IdeficsModel does not support training, users should use # IdeficsForVisionText2Text for this purpose if model_class == IdeficsModel: - return + self.skipTest(reason="IdeficsModel does not support training") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -442,13 +442,13 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: # IdeficsModel does not support training, users should use # IdeficsForVisionText2Text for this purpose if model_class == IdeficsModel: - return + self.skipTest(reason="IdeficsModel does not support training") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False @@ 
-575,7 +575,7 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest("Idefics has a hard requirement on SDPA, skipping this test") + self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @@ -590,11 +590,11 @@ class IdeficsForVisionText2TextTest(IdeficsModelTest, unittest.TestCase): ) self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_model(self): pass - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_for_token_classification(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 63e6316773b..057ce93cd87 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -176,19 +176,19 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): self.model_tester = Idefics2VisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): pass - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_generate_padding_right(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -336,15 +336,15 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest self.model_tester = Idefics2VisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_generate_padding_right(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index a9dbc636ef3..aa1039103e9 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -176,7 +176,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): else: self.assertEqual(image_processor_first[key], value) - 
@unittest.skip("ImageGPT requires clusters at initialization") + @unittest.skip(reason="ImageGPT requires clusters at initialization") def test_init_without_params(self): pass @@ -220,7 +220,7 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) ) - @unittest.skip("ImageGPT assumes clusters for 3 channels") + @unittest.skip(reason="ImageGPT assumes clusters for 3 channels") def test_call_numpy_4_channels(self): pass diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index afb5ce87764..9cf45a3f21b 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -357,7 +357,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -404,13 +404,13 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untie embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -493,7 +493,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -573,7 +573,7 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM self.assertTrue(models_equal) - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) def test_left_padding_compatibility(self): pass diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index 5eab89a3adb..e4e86fb6952 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -278,17 +278,19 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings + @unittest.skip(reason="Informer does not have tokens embeddings") def test_resize_tokens_embeddings(self): pass + @unittest.skip def test_model_outputs_equivalence(self): pass + @unittest.skip def test_determinism(self): pass - @unittest.skip("randomly selects U keys while calculating attentions") + @unittest.skip(reason="randomly selects U keys while calculating attentions") def test_batching_equivalence(self): pass diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index f69eb0d806b..1688c685e1d 100644 ---
a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -390,7 +390,7 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi Overriding the test_mismatched_shapes_have_properly_initialized_weights test because A_log and D params of the Mamba block are initialized differently and we tested that in test_initialization """ - self.skipTest("Cumbersome and redundant for Jamba") + self.skipTest(reason="Cumbersome and redundant for Jamba") def test_attention_outputs(self): r""" @@ -638,9 +638,9 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi Overriding the test_flash_attn_2_inference_padding_right test as the Jamba model, like Mixtral, doesn't support right padding + use cache with FA2 """ - self.skipTest("Jamba flash attention does not support right padding") + self.skipTest(reason="Jamba flash attention does not support right padding") - @unittest.skip("Jamba has its own special cache type") + @unittest.skip(reason="Jamba has its own special cache type") @parameterized.expand([(1, False), (1, True), (4, False)]) def test_new_cache_format(self, num_beams, do_sample): pass diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index 12e5dd682c6..cdb82cb5a95 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -378,11 +378,11 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @unittest.skip("JetMoe buffers include complex numbers, which breaks this test") + @unittest.skip(reason="JetMoe buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("JetMoe uses MoA on all models so the KV cache is a non standard format") + @unittest.skip(reason="JetMoe uses MoA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -470,7 +470,7 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("JetMoe flash attention does not support right padding") + self.skipTest(reason="JetMoe flash attention does not support right padding") @require_torch diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 66f070ed462..6f34689004e 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -375,7 +375,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) # overwrite from common in order to use `config.text_config.vocab_size` instead of `config.vocab_size` def test_tie_model_weights(self): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -429,7 +429,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan 
configs_no_init.torchscript = True diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index 3ddd6e76603..eb0e1de626a 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -69,6 +69,7 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + @unittest.skip def test_special_tokens_as_you_expect(self): """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids""" pass diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index 5e213e0a364..09dabfc5bed 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -96,7 +96,7 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @unittest.skip("Tesseract version is not correct in ci. @Arthur FIXME") + @unittest.skip(reason="Tesseract version is not correct in ci. @Arthur FIXME") def test_layoutlmv2_integration_test(self): # with apply_OCR = True image_processing = LayoutLMv2ImageProcessor() diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 284ba82e3ff..3d366fe3e84 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -414,7 +414,7 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip("We cannot configure detectron2 to output a smaller backbone") + @unittest.skip(reason="We cannot configure detectron2 to output a smaller backbone") def test_model_is_small(self): pass diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 9f9a86a999d..0dbeef0c417 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -195,7 +195,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] ) - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -385,11 +385,11 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_split_special_tokens(self): pass @@ -814,7 +814,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -835,7 +835,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -942,7 +942,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -1003,7 +1003,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -1046,7 +1046,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1290,13 +1290,13 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1327,11 +1327,11 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = 
self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1349,7 +1349,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1403,7 +1403,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1593,7 +1593,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1630,7 +1630,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1743,7 +1743,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1837,7 +1837,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-1], 4) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass @@ -1875,7 +1875,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): return words, boxes, output_ids - # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + # @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) for tokenizer in tokenizers: @@ -2237,7 +2237,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(bbox, bbox_second_sequence) self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow) - # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + # @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_single_input(self): tokenizers = 
self.get_tokenizers(do_lower_case=False, model_max_length=100) for tokenizer in tokenizers: @@ -2359,15 +2359,15 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(bbox, sequence["bbox"][:-2]) self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) - @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv2 tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="LayoutLMv2 tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv2 fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="LayoutLMv2 fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -2476,10 +2476,10 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 80d29d3a46b..e478e0ac62c 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -140,7 +140,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): output_text = "lower newer" return input_text, output_text - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -265,11 +265,11 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_split_special_tokens(self): pass @@ -694,7 +694,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -715,7 +715,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -822,7 +822,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -883,7 +883,7 @@ class
LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -926,7 +926,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1168,13 +1168,13 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1205,11 +1205,11 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1227,7 +1227,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1281,7 +1281,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1471,7 +1471,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1508,7 +1508,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return +
self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1621,7 +1621,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1720,7 +1720,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-1], 4) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass @@ -2272,15 +2272,15 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # self.assertEqual(bbox, sequence["bbox"][:-2]) # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) - @unittest.skip("LayoutLMv3 tokenizer requires boxes besides sequences.") + @unittest.skip(reason="LayoutLMv3 tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv3 tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="LayoutLMv3 tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv3 fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="LayoutLMv3 fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -2393,7 +2393,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass @@ -2408,13 +2408,13 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -2433,6 +2433,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): model(encoded_sequence) model(batch_encoded_sequence) - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 03f2bf414bd..2f8b19a662a 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -107,7 +107,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): output_text = "unwanted, running" return input_text, output_text - 
@unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -115,7 +115,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # this tokenizer def test_save_sentencepiece_tokenizer(self) -> None: if not self.test_sentencepiece or not self.test_slow_tokenizer: - return + self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") # We want to verify that we will be able to save the tokenizer even if the original files that were used to # build the tokenizer have been deleted in the meantime. words, boxes = self.get_words_and_boxes() @@ -745,7 +745,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -766,7 +766,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -873,7 +873,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -934,7 +934,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -977,7 +977,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1066,7 +1066,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass @@ -1224,13 +1224,13 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1256,11 +1256,11 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not 
self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1278,7 +1278,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1332,7 +1332,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1522,7 +1522,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1559,7 +1559,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1672,7 +1672,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1770,7 +1770,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -1838,27 +1838,27 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname2) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides 
sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_single_input(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("layoutxlm tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="layoutxlm tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("layoutxlm fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="layoutxlm fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -1962,18 +1962,18 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_decode(self): pass - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py index 6f5c645855e..2247a64374d 100644 --- a/tests/models/led/test_modeling_led.py +++ b/tests/models/led/test_modeling_led.py @@ -378,8 +378,8 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + @unittest.skip(reason="Longformer cannot keep gradients in attentions or hidden states") def test_retain_grad_hidden_states_attentions(self): - # longformer cannot keep gradients in attentions or hidden states return def test_attention_outputs(self): diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index f287677a129..7d677bf3f5e 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -154,6 +154,7 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): outputs = tokenizer.pad(encoded_output) self.assertSequenceEqual(outputs["global_attention_mask"], expected_global_attention_mask) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py index 833e949d6e1..6199d9cdfcf 100644 --- a/tests/models/levit/test_modeling_levit.py +++ b/tests/models/levit/test_modeling_levit.py @@ -281,7 +281,7 @@ class LevitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # special case for LevitForImageClassificationWithTeacher model def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -303,7 +303,7 @@ class LevitModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 3e84552ab7e..0935e802c68 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -393,7 +393,7 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Llama buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Llama buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass @@ -710,7 +710,7 @@ class LlamaIntegrationTest(unittest.TestCase): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 # work as intended. See https://github.com/pytorch/pytorch/issues/121943 if version.parse(torch.__version__) < version.parse("2.3.0"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index a41774e9f5d..e45149672a8 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -26,7 +26,6 @@ from transformers import ( AddedToken, LlamaTokenizer, LlamaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -45,10 +44,6 @@ from ...test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -144,7 +139,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ], ) - @unittest.skip("Let's wait for the fast tokenizer!") + @unittest.skip(reason="Let's wait for the fast tokenizer!") def test_save_pretrained(self): self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -213,7 +208,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -233,7 +228,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -244,7 +239,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 
self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -299,11 +294,11 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): pickled_tokenizer = pickle.dumps(tokenizer) pickle.loads(pickled_tokenizer) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/llava_next/test_image_processing_llava_next.py b/tests/models/llava_next/test_image_processing_llava_next.py index ff5c9e97087..fc399298c39 100644 --- a/tests/models/llava_next/test_image_processing_llava_next.py +++ b/tests/models/llava_next/test_image_processing_llava_next.py @@ -197,7 +197,9 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): expected_output_image_shape = (7, 1445, 3, 18, 18) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - @unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py index 1ae3db4018a..ef133142573 100644 --- a/tests/models/longformer/test_modeling_longformer.py +++ b/tests/models/longformer/test_modeling_longformer.py @@ -384,11 +384,11 @@ class LongformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + @unittest.skip(reason="Longformer cannot keep gradients in attention or hidden states") def test_retain_grad_hidden_states_attentions(self): - # longformer cannot keep gradients in attentions or hidden states return - @unittest.skip("LongFormer calculates global attn only when attn_mask has non-zero elements") + @unittest.skip(reason="LongFormer calculates global attn only when attn_mask has non-zero elements") def test_batching_equivalence(self): return diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py index 7ed35f83612..65c42a0cab9 100644 --- a/tests/models/longformer/test_tokenization_longformer.py +++ b/tests/models/longformer/test_tokenization_longformer.py @@ -166,6 +166,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertNotEqual(first_char, space_encoding) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index 226265c5c0f..2b018309467 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -642,7 +642,7 @@ class 
LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="has_attentions is set to False") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -769,7 +769,7 @@ class LongT5TGlobalModelTest(LongT5ModelTest): def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="has_attentions is set to False") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1035,7 +1035,7 @@ class LongT5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="has_attentions is set to False") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1107,7 +1107,7 @@ class LongT5EncoderOnlyTGlobalModelTest(LongT5EncoderOnlyModelTest): def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="has_attentions is set to False") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index 0e5d9123156..ae6db98eaf8 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -130,6 +130,7 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertNotEqual(first_char, space_encoding) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index a98643b33cd..b019d3ed16d 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -766,15 +766,15 @@ class LxmertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): return tf_inputs_dict - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index 9500416c276..6f1c5306ff3 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -68,7 +68,7 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index ca323ee0da2..a2668e663b6 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -346,6 +346,7 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix 
model.resize_decoder_token_embeddings(config.vocab_size + 1) self.assertEqual(model.lm_head.weight.shape, (config.vocab_size + 1, config.d_model)) + @unittest.skip def test_tie_word_embeddings_decoder(self): pass @@ -367,15 +368,15 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass @@ -899,6 +900,6 @@ class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 5d74f88fe51..b2c0d20bdb2 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -101,7 +101,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return questions, nodes, xpaths - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -207,7 +207,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass @@ -708,7 +708,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -769,7 +769,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: nodes, xpaths = self.get_nodes_and_xpaths() @@ -814,7 +814,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1056,13 +1056,13 @@ class 
MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or does not have a pad token set") model = model_class(config) @@ -1088,11 +1088,11 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1110,7 +1110,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1172,7 +1172,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1367,7 +1367,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1406,7 +1406,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1519,7 +1519,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1613,7 +1613,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-2], 6) - @unittest.skip("TO DO: overwrite this very extensive 
test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass @@ -2152,15 +2152,15 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(xpath_tags_seq, sequence["xpath_tags_seq"][:-2]) self.assertEqual(overflowing_xpath_tags_seq, sequence["xpath_tags_seq"][-(2 + stride) :]) - @unittest.skip("MarkupLM tokenizer requires xpaths besides sequences.") + @unittest.skip(reason="MarkupLM tokenizer requires xpaths besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("MarkupLM tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="MarkupLM tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("MarkupLM fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="MarkupLM fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -2264,13 +2264,13 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") nodes, xpaths = self.get_nodes_and_xpaths() @@ -2290,7 +2290,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() # check correct behaviour if no pad_token_id exists and add it eventually @@ -2309,10 +2309,10 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): cm.records[0].message, ) - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass - @unittest.skip("The model tested fails `Hub -> Fast == Hub -> Slow`, nothing much we can do") + @unittest.skip(reason="The model tested fails `Hub -> Fast == Hub -> Slow`, nothing much we can do") def test_added_tokens_serialization(self): pass diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 64e2de13857..ba78cf9ce3f 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -276,7 +276,7 @@ class Mask2FormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") model_class = self.all_model_classes[1] config, pixel_values, pixel_mask, mask_labels, class_labels = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index 5f419b51557..513ac6f67b5 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -213,11 +213,11 @@ class MaskFormerSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te config_and_inputs = self.model_tester.prepare_config_and_inputs() 
self.model_tester.create_and_check_backbone(*config_and_inputs) - @unittest.skip("Swin does not use inputs_embeds") + @unittest.skip(reason="Swin does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("Swin does not support feedforward chunking") + @unittest.skip(reason="Swin does not support feedforward chunking") def test_feed_forward_chunking(self): pass diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index 5ac25724030..943b3fbf6f4 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -496,7 +496,7 @@ class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): ] tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] - @unittest.skip("This test is broken, still generates english") + @unittest.skip(reason="This test is broken, still generates english") def test_cc25_generate(self): inputs = self.tokenizer([self.src_text[0]], return_tensors="pt").to(torch_device) translated_tokens = self.model.generate( @@ -731,6 +731,6 @@ class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, u config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot retain gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py index 635be07aa1c..bbe800357e7 100644 --- a/tests/models/mbart/test_tokenization_mbart.py +++ b/tests/models/mbart/test_tokenization_mbart.py @@ -134,7 +134,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -202,7 +202,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname2) - @unittest.skip("Need to fix this after #26538") + @unittest.skip(reason="Need to fix this after #26538") def test_training_new_tokenizer(self): pass diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py index 799cd8afc3e..cd86bcf623a 100644 --- a/tests/models/mbart50/test_tokenization_mbart50.py +++ b/tests/models/mbart50/test_tokenization_mbart50.py @@ -112,7 +112,7 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index 2ba1b92c21c..ee6bedfd0ca 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -369,7 
+369,7 @@ TOLERANCE = 1e-4 @require_tokenizers class MegatronBertModelIntegrationTests(unittest.TestCase): @slow - @unittest.skip("Model is not available.") + @unittest.skip(reason="Model is not available.") def test_inference_no_head(self): directory = "nvidia/megatron-bert-uncased-345m" if "MYDIR" in os.environ: diff --git a/tests/models/megatron_gpt2/test_modeling_megatron_gpt2.py b/tests/models/megatron_gpt2/test_modeling_megatron_gpt2.py index cde07f34a86..47b8def921d 100644 --- a/tests/models/megatron_gpt2/test_modeling_megatron_gpt2.py +++ b/tests/models/megatron_gpt2/test_modeling_megatron_gpt2.py @@ -31,7 +31,7 @@ if is_torch_available(): @require_tokenizers class MegatronGPT2IntegrationTest(unittest.TestCase): @slow - @unittest.skip("Model is not available.") + @unittest.skip(reason="Model is not available.") def test_inference_no_head(self): directory = "nvidia/megatron-gpt2-345m/" if "MYDIR" in os.environ: diff --git a/tests/models/mgp_str/test_tokenization_mgp_str.py b/tests/models/mgp_str/test_tokenization_mgp_str.py index 035d43cc438..91ec39f027a 100644 --- a/tests/models/mgp_str/test_tokenization_mgp_str.py +++ b/tests/models/mgp_str/test_tokenization_mgp_str.py @@ -51,7 +51,7 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase): output_text = "tester" return input_text, output_text - @unittest.skip("MGP-STR always lower cases letters.") + @unittest.skip(reason="MGP-STR always lower cases letters.") def test_added_tokens_do_lower_case(self): pass @@ -86,10 +86,10 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(text_2.replace(" ", ""), output_text) - @unittest.skip("MGP-STR tokenizer only handles one sequence.") + @unittest.skip(reason="MGP-STR tokenizer only handles one sequence.") def test_maximum_encoding_length_pair_input(self): pass - @unittest.skip("inputs cannot be pretokenized in MgpstrTokenizer") + @unittest.skip(reason="inputs cannot be pretokenized in MgpstrTokenizer") def test_pretokenized_inputs(self): pass diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 3f00e6c9b7a..f2ba612a47b 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -397,11 +397,11 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Mistral buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Mistral buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Mistral uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Mistral uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -489,7 +489,7 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Mistral flash attention does not support right padding") + self.skipTest(reason="Mistral flash attention does not support right padding") @require_torch_gpu @@ -647,10 +647,10 @@ class MistralIntegrationTest(unittest.TestCase): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 # work as intended. 
See https://github.com/pytorch/pytorch/issues/121943 if version.parse(torch.__version__) < version.parse("2.3.0"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") if self.cuda_compute_capability_major_version == 7: - self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU.") + self.skipTest(reason="This test is failing (`torch.compile` fails) on Nvidia T4 GPU.") NUM_TOKENS_TO_GENERATE = 40 EXPECTED_TEXT_COMPLETION = { diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 02b203c7aeb..db9641e3dcb 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -398,11 +398,11 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Mixtral buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Mixtral buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Mixtral uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Mixtral uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -490,7 +490,7 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Mixtral flash attention does not support right padding") + self.skipTest(reason="Mixtral flash attention does not support right padding") # Ignore copy def test_load_balancing_loss(self): diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index 0497fa849ca..edb62a791c0 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -93,6 +93,7 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): ids = tokenizer.encode(txt, add_special_tokens=False) return txt, ids + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index e4ebca4b6e5..d7a409427c9 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -298,7 +298,7 @@ class MobileBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa return inputs_dict # TODO (@SunMarc): Fix me - @unittest.skip("It's broken.") + @unittest.skip(reason="It's broken.") def test_resize_tokens_embeddings(self): super().test_resize_tokens_embeddings() diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 0a01e2396af..2a5c250b849 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -87,7 +87,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_rust_and_python_full_tokenizers def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() 
rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index 10c0c164d16..9f97f3c11b5 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -246,7 +246,7 @@ class MPNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs) - @unittest.skip("TFMPNet adds poolers to all models, unlike the PT model class.") + @unittest.skip(reason="TFMPNet adds poolers to all models, unlike the PT model class.") def test_tf_from_pt_safetensors(self): return diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py index e691214cca8..55919cbbf95 100644 --- a/tests/models/mpt/test_modeling_mpt.py +++ b/tests/models/mpt/test_modeling_mpt.py @@ -422,7 +422,7 @@ class MptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_mpt_weight_initialization(*config_and_inputs) - @unittest.skip("For backward compatibility the lm_head is not in the model's state dict on the Hub.") + @unittest.skip(reason="For backward compatibility the lm_head is not in the model's state dict on the Hub.") def test_model_weights_reload_no_missing_tied_weights(self): pass diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index 1df002a124b..4c839f5da10 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -376,7 +376,9 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip("Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373.") + @unittest.skip( + reason="Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373." 
+ ) def test_batching_equivalence(self): pass diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 6964e8b4817..ec6ec6cd85c 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -593,7 +593,7 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch.fx is not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -837,7 +837,7 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, model = MT5Model.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() model = MT5Model(config_and_inputs[0]).to(torch_device) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index dd1a92cb75d..7fc2f8c9db4 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -205,7 +205,7 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False @@ -270,15 +270,15 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste def test_inputs_embeds_matches_input_ids(self): pass - # skip as this model doesn't support all arguments tested + @unittest.skip(reason="MusicGen does not support all arguments tested") def test_model_outputs_equivalence(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied") def test_tie_model_weights(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied") def test_tied_weights_keys(self): pass @@ -624,6 +624,9 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + if not self.all_model_classes[0]._supports_sdpa: self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") @@ -1085,7 +1088,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1262,27 
+1265,27 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, model = model_class(config) self.assertTrue(model.is_gradient_checkpointing) - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tie_model_weights(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tied_model_weights_key_ignore(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tied_weights_keys(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass @@ -1569,7 +1572,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # if no bos token id => cannot generate from None if config.bos_token_id is None: - return + self.skipTest(reason="bos_token_id is None") for model_class in self.greedy_sample_model_classes: model = model_class(config).to(torch_device) @@ -1615,7 +1618,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, self.assertNotIn(config.pad_token_id, output_generate) - @unittest.skip("MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composit model") + @unittest.skip( + reason="MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composite model" + ) def test_save_load_fast_init_from_base(self): pass @@ -1934,6 +1939,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + if not self.all_model_classes[0]._supports_sdpa: self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") diff --git a/tests/models/musicgen/test_processing_musicgen.py b/tests/models/musicgen/test_processing_musicgen.py index c0157ed0b55..be1e855725d 100644 --- a/tests/models/musicgen/test_processing_musicgen.py +++ b/tests/models/musicgen/test_processing_musicgen.py @@ -22,12 +22,9 @@ import numpy as np from transformers import T5Tokenizer, T5TokenizerFast from transformers.testing_utils import require_sentencepiece, require_torch -from transformers.utils.import_utils import is_speech_available, is_torch_available +from transformers.utils.import_utils import is_speech_available -if is_torch_available(): - pass - if is_speech_available(): from transformers import EncodecFeatureExtractor, MusicgenProcessor diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py 
index 125ca87ac74..7cebf037d27 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -208,7 +208,7 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.check_training_gradient_checkpointing with Musicgen->MusicgenMelody def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False @@ -273,15 +273,15 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("this model doesn't support all arguments tested") + @unittest.skip(reason="this model doesn't support all arguments tested") def test_model_outputs_equivalence(self): pass - @unittest.skip("this model has multiple inputs embeds and lm heads that should not be tied") + @unittest.skip(reason="this model has multiple inputs embeds and lm heads that should not be tied") def test_tie_model_weights(self): pass - @unittest.skip("this model has multiple inputs embeds and lm heads that should not be tied") + @unittest.skip(reason="this model has multiple inputs embeds and lm heads that should not be tied") def test_tied_weights_keys(self): pass @@ -626,6 +626,9 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes @slow # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + if not self.all_model_classes[0]._supports_sdpa: self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") @@ -1089,7 +1092,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1247,27 +1250,27 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester model = model_class(config) self.assertTrue(model.is_gradient_checkpointing) - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tie_model_weights(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tied_model_weights_key_ignore(self): pass - # skip as this model has multiple inputs embeds and lm heads that should not be tied + @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") def test_tied_weights_keys(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No 
support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass @@ -1553,7 +1556,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester # if no bos token id => cannot generate from None if config.bos_token_id is None: - return + self.skipTest(reason="bos_token_id is None") for model_class in self.greedy_sample_model_classes: model = model_class(config).to(torch_device) @@ -1600,7 +1603,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester self.assertNotIn(config.pad_token_id, output_generate) @unittest.skip( - "MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composit model" + reason="MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composite model" ) def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index 1348d9ed799..33c6d778448 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -818,6 +818,6 @@ class MvpStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, uni config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index a442848d4d5..9320f8f020d 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -146,6 +146,7 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item()) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index b0ef147db20..1759731bc68 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -207,7 +207,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch def test_prepare_seq2seq_batch(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -236,7 +236,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tgt_lang="ron_Latn", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling prepare_seq2seq_batch") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 10) # max_target_length will default to max_length if not specified @@ -253,7 +253,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + 
@unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -290,7 +290,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in cr_output) - @unittest.skip("Need to fix this after #26538") + @unittest.skip(reason="Need to fix this after #26538") def test_training_new_tokenizer(self): pass diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index caa61fc5c0d..a02dbcaf7f9 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -403,7 +403,7 @@ class NllbMoeModelIntegrationTests(unittest.TestCase): EXPECTED_LOGTIS = torch.Tensor([-0.3059, 0.0000, 9.3029, 0.6456, -0.9148, 1.7836, 0.6478, 0.9438, -0.5272, -0.6617, -1.2717, 0.4564, 0.1345, -0.2301, -1.0140, 1.1427, -1.5535, 0.1337, 0.2082, -0.8112, -0.3842, -0.3377, 0.1256, 0.6450, -0.0452, 0.0219, 1.4274, -0.4991, -0.2063, -0.4409,]) # fmt: skip torch.testing.assert_close(output.logits[1, 0, :30], EXPECTED_LOGTIS, rtol=6e-3, atol=9e-3) - @unittest.skip("This requires 300GB of RAM") + @unittest.skip(reason="This requires 300GB of RAM") def test_large_logits(self): model = self.big_model with torch.no_grad(): @@ -421,7 +421,7 @@ class NllbMoeModelIntegrationTests(unittest.TestCase): torch.testing.assert_close(output.last_hidden_state[1, 0, :30], EXPECTED_DECODER_STATE, rtol=6e-3, atol=9e-3) torch.testing.assert_close(output.logits[1, 0, :30], EXPECTED_LOGTIS, rtol=6e-3, atol=9e-3) - @unittest.skip("This requires 300GB of RAM") + @unittest.skip(reason="This requires 300GB of RAM") def test_seq_to_seq_generation(self): model = self.big_model tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-moe-54b") diff --git a/tests/models/nougat/test_tokenization_nougat.py b/tests/models/nougat/test_tokenization_nougat.py index 088ce56f6e6..38a9e3ba9c0 100644 --- a/tests/models/nougat/test_tokenization_nougat.py +++ b/tests/models/nougat/test_tokenization_nougat.py @@ -102,19 +102,19 @@ class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase): padding="max_length", ) - @unittest.skip("NougatTokenizerFast does not have tokenizer_file in its signature") + @unittest.skip(reason="NougatTokenizerFast does not have tokenizer_file in its signature") def test_rust_tokenizer_signature(self): pass - @unittest.skip("NougatTokenizerFast does not support pretokenized inputs") + @unittest.skip(reason="NougatTokenizerFast does not support pretokenized inputs") def test_pretokenized_inputs(self): pass - @unittest.skip("NougatTokenizerFast directly inherits from PreTrainedTokenizerFast") + @unittest.skip(reason="NougatTokenizerFast directly inherits from PreTrainedTokenizerFast") def test_prepare_for_model(self): pass - @unittest.skip("This needs a slow tokenizer. Nougat does not have one!") + @unittest.skip(reason="This needs a slow tokenizer. 
Nougat does not have one!") def test_encode_decode_with_spaces(self): pass diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index ee87521c5ba..b74d0fdf03b 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -301,7 +301,7 @@ class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("OLMo does not support head pruning.") + @unittest.skip(reason="OLMo does not support head pruning.") def test_headmasking(self): pass @@ -311,7 +311,7 @@ class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("OLMo buffers include complex numbers, which breaks this test") + @unittest.skip(reason="OLMo buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index a9dcdc2cfb9..853bf241dd9 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -213,6 +213,7 @@ class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): return inputs + @unittest.skip def test_init_without_params(self): pass diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 429920ac99e..ac8f044c556 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -377,7 +377,7 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") # only OneFormerForUniversalSegmentation has the loss model_class = self.all_model_classes[1] ( diff --git a/tests/models/oneformer/test_processor_oneformer.py b/tests/models/oneformer/test_processor_oneformer.py index e86341ef0ee..3a8a378b490 100644 --- a/tests/models/oneformer/test_processor_oneformer.py +++ b/tests/models/oneformer/test_processor_oneformer.py @@ -209,6 +209,7 @@ class OneFormerProcessingTest(unittest.TestCase): self.assertTrue(hasattr(processor, "max_seq_length")) self.assertTrue(hasattr(processor, "task_seq_length")) + @unittest.skip def test_batch_feature(self): pass @@ -397,6 +398,7 @@ class OneFormerProcessingTest(unittest.TestCase): return inputs + @unittest.skip def test_init_without_params(self): pass diff --git a/tests/models/openai/test_tokenization_openai.py b/tests/models/openai/test_tokenization_openai.py index 1f5ef5a35b3..5c8a76a5ae4 100644 --- a/tests/models/openai/test_tokenization_openai.py +++ b/tests/models/openai/test_tokenization_openai.py @@ -131,7 +131,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): padding="max_length", ) - # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): pass diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index e9843058f66..83721f1281f 100644 --- a/tests/models/opt/test_modeling_opt.py +++ 
b/tests/models/opt/test_modeling_opt.py @@ -322,7 +322,7 @@ class OPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/owlv2/test_image_processing_owlv2.py b/tests/models/owlv2/test_image_processing_owlv2.py index 51814b6dd80..6eda53a971b 100644 --- a/tests/models/owlv2/test_image_processing_owlv2.py +++ b/tests/models/owlv2/test_image_processing_owlv2.py @@ -168,6 +168,6 @@ class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}", ) - @unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip(reason="OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index c25a162dc81..48070c7bb86 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -494,7 +494,7 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -708,7 +708,7 @@ class Owlv2ForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 28b149bf09a..a08fae0bc6d 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -489,7 +489,7 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -701,7 +701,7 @@ class OwlViTForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index fbd460ce794..7753ae073dd 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -234,7 +234,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test pass # TODO extend 
valid outputs to include this test @Molbap - @unittest.skip("PaliGemma has currently one output format.") + @unittest.skip(reason="PaliGemma has currently one output format.") def test_model_outputs_equivalence(self): pass diff --git a/tests/models/patchtsmixer/test_modeling_patchtsmixer.py b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py index 13b0476e203..aae75b8586a 100644 --- a/tests/models/patchtsmixer/test_modeling_patchtsmixer.py +++ b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py @@ -317,7 +317,7 @@ class PatchTSMixerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test for model_class in self.all_model_classes: check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip("No tokens embeddings") + @unittest.skip(reason="No tokens embeddings") def test_resize_tokens_embeddings(self): pass diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index 0df3aaced5e..f7de1258847 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -592,6 +592,6 @@ class PegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index 66a68a97fc7..bb52b8c4731 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -128,10 +128,6 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): revision="ba85d0851d708441f91440d509690f1ab6353415", ) - # @unittest.skip("We have to use from_slow") - # def test_added_tokens_serialization(self): - # pass - @require_sentencepiece @require_tokenizers @@ -215,7 +211,3 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): token_ids, [182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1], ) - - # @unittest.skip("We have to use from_slow") - # def test_added_tokens_serialization(self): - # pass diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py index e5a6b8ebb8f..c6b4b2c8648 100644 --- a/tests/models/pegasus_x/test_modeling_pegasus_x.py +++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py @@ -872,6 +872,6 @@ class PegasusXStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index bbe9d0a43c3..b0ec8ac45c3 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -387,7 +387,7 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set 
to False") for model_class in self.all_model_classes: if model_class.__name__ in [ @@ -732,7 +732,7 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_correct_missing_keys(self): if not self.test_missing_keys: - return + self.skipTest(reason="test_missing_keys is set to False") config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index 306f594d108..de9bf36b434 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -270,15 +270,16 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.perceiver_tokenizer self.assertEqual(tokenizer.decode([178]), "�") - # tokenizer does not have vocabulary + @unittest.skip(reason="tokenizer does not have vocabulary") def test_get_vocab(self): pass - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + @unittest.skip(reason="inputs cannot be pretokenized") def test_pretokenized_inputs(self): + # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters pass - # tests all ids in vocab => vocab doesn't exist so unnecessary to test + @unittest.skip(reason="vocab does not exist") def test_conversion_reversible(self): pass diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 518cb7e0379..490ceb8141c 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -384,7 +384,7 @@ class PersimmonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Persimmon buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Persimmon buffers include complex numbers, which breaks this test") # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_save_load_fast_init_from_base def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/pix2struct/test_image_processing_pix2struct.py b/tests/models/pix2struct/test_image_processing_pix2struct.py index fee18814f0f..2d5616b5b78 100644 --- a/tests/models/pix2struct/test_image_processing_pix2struct.py +++ b/tests/models/pix2struct/test_image_processing_pix2struct.py @@ -335,14 +335,16 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), ) - @unittest.skip("Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_numpy(self): return super().test_call_numpy() - @unittest.skip("Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_pytorch(self): return super().test_call_torch() - @unittest.skip("Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently") # FIXME Amy + @unittest.skip( + reason="Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently" + ) # FIXME Amy def test_call_numpy_4_channels(self): return 
super().test_call_torch() diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index f92512d4d5b..2d762008cbb 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -489,7 +489,7 @@ class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -508,7 +508,7 @@ class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -555,7 +555,7 @@ class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -602,13 +602,13 @@ class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_resize_embeddings_untied(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untie embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -652,7 +652,7 @@ class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 778f7fe4fdf..9c16214a1c1 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -319,7 +319,7 @@ class PLBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - @unittest.skip("Failing since #26752") + @unittest.skip(reason="Failing since #26752") def test_sample_generate(self): pass @@ -664,6 +664,6 @@ class PLBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git 
a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py index fafae6ae250..d9a522cde6f 100644 --- a/tests/models/poolformer/test_modeling_poolformer.py +++ b/tests/models/poolformer/test_modeling_poolformer.py @@ -144,11 +144,11 @@ class PoolFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("PoolFormer does not use inputs_embeds") + @unittest.skip(reason="PoolFormer does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("PoolFormer does not have get_input_embeddings method and get_output_embeddings methods") + @unittest.skip(reason="PoolFormer does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -190,7 +190,7 @@ class PoolFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/pop2piano/test_feature_extraction_pop2piano.py b/tests/models/pop2piano/test_feature_extraction_pop2piano.py index 263006c670f..5a4652ad577 100644 --- a/tests/models/pop2piano/test_feature_extraction_pop2piano.py +++ b/tests/models/pop2piano/test_feature_extraction_pop2piano.py @@ -280,14 +280,14 @@ class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittes def test_padding_from_array(self): pass - @unittest.skip("Pop2PianoFeatureExtractor does not support truncation") + @unittest.skip(reason="Pop2PianoFeatureExtractor does not support truncation") def test_attention_mask_with_truncation(self): pass - @unittest.skip("Pop2PianoFeatureExtractor does not supports truncation") + @unittest.skip(reason="Pop2PianoFeatureExtractor does not support truncation") def test_truncation_from_array(self): pass - @unittest.skip("Pop2PianoFeatureExtractor does not supports truncation") + @unittest.skip(reason="Pop2PianoFeatureExtractor does not support truncation") def test_truncation_from_list(self): pass diff --git a/tests/models/pop2piano/test_tokenization_pop2piano.py b/tests/models/pop2piano/test_tokenization_pop2piano.py index 6c5301b8f3b..29e8eacf26f 100644 --- a/tests/models/pop2piano/test_tokenization_pop2piano.py +++ b/tests/models/pop2piano/test_tokenization_pop2piano.py @@ -372,7 +372,7 @@ class Pop2PianoTokenizerTest(unittest.TestCase): notes = self.get_input_notes() if self.tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: normal_tokens = self.tokenizer(notes[0], padding=True, pad_to_multiple_of=8) for key, value in normal_tokens.items(): @@ -400,9 +400,9 @@ class Pop2PianoTokenizerTest(unittest.TestCase): def test_padding_with_attention_mask(self): if self.tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") if "attention_mask" not in self.tokenizer.model_input_names: - self.skipTest("This model does not use attention mask.") + self.skipTest(reason="This model does not use attention mask.") features = [ {"token_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]}, diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py 
b/tests/models/prophetnet/test_modeling_prophetnet.py index b458aadf66c..99329437239 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -947,7 +947,7 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - @unittest.skip("Flaky test with no simple resolution. TODO Fix me @patrickvonplaten") + @unittest.skip(reason="Flaky test with no simple resolution. TODO Fix me @patrickvonplaten") def test_decoder_model_generate(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs) @@ -1112,8 +1112,8 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste self.assertIsNotNone(encoder_hidden_states.grad) self.assertIsNotNone(encoder_attentions.grad) + @unittest.skip(reason="Generating with head_masking has not been implemented for ProphetNet models yet.") def test_generate_with_head_masking(self): - """Generating with head_masking has not been implemented for ProphetNet models yet.""" pass @@ -1141,8 +1141,8 @@ class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMix config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py index de172258cae..e5f5fd0c143 100644 --- a/tests/models/pvt/test_modeling_pvt.py +++ b/tests/models/pvt/test_modeling_pvt.py @@ -178,11 +178,11 @@ class PvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("Pvt does not use inputs_embeds") + @unittest.skip(reason="Pvt does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("Pvt does not have get_input_embeddings method and get_output_embeddings methods") + @unittest.skip(reason="Pvt does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -235,7 +235,7 @@ class PvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/pvt_v2/test_modeling_pvt_v2.py b/tests/models/pvt_v2/test_modeling_pvt_v2.py index 96d4997489c..334e890e7a8 100644 --- a/tests/models/pvt_v2/test_modeling_pvt_v2.py +++ b/tests/models/pvt_v2/test_modeling_pvt_v2.py @@ -214,11 +214,11 @@ class PvtV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("Pvt-V2 does not use inputs_embeds") + @unittest.skip(reason="Pvt-V2 does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("Pvt-V2 does not have get_input_embeddings method 
and get_output_embeddings methods") + @unittest.skip(reason="Pvt-V2 does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -282,7 +282,7 @@ class PvtV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index f2e66953a7d..fcb7278cd59 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -408,11 +408,11 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Qwen2 buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Qwen2 buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Qwen2 uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Qwen2 uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -500,7 +500,7 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Qwen2 flash attention does not support right padding") + self.skipTest(reason="Qwen2 flash attention does not support right padding") @require_torch diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py index fba44c6dc81..b188fd2f8c4 100644 --- a/tests/models/qwen2/test_tokenization_qwen2.py +++ b/tests/models/qwen2/test_tokenization_qwen2.py @@ -136,14 +136,14 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 270, 267] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - @unittest.skip("We disable the test of pretokenization as it is not reversible.") + @unittest.skip(reason="We disable the test of pretokenization as it is not reversible.") def test_pretokenized_inputs(self): # the test case in parent class uses str.split to "pretokenize", # which eats the whitespaces, which, in turn, is not reversible. # the results, by nature, should be different. 
pass - @unittest.skip("We disable the test of clean up tokenization spaces as it is not applicable.") + @unittest.skip(reason="We disable the test of clean up tokenization spaces as it is not applicable.") def test_clean_up_tokenization_spaces(self): # it only tests bert-base-uncased and clean_up_tokenization_spaces is not applicable to this tokenizer pass @@ -169,7 +169,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_slow_tokenizer_token_with_number_sign(self): if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") sequence = " ###" token_ids = [268, 269] @@ -180,7 +180,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self): # Qwen2Tokenizer changes the default `spaces_between_special_tokens` in `decode` to False if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") # tokenizer has a special token: `"<|endfotext|>"` as eos, but it is not `legacy_added_tokens` # special tokens in `spaces_between_special_tokens` means spaces between `legacy_added_tokens` diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 13af5b802bd..36f1db4693a 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -433,11 +433,11 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Qwen2Moe buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Qwen2Moe buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Qwen2Moe uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Qwen2Moe uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -525,7 +525,7 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Qwen2Moe flash attention does not support right padding") + self.skipTest(reason="Qwen2Moe flash attention does not support right padding") # Ignore copy def test_load_balancing_loss(self): diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 69a321636ea..392ff40d770 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -91,7 +91,7 @@ def require_retrieval(test_case): """ if not (is_torch_available() and is_datasets_available() and is_faiss_available()): - test_case = unittest.skip("test requires PyTorch, datasets and faiss")(test_case) + test_case = unittest.skip(reason="test requires PyTorch, datasets and faiss")(test_case) return test_case diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index b0475b244f3..5a391ee6041 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -329,64 +329,65 @@ class RecurrentGemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT config_and_inputs[0].position_embedding_type = type 
self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="Fast init from base not tested for RecurrentGemma") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("RecurrentGemma does not return pkv") + @unittest.skip(reason="RecurrentGemma does not return pkv") def test_past_key_values_format(self): pass - @unittest.skip("RecurrentGemma only supports sdpa") + @unittest.skip(reason="RecurrentGemma only supports sdpa") def test_eager_matches_sdpa_generate(self): pass - @unittest.skip("RecurrentGemma only supports sdpa") + @unittest.skip(reason="RecurrentGemma only supports sdpa") def test_eager_matches_sdpa_inference(self): pass - @unittest.skip("RecurrentGemma does not return the cache") + @unittest.skip(reason="RecurrentGemma does not return the cache") def test_contrastive_generate_low_memory(self): pass - @unittest.skip("RecurrentGemma does not return the cache") + @unittest.skip(reason="RecurrentGemma does not return the cache") def test_contrastive_generate_dict_outputs_use_cache(self): pass - @unittest.skip("RecurrentGemma does not return the cache") + @unittest.skip(reason="RecurrentGemma does not return the cache") def test_contrastive_generate(self): pass - @unittest.skip("SQRBound is known to have issues with gc") + @unittest.skip(reason="SQRBound is known to have issues with gc") def test_training_gradient_checkpointing_use_reentrant_false(self): pass def _check_attentions_for_generate(self, *args, **kwargs): return True # Model does not return attention - @unittest.skip("Past key values are not returned") + @unittest.skip(reason="Past key values are not returned") def test_prompt_lookup_decoding_matches_greedy_search(self): pass - @unittest.skip("Past key values are not returned") + @unittest.skip(reason="Past key values are not returned") def test_model_parallelism(self): pass - @unittest.skip("Past key values are not returned") + @unittest.skip(reason="Past key values are not returned") def test_model_parallel_beam_search(self): pass def _check_past_key_values_for_generate(self, *args, **kwargs): return True - @unittest.skip("Rely on `past_key_values` to crop the assistant pkv. Not supported") + @unittest.skip(reason="Rely on `past_key_values` to crop the assistant pkv. Not supported") def test_assisted_decoding_matches_greedy_search(self): pass - @unittest.skip("RecurrentGemma's output different if you pad left or right. This is expected") + @unittest.skip(reason="RecurrentGemma's output is different if you pad left or right. This is expected") def test_left_padding_compatibility(self): pass - @unittest.skip("Relies on `past_key_values` returned by the model. Not supported with recurrent gemma") + @unittest.skip(reason="Relies on `past_key_values` returned by the model. 
Not supported with recurrent gemma") def test_assisted_decoding_sample(self): pass @@ -409,7 +410,7 @@ class RecurrentGemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT [expected_shape] * len(iter_hidden_states), ) - @unittest.skip("TODO @arthurzucker not super important and failing.") + @unittest.skip(reason="TODO @arthurzucker not super important and failing.") def test_initialization(self): pass diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py index 3a33a682d18..152c4f2ba33 100644 --- a/tests/models/reformer/test_modeling_reformer.py +++ b/tests/models/reformer/test_modeling_reformer.py @@ -208,9 +208,6 @@ class ReformerModelTester: ) def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels): - if not self.is_training: - return - config.is_decoder = False config.lsh_num_chunks_after = 1 model = ReformerForMaskedLM(config=config) @@ -328,9 +325,6 @@ class ReformerModelTester: ) def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels): - if not self.is_training: - return - # disable dropout config.hidden_dropout_prob = 0 config.local_attention_probs_dropout_prob = 0 @@ -517,6 +511,8 @@ class ReformerTesterMixin: self.model_tester.create_and_check_reformer_model(*config_and_inputs) def test_reformer_lm_model_backward(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_reformer_model_with_lm_backward(*config_and_inputs) @@ -539,6 +535,8 @@ class ReformerTesterMixin: self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=False) def test_reformer_chunking_backward_equality(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs) @@ -587,12 +585,12 @@ class ReformerTesterMixin: config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_reformer_for_sequence_classification(*config_and_inputs, is_decoder=False) + @unittest.skip(reason="Reformer cannot keep gradients in attentions or hidden states") def test_retain_grad_hidden_states_attentions(self): - # reformer cannot keep gradients in attentions or hidden states return + @unittest.skip(reason="Reformer cannot resize embeddings that easily") def test_resize_embeddings_untied(self): - # reformer cannot resize embeddings that easily return @@ -682,7 +680,7 @@ class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, Mod [expected_shape] * len(iter_hidden_states), ) - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) def test_left_padding_compatibility(self): pass @@ -847,15 +845,15 @@ class ReformerLSHAttnModelTest( [expected_shape] * len(iter_hidden_states), ) - @unittest.skip("Fails because the sequence length is not a multiple of 4") + @unittest.skip(reason="Fails because the sequence length is not a multiple of 4") def test_problem_types(self): pass - @unittest.skip("Fails because the sequence length is not a multiple of 4") + 
@unittest.skip(reason="Fails because the sequence length is not a multiple of 4") def test_past_key_values_format(self): pass - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) def test_left_padding_compatibility(self): pass diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py index ee9bd52f435..89f3ef7c4ec 100644 --- a/tests/models/reformer/test_tokenization_reformer.py +++ b/tests/models/reformer/test_tokenization_reformer.py @@ -61,7 +61,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -125,7 +125,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): padding="max_length", ) - # tokenizer has no padding token + @unittest.skip(reason="Tokenizer has no padding token") def test_padding_different_model_input_name(self): pass diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py index 83f444d1629..84fde55e0ae 100644 --- a/tests/models/roberta/test_tokenization_roberta.py +++ b/tests/models/roberta/test_tokenization_roberta.py @@ -164,6 +164,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertNotEqual(first_char, space_encoding) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index c1096081070..2c5b9c65e96 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -65,11 +65,11 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) - # can't train new_tokenizer via Tokenizers lib + @unittest.skip(reason="Cannot train new tokenizer via Tokenizers lib") def test_training_new_tokenizer(self): pass - # can't train new_tokenizer via Tokenizers lib + @unittest.skip(reason="Cannot train new tokenizer via Tokenizers lib") def test_training_new_tokenizer_with_special_tokens_change(self): pass diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py index 7ccc04bc54c..efb8d87cac1 100644 --- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py @@ -141,7 +141,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ], ) - @unittest.skip("This fails currently and is a blocker. No idea why TODO @ylacombe") + @unittest.skip(reason="This fails currently and is a blocker. 
No idea why TODO @ylacombe") def test_maximum_encoding_length_single_input(self): tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) for tokenizer in tokenizers: @@ -244,7 +244,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) - @unittest.skip("By defaults, uses pad_to_multiple_of which breaks the test") + @unittest.skip(reason="By defaults, uses pad_to_multiple_of which breaks the test") def test_maximum_encoding_length_pair_input(self): pass @@ -253,7 +253,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) @@ -286,7 +286,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch def test_prepare_seq2seq_batch(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -316,7 +316,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): pad_to_multiple_of=None, ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling prepare_seq2seq_batch") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 10) @@ -343,7 +343,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 4) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -390,7 +390,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -425,11 +425,11 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) - @unittest.skip("Fails because of the hack of adding in _tokenize") + @unittest.skip(reason="Fails because of the hack of adding in _tokenize") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("Fails because of the hack of adding in _tokenize") + @unittest.skip(reason="Fails because of the hack of adding in _tokenize") def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py index 620675d39e2..9b5e04a5d02 100644 --- a/tests/models/segformer/test_modeling_segformer.py +++ b/tests/models/segformer/test_modeling_segformer.py @@ -200,11 +200,11 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas config_and_inputs = 
self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs) - @unittest.skip("SegFormer does not use inputs_embeds") + @unittest.skip(reason="SegFormer does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods") + @unittest.skip(reason="SegFormer does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -315,7 +315,7 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index ebe0ef30a38..fe10d994450 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -336,34 +336,31 @@ class SEWModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Sew has no inputs_embeds.") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Sew has input_values instead of input_ids.") def test_forward_signature(self): pass - # SEW cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Sew has no token embeddings.") def test_resize_tokens_embeddings(self): pass - # SEW has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Sew has no inputs_embeds.") def test_model_get_set_embeddings(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index c7b6e30b189..9fd94fbfef2 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -358,22 +358,19 @@ class SEWDModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # SEW cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # SEW has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + 
@unittest.skip(reason="Model has no inputs_embeds") def test_model_get_set_embeddings(self): pass @@ -459,15 +456,15 @@ class SEWDModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_feed_forward_chunking(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/siglip/test_image_processing_siglip.py b/tests/models/siglip/test_image_processing_siglip.py index 7dbd05070c6..77a0432d866 100644 --- a/tests/models/siglip/test_image_processing_siglip.py +++ b/tests/models/siglip/test_image_processing_siglip.py @@ -120,7 +120,7 @@ class SiglipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ) self.assertEqual(image_processor.size, {"height": 84, "width": 84}) - @unittest.skip("not supported") + @unittest.skip(reason="not supported") # Ignore copy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index af5d0bf2bc3..a8e1bb7b0f1 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -335,10 +335,12 @@ class SiglipTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training def test_training(self): pass + @unittest.skip # Copied from tests.models.clip.test_modeling_clip.CLIPTextModelTest.test_training_gradient_checkpointing def test_training_gradient_checkpointing(self): pass @@ -502,7 +504,7 @@ class SiglipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->Siglip def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py index 839c0c32002..b44451f6f20 100644 --- a/tests/models/siglip/test_tokenization_siglip.py +++ b/tests/models/siglip/test_tokenization_siglip.py @@ -142,7 +142,7 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -317,7 +317,7 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): """Test ``_tokenize`` and ``convert_tokens_to_string``.""" if not self.test_sentencepiece: 
- return + self.skipTest(reason="test_sentencepiece is set to False") tokenizer = self.get_tokenizer() text = "This is text to test the tokenizer." diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index d7b85e7b48b..b193cacfb40 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -569,15 +569,15 @@ class Speech2TextBertModelTest(EncoderDecoderMixin, unittest.TestCase): "labels": decoder_token_labels, } - # can't save full model for now because Speech2TextModel != Speech2TextEncoder + @unittest.skip(reason="Cannot save full model as Speech2TextModel != Speech2TextEncoder") def test_encoder_decoder_model_from_pretrained_configs(self): pass - # can't save full model for now because Speech2TextModel != Speech2TextEncoder + @unittest.skip(reason="Cannot save full model as Speech2TextModel != Speech2TextEncoder") def test_save_and_load_from_pretrained(self): pass @require_deterministic_for_xpu - # all published pretrained models are Speech2TextModel != Speech2TextEncoder + @unittest.skip(reason="Cannot save full model as Speech2TextModel != Speech2TextEncoder") def test_real_model_save_load_from_pretrained(self): pass diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index ddd83974e82..44672f1c588 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -326,14 +326,15 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # not implemented currently + @unittest.skip(reason="Not implemented currently") def test_inputs_embeds(self): pass - # training is not supported yet + @unittest.skip(reason="Training is not supported yet") def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -536,7 +537,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -584,13 +585,13 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untie embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -627,6 +628,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + @unittest.skip def test_generate_without_input_ids(self): pass @@ -695,7 +697,7 @@ class 
Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -773,7 +775,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest # Allow missing keys since TF doesn't cache the sinusoidal embeddings in an attribute super().test_pt_tf_model_equivalence(allow_missing_keys=allow_missing_keys) - @unittest.skip("Test failing, @RocketNight is looking into it") + @unittest.skip(reason="Test failing, @RocketNight is looking into it") def test_tf_from_pt_safetensors(self): pass diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 06afc90c2cc..1d67bb4f8ab 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -212,31 +212,31 @@ class SpeechT5ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ) self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - # this model has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # this model has no input embeddings + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients pass @slow + @unittest.skip(reason="Model does not have decoder_input_ids") def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model does not have decoder_input_ids") def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model does not have decoder_input_ids") def test_torchscript_simple(self): - # disabled because this model doesn't have decoder_input_ids pass @@ -598,19 +598,20 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase): ) # this model has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass def test_resize_embeddings_untied(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untie embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -650,7 +651,7 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase): def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -692,14 +693,16 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase): self.assertTrue(models_equal) + @unittest.skip(reason="Decoder cannot keep gradients") def 
test_retain_grad_hidden_states_attentions(self): # decoder cannot keep gradients pass - # training is not supported yet + @unittest.skip(reason="Training is not supported yet") def test_training(self): pass + @unittest.skip(reason="Training is not supported yet") def test_training_gradient_checkpointing(self): pass @@ -928,15 +931,15 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase): (self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.num_mel_bins), ) - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_decoder_model_past_with_large_inputs(self): pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_determinism(self): pass - @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") + @unittest.skip(reason="skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") def test_batching_equivalence(self): pass @@ -985,41 +988,43 @@ class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - # this model has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_model_outputs_equivalence(self): pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_save_load(self): pass + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_simple(self): # disabled because this model doesn't have decoder_input_ids pass - # training is not supported yet + @unittest.skip(reason="training is not supported yet") def test_training(self): pass + @unittest.skip(reason="training is not supported yet") def test_training_gradient_checkpointing(self): pass @@ -1472,15 +1477,15 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): (self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.num_mel_bins), ) - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="There is always dropout in SpeechT5SpeechDecoderPrenet") def test_decoder_model_past_with_large_inputs(self): pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="There is always dropout in SpeechT5SpeechDecoderPrenet") def test_determinism(self): pass - @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") + @unittest.skip(reason="skipped because there is always dropout in 
SpeechT5SpeechDecoderPrenet") def test_batching_equivalence(self): pass @@ -1685,45 +1690,46 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - # this model has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # this model has no input embeddings + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_model_outputs_equivalence(self): pass + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients pass - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet + @unittest.skip(reason="Dropout is always present in SpeechT5SpeechDecoderPrenet") def test_save_load(self): pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids pass @slow + @unittest.skip(reason="Model doesn't have decoder_input_ids") def test_torchscript_simple(self): - # disabled because this model doesn't have decoder_input_ids pass - # training is not supported yet + @unittest.skip(reason="Training is not supported yet") def test_training(self): pass + @unittest.skip(reason="Training is not supported yet") def test_training_gradient_checkpointing(self): pass @@ -1873,35 +1879,35 @@ class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase): ] self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - # this model does not output hidden states + @unittest.skip(reason="Model does not output hidden states") def test_hidden_states_output(self): pass - # skip + @unittest.skip def test_initialization(self): pass - # this model has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # this model has no input embeddings + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass - # skip as this model doesn't support all arguments tested + @unittest.skip(reason="Model does not support all arguments tested") def test_model_outputs_equivalence(self): pass - # this model does not output hidden states + @unittest.skip(reason="Model does not output hidden states") def test_retain_grad_hidden_states_attentions(self): pass - # skip because it fails on automapping of SpeechT5HifiGanConfig + @unittest.skip(reason="Fails on automapping of SpeechT5HifiGanConfig") def test_save_load_fast_init_from_base(self): pass - # skip because it fails on automapping of SpeechT5HifiGanConfig + @unittest.skip(reason="Fails on automapping of SpeechT5HifiGanConfig") def test_save_load_fast_init_to_base(self): pass diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py index d007b14dd22..8b53031f524 100644 --- a/tests/models/speecht5/test_tokenization_speecht5.py +++ b/tests/models/speecht5/test_tokenization_speecht5.py @@ -143,9 +143,11 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, 
unittest.TestCase): self.assertEqual(tokens[0], tokenizer.eos_token_id) self.assertEqual(tokens[-3], tokenizer.pad_token_id) + @unittest.skip def test_pickle_subword_regularization_tokenizer(self): pass + @unittest.skip def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 7b05e5c165f..edbc1bce639 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -389,11 +389,11 @@ class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Starcoder2 buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Starcoder2 buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Starcoder2 uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Starcoder2 uses GQA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -481,7 +481,7 @@ class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Starcoder2 flash attention does not support right padding") + self.skipTest(reason="Starcoder2 flash attention does not support right padding") @slow diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index f045acae162..71b852df6ec 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -672,7 +672,7 @@ class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel model = SwitchTransformersModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() model = SwitchTransformersModel(config_and_inputs[0]).to(torch_device) diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index d269693d240..93634ef2a67 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -596,7 +596,7 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch.fx is not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -840,7 +840,7 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, model = T5Model.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() model = T5Model(config_and_inputs[0]).to(torch_device) diff --git 
a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index 82d349e39cc..e64882b6d3f 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -153,7 +153,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index beb5119faa4..99d80b39e92 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -277,8 +277,8 @@ class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin pass @slow + @unittest.skip(reason="TODO Niels: fix me!") def test_model_outputs_equivalence(self): - # TODO Niels: fix me! pass def test_attention_outputs(self): diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 7918cad2b98..d4ca5e82e4c 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -520,11 +520,11 @@ class TapasModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) @require_tensorflow_probability - @unittest.skip("tfp is not defined even if installed. FIXME @Arthur in a followup PR!") + @unittest.skip(reason="tfp is not defined even if installed. FIXME @Arthur in a followup PR!") def test_pt_tf_model_equivalence(self): pass - @unittest.skip("tfp is not defined even if installed. FIXME @Arthur in a followup PR!") + @unittest.skip(reason="tfp is not defined even if installed. 
FIXME @Arthur in a followup PR!") def test_tf_from_pt_safetensors(self): pass diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index b64eec06d00..8fe65438d5c 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -158,13 +158,13 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or does not have a pad token id set") model = model_class(config) @@ -184,7 +184,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -223,7 +223,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_ids = rust_tokenizer.encode(sequence) self.assertListEqual(ids, rust_ids) - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -633,11 +633,11 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): sequences, mask = information["input_ids"], information["token_type_ids"] self.assertEqual(len(sequences), len(mask)) - @unittest.skip("TAPAS tokenizer only handles two sequences.") + @unittest.skip(reason="TAPAS tokenizer only handles two sequences.") def test_maximum_encoding_length_pair_input(self): pass - @unittest.skip("TAPAS tokenizer only handles two sequences.") + @unittest.skip(reason="TAPAS tokenizer only handles two sequences.") def test_maximum_encoding_length_single_input(self): pass @@ -779,7 +779,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -846,7 +846,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__}"): table = self.get_table(tokenizer, length=0) if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8) normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8) @@ -864,7 +864,9 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for key, value in normal_tokens.items(): self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - @unittest.skip("TAPAS cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`") + @unittest.skip( + reason="TAPAS cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`" + ) def test_prepare_for_model(self): pass @@ -948,7 +950,7 
@@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass @@ -1051,13 +1053,13 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no padding token set.") model = model_class(config) @@ -1081,7 +1083,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): model(**encoded_sequence) model(**batch_encoded_sequence) - @unittest.skip("TAPAS doesn't handle pre-tokenized inputs.") + @unittest.skip(reason="TAPAS doesn't handle pre-tokenized inputs.") def test_pretokenized_inputs(self): pass @@ -1268,10 +1270,10 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(column_ids.tolist(), expected_results["column_ids"]) self.assertListEqual(row_ids.tolist(), expected_results["row_ids"]) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 074d5f1a81c..5cd76b91612 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -214,7 +214,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unit config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # Ignore since we have no tokens embeddings + @unittest.skip(reason="Model has no tokens embeddings") def test_resize_tokens_embeddings(self): pass diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 060f49c2e94..3eaed42efb4 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -217,7 +217,7 @@ class TimesformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="Model has no attentions") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/timm_backbone/test_modeling_timm_backbone.py b/tests/models/timm_backbone/test_modeling_timm_backbone.py index d5312747a2c..43c511e1efb 100644 --- a/tests/models/timm_backbone/test_modeling_timm_backbone.py +++ b/tests/models/timm_backbone/test_modeling_timm_backbone.py @@ -138,75 +138,75 @@ class TimmBackboneModelTest(ModelTesterMixin, BackboneTesterMixin, PipelineTeste self.assertEqual(len(timm_model.out_features), len(transformers_model.out_features)) 
self.assertEqual(timm_model.channels, transformers_model.channels) - @unittest.skip("TimmBackbone doesn't support feed forward chunking") + @unittest.skip(reason="TimmBackbone doesn't support feed forward chunking") def test_feed_forward_chunking(self): pass - @unittest.skip("TimmBackbone doesn't have num_hidden_layers attribute") + @unittest.skip(reason="TimmBackbone doesn't have num_hidden_layers attribute") def test_hidden_states_output(self): pass - @unittest.skip("TimmBackbone initialization is managed on the timm side") + @unittest.skip(reason="TimmBackbone initialization is managed on the timm side") def test_initialization(self): pass - @unittest.skip("TimmBackbone models doesn't have inputs_embeds") + @unittest.skip(reason="TimmBackbone models doesn't have inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("TimmBackbone models doesn't have inputs_embeds") + @unittest.skip(reason="TimmBackbone models doesn't have inputs_embeds") def test_model_get_set_embeddings(self): pass - @unittest.skip("TimmBackbone model cannot be created without specifying a backbone checkpoint") + @unittest.skip(reason="TimmBackbone model cannot be created without specifying a backbone checkpoint") def test_from_pretrained_no_checkpoint(self): pass - @unittest.skip("Only checkpoints on timm can be loaded into TimmBackbone") + @unittest.skip(reason="Only checkpoints on timm can be loaded into TimmBackbone") def test_save_load(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass - @unittest.skip("model weights aren't tied in TimmBackbone.") + @unittest.skip(reason="model weights aren't tied in TimmBackbone.") def test_tie_model_weights(self): pass - @unittest.skip("model weights aren't tied in TimmBackbone.") + @unittest.skip(reason="model weights aren't tied in TimmBackbone.") def test_tied_model_weights_key_ignore(self): pass - @unittest.skip("Only checkpoints on timm can be loaded into TimmBackbone") + @unittest.skip(reason="Only checkpoints on timm can be loaded into TimmBackbone") def test_load_save_without_tied_weights(self): pass - @unittest.skip("Only checkpoints on timm can be loaded into TimmBackbone") + @unittest.skip(reason="Only checkpoints on timm can be loaded into TimmBackbone") def test_model_weights_reload_no_missing_tied_weights(self): pass - @unittest.skip("TimmBackbone doesn't have hidden size info in its configuration.") + @unittest.skip(reason="TimmBackbone doesn't have hidden size info in its configuration.") def test_channels(self): pass - @unittest.skip("TimmBackbone doesn't support output_attentions.") + @unittest.skip(reason="TimmBackbone doesn't support output_attentions.") def test_torchscript_output_attentions(self): pass - @unittest.skip("Safetensors is not supported by timm.") + @unittest.skip(reason="Safetensors is not supported by timm.") def test_can_use_safetensors(self): pass - @unittest.skip("Need to use a timm backbone and there is no tiny model available.") + @unittest.skip(reason="Need to use a timm backbone and there is no tiny model available.") def 
test_model_is_small(self): pass diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py index 854ca225674..aa9e0975941 100644 --- a/tests/models/trocr/test_modeling_trocr.py +++ b/tests/models/trocr/test_modeling_trocr.py @@ -170,15 +170,15 @@ class TrOCRStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, P self.model_tester = TrOCRStandaloneDecoderModelTester(self, is_training=False) self.config_tester = ConfigTester(self, config_class=TrOCRConfig) - # not implemented currently + @unittest.skip(reason="Not yet implemented") def test_inputs_embeds(self): pass - # trocr has no base model + @unittest.skip(reason="trocr has no base model") def test_save_load_fast_init_from_base(self): pass - # trocr has no base model + @unittest.skip(reason="trocr has no base model") def test_save_load_fast_init_to_base(self): pass @@ -189,10 +189,10 @@ class TrOCRStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, P config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - # decoder cannot keep gradients + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): return - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) def test_left_padding_compatibility(self): pass diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index 63e7a2fa78c..a3ae498606a 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -320,7 +320,7 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - @unittest.skip("Gradient checkpointing is not supported by this model") + @unittest.skip(reason="Gradient checkpointing is not supported by this model") def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index 2f165d349ea..151695c1c12 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -110,7 +110,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # this tokenizer def test_save_sentencepiece_tokenizer(self) -> None: if not self.test_sentencepiece or not self.test_slow_tokenizer: - return + self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") # We want to verify that we will be able to save the tokenizer even if the original files that were used to # build the tokenizer have been deleted in the meantime. 
words, boxes = self.get_words_and_boxes() @@ -687,7 +687,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -708,7 +708,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -817,7 +817,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -878,7 +878,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -919,7 +919,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1008,7 +1008,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass @@ -1153,11 +1153,11 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Assert there is online added_tokens special_tokens self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template(self): pass - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -1174,13 +1174,13 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} not in MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no padding token set.") model = model_class(config) @@ -1206,11 +1206,11 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between 
slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1228,7 +1228,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1282,7 +1282,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1478,7 +1478,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1515,7 +1515,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1628,7 +1628,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1722,27 +1722,27 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-1], 4) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass - @unittest.skip("UDOP tokenizer requires boxes besides sequences.") + @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): pass - @unittest.skip("UDOP tokenizer requires boxes besides sequences.") + @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_single_input(self): pass - @unittest.skip("UDOP tokenizer requires boxes besides sequences.") + @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("UDOP tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="UDOP tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("UDOP fast tokenizer does 
not support prepare_for_model") + @unittest.skip(reason="UDOP fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -1863,15 +1863,15 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_decode(self): pass diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 15010012aef..2bb841e65e6 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -331,7 +331,7 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch fx is not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -483,7 +483,7 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() model = UMT5Model(config_and_inputs[0]).to(torch_device) diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index a8931617c5e..1804e2c95ef 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -354,21 +354,22 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T self.model_tester.check_labels_out_of_vocab(*config_and_inputs) # UniSpeech has no inputs_embeds + @unittest.skip(reason="UniSpeech has no inputs_embeds") def test_inputs_embeds(self): pass # `input_ids` is renamed to `input_values` + @unittest.skip(reason="UniSpeech has no inputs_embeds") def test_forward_signature(self): pass # UniSpeech cannot resize token embeddings # since it has no tokens embeddings + @unittest.skip(reason="UniSpeech has no tokens embeds") def test_resize_tokens_embeddings(self): pass - # UniSpeech has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="UniSpeech has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index 9ed41944660..f3d467f0795 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -403,22 +403,19 @@ class 
UniSpeechSatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # UniSpeechSat has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # UniSpeechSat cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # UniSpeechSat has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass @@ -615,22 +612,19 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # UniSpeechSat has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # UniSpeechSat cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # UniSpeechSat has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index dfcd296edbd..801990331fe 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -245,7 +245,7 @@ class VideoMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def test_attention_outputs(self): if not self.has_attentions: - pass + self.skipTest(reason="Model does not have attentions") else: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 4b6cb0bb596..b7c2c604522 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -272,7 +272,7 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False.") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -296,7 +296,7 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False.") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -547,11 +547,11 @@ class ViltForImagesAndTextClassificationModelTest(ViltModelTest, unittest.TestCa self.model_tester = ViltModelTester(self, modality_type_vocab_size=3, add_multiple_images=True, num_images=2) self.config_tester = 
ConfigTester(self, config_class=ViltConfig, hidden_size=37) - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_model(self): pass - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_for_token_classification(self): pass diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 963860725b7..e5bc88d5bfb 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -642,7 +642,7 @@ class Swin2BartModelTest(EncoderDecoderMixin, unittest.TestCase): (decoder_config.num_attention_heads, cross_attention_input_seq_len, encoder_seq_len), ) - # there are no published pretrained BART-causal checkpoints for now + @unittest.skip(reason="There are no published pretrained BART-causal checkpoints for now") def test_real_model_save_load_from_pretrained(self): pass @@ -677,7 +677,7 @@ class ViT2TrOCR(EncoderDecoderMixin, unittest.TestCase): "labels": decoder_input_ids, } - # there are no published pretrained TrOCR checkpoints for now + @unittest.skip(reason="There are no published pretrained TrOCR checkpoints for now") def test_real_model_save_load_from_pretrained(self): pass @@ -799,7 +799,7 @@ class LayoutLMv32TrOCR(EncoderDecoderMixin, unittest.TestCase): ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) - @unittest.skip("There are no published pretrained TrOCR checkpoints for now") + @unittest.skip(reason="There are no published pretrained TrOCR checkpoints for now") def test_real_model_save_load_from_pretrained(self): pass diff --git a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py index 88e06178e5d..e30d0ff01bf 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py @@ -429,7 +429,7 @@ class DeiTRobertaModelTest(VisionTextDualEncoderMixin, unittest.TestCase): "text_choice_labels": choice_labels, } - # skip as DeiT is not available in Flax + @unittest.skip(reason="DeiT is not available in Flax") def test_pt_flax_equivalence(self): pass diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py index a994f50ee32..a9690eee23b 100644 --- a/tests/models/vitdet/test_modeling_vitdet.py +++ b/tests/models/vitdet/test_modeling_vitdet.py @@ -179,21 +179,21 @@ class VitDetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): super().test_initialization() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_cpu_offload(self): super().test_cpu_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_disk_offload_bin(self): super().test_disk_offload() - @unittest.skip("Does not work 
on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_disk_offload_safetensors(self): super().test_disk_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index d43115b9c84..290f31c3641 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -181,7 +181,7 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_pipeline_feature_extraction(self): super().test_pipeline_feature_extraction() - @unittest.skip("Need to fix this after #26538") + @unittest.skip(reason="Need to fix this after #26538") def test_model_forward(self): set_seed(12345) global_rng.seed(12345) @@ -212,11 +212,11 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): with torch.no_grad(): _ = model(**self._prepare_for_class(inputs_dict, model_class)).waveform - @unittest.skip("VITS is not deterministic") + @unittest.skip(reason="VITS is not deterministic") def test_determinism(self): pass - @unittest.skip("VITS is not deterministic") + @unittest.skip(reason="VITS is not deterministic") def test_batching_equivalence(self): pass @@ -260,11 +260,11 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @unittest.skip("VITS has no inputs_embeds") + @unittest.skip(reason="VITS has no inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("VITS has no input embeddings") + @unittest.skip(reason="VITS has no input embeddings") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py index a71c5a05c49..f4a9c8a7438 100644 --- a/tests/models/vits/test_tokenization_vits.py +++ b/tests/models/vits/test_tokenization_vits.py @@ -62,15 +62,15 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): ids = tokenizer.encode(txt, add_special_tokens=False) return txt, ids - @unittest.skip("Adding multicharacter tokens does not work with the VITS tokenizer") + @unittest.skip(reason="Adding multicharacter tokens does not work with the VITS tokenizer") def test_add_tokens_tokenizer(self): pass - @unittest.skip("Adding multicharacter tokens does not work with the VITS tokenizer") + @unittest.skip(reason="Adding multicharacter tokens does not work with the VITS tokenizer") def test_encode_decode_with_spaces(self): pass - @unittest.skip("The VITS tokenizer does not support `is_split_into_words`") + @unittest.skip(reason="The VITS tokenizer does not support `is_split_into_words`") def test_pretokenized_inputs(self): pass @@ -101,7 +101,7 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): shutil.rmtree(tmpdirname) - @unittest.skip("Adding multicharacter tokens does not work the VITS tokenizer") + @unittest.skip(reason="Adding multicharacter tokens does not work with the VITS tokenizer") def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): pass diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py
b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 9db852f014e..51d105a5ee3 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -553,32 +553,29 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no token embeddings") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no inputs_embeds") def test_model_get_set_embeddings(self): pass @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax + @unittest.skip(reason="Non-robust architecture does not exist in Flax") def test_equivalence_flax_to_pt(self): pass @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax + @unittest.skip(reason="Non-robust architecture does not exist in Flax") def test_equivalence_pt_to_flax(self): pass @@ -729,10 +726,10 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase # Wav2Vec2 cannot be torchscripted because of group norm. def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): # TODO: fix it - self.skipTest("torch 2.1 breaks torch fx tests for wav2vec2/hubert.") + self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.") if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch fx not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -907,22 +904,19 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no token embeddings") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py index 6c98e0e0c8a..88920638277 100644 --- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py +++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py @@ -762,11 +762,11 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(tokens[0], tokenizer.eos_token_id) self.assertEqual(tokens[-3], tokenizer.pad_token_id) - @unittest.skip("The tokenizer shouldn't be used to
encode input IDs (except for labels), only to decode.") + @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") def test_tf_encode_plus_sent_to_model(self): pass - @unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") + @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") def test_torch_encode_plus_sent_to_model(self): pass diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index f119cd3dc2a..ae13a8ecba9 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -512,32 +512,29 @@ class Wav2Vec2ConformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2Conformer has no inputs_embeds + @unittest.skip(reason="Wav2Vec2Conformer has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Wav2Vec2Conformer has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2Conformer cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Wav2Vec2Conformer has no token embeddings") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2Conformer has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Wav2Vec2Conformer has no inputs_embeds") def test_model_get_set_embeddings(self): pass @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax + @unittest.skip(reason="Non-robust architecture does not exist in Flax") def test_equivalence_flax_to_pt(self): pass @is_pt_flax_cross_test - # non-robust architecture does not exist in Flax + @unittest.skip(reason="Non-robust architecture does not exist in Flax") def test_equivalence_pt_to_flax(self): pass diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py index d3207cee1db..4aab34207a6 100644 --- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py +++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py @@ -325,19 +325,21 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): outputs_char = [tokenizer.decode(ids, output_char_offsets=True) for ids in sample_ids] check_list_tuples_equal(outputs_char_batch, outputs_char) - @unittest.skip("Wav2Vec2PhonemeTokenizer always lower cases letters to correctly map to phonemes") + @unittest.skip(reason="Wav2Vec2PhonemeTokenizer always lower cases letters to correctly map to phonemes") def test_added_tokens_do_lower_case(self): pass - @unittest.skip("Wav2Vec2PhonemeTokenizer always puts spaces between phonemes") + @unittest.skip(reason="Wav2Vec2PhonemeTokenizer always puts spaces between phonemes") def test_encode_decode_with_spaces(self): pass - @unittest.skip("encodes to text to ids, but decodes ids to phonemes -> not possible to have internal consistency") + @unittest.skip( + reason="encodes text to ids, but decodes ids to phonemes -> not possible to have internal consistency" + ) def test_internal_consistency(self): pass -
@unittest.skip("Wav2Vec2PhonemeModel has no max model length => no testing") + @unittest.skip(reason="Wav2Vec2PhonemeModel has no max model length => no testing") def test_add_tokens_tokenizer(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: @@ -389,11 +391,11 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(tokens[0], tokenizer.eos_token_id) self.assertEqual(tokens[-3], tokenizer.pad_token_id) - @unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") + @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") def test_tf_encode_plus_sent_to_model(self): pass - @unittest.skip("The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") + @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") def test_torch_encode_plus_sent_to_model(self): pass diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 1ec5476e879..8f4d1e850e0 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -371,22 +371,19 @@ class WavLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # WavLM has no inputs_embeds + @unittest.skip(reason="WavLM has no inputs_embeds") def test_inputs_embeds(self): pass # `input_ids` is renamed to `input_values` + @unittest.skip(reason="WavLM has no input_ids") def test_forward_signature(self): pass - # WavLM cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="WavLM has no token embeddings") def test_resize_tokens_embeddings(self): pass - # WavLM has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented def test_model_get_set_embeddings(self): pass diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b749da805a5..86a89af8c13 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -504,9 +504,11 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi model(**inputs)[0] # training is not supported yet + @unittest.skip(reason="Training is not supported yet") def test_training(self): pass + @unittest.skip(reason="Training is not supported yet") def test_training_gradient_checkpointing(self): pass @@ -522,6 +524,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip def test_generate_with_head_masking(self): pass @@ -736,7 +739,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -784,13 +787,13 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings 
is False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untie embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -827,6 +830,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + @unittest.skip def test_generate_without_input_ids(self): pass @@ -901,7 +905,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support Flash Attention 2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -947,7 +951,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support flash_attention_2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -996,7 +1000,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -1096,8 +1100,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") # Output all for aggressive testing config.output_hidden_states = True @@ -1169,8 +1172,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") # Output all for aggressive testing config.output_hidden_states = True @@ -3133,8 +3135,9 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest. def test_model_parallelism(self): pass - # input embeds is meaningless for an encoder-only acoustic model + @unittest.skip(reason="Not applicable for an encoder-only acoustic model") def test_inputs_embeds(self): + # input embeds is meaningless for an encoder-only acoustic model pass # the equivalent test is passing the encoder outputs directly to the model @@ -3181,6 +3184,7 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest. self.assertTrue(x is None or isinstance(x, torch.nn.Conv1d)) # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeds") def test_resize_tokens_embeddings(self): pass @@ -3194,8 +3198,7 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest. 
fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="Flax model does not exist") # Output all for aggressive testing config.output_hidden_states = True @@ -3267,8 +3270,7 @@ class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest. fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="Flax model does not exist") # Output all for aggressive testing config.output_hidden_states = True @@ -3562,17 +3564,16 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, config=config, input_ids=inputs_dict["input_ids"] ) - @unittest.skip("Generate needs input ids") + @unittest.skip(reason="Generate needs input ids") def test_generate_without_input_ids(self): # generate only works with input ids for whisper pass - @unittest.skip("Decoder can't keep attention grads") + @unittest.skip(reason="Decoder can't keep attention grads") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return - @unittest.skip("The model doesn't support fast init from base") + @unittest.skip(reason="The model doesn't support fast init from base") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py index bb22a36f084..41598cf27ba 100644 --- a/tests/models/whisper/test_tokenization_whisper.py +++ b/tests/models/whisper/test_tokenization_whisper.py @@ -89,12 +89,15 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase): ["I", "Ġwas", "Ġborn", "Ġin", "Ġ9", "2000", ",", "Ġand", "Ġthis", "Ġis", "Ġfals", "é", "."], # fmt: skip ) + @unittest.skip def test_tokenizer_slow_store_full_signature(self): pass + @unittest.skip def test_tokenizer_fast_store_full_signature(self): pass + @unittest.skip def test_special_tokens_initialization(self): # Whisper relies on specific additional special tokens, so we skip this # general test.
In particular, this test loads fast tokenizer from slow diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index c678d304ffa..70e7bb341c7 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -186,9 +186,11 @@ class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -420,9 +422,11 @@ class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -596,7 +600,7 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index 5669da7e263..a9db8db6e0a 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -353,7 +353,7 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin model = XGLMModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py index 02c58681d10..eac3eda05da 100644 --- a/tests/models/xglm/test_tokenization_xglm.py +++ b/tests/models/xglm/test_tokenization_xglm.py @@ -150,7 +150,7 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py index 8c3674460da..ae32a62a939 100644 --- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py @@ -143,7 +143,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -224,7 +224,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") 
tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py index e2c0f6d7e70..0d785d4a1fc 100644 --- a/tests/models/xlnet/test_modeling_xlnet.py +++ b/tests/models/xlnet/test_modeling_xlnet.py @@ -612,8 +612,8 @@ class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) + @unittest.skip(reason="xlnet cannot keep gradients in attentions or hidden states") def test_retain_grad_hidden_states_attentions(self): - # xlnet cannot keep gradients in attentions or hidden states return # overwrite from test_modeling_common diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index e28ac437f10..5c929aeb409 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -206,8 +206,8 @@ class YolosModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_config(self): self.config_tester.run_common_tests() + @unittest.skip(reason="YOLOS does not use inputs_embeds") def test_inputs_embeds(self): - # YOLOS does not use inputs_embeds pass def test_model_get_set_embeddings(self): diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index f7fa4ea31bb..4cfb7e22a5d 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -353,6 +353,7 @@ class YosoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): model = YosoModel.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip(reason="This model does not output attentions") def test_attention_outputs(self): return diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index 57040a468be..cdedf94be18 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -132,6 +132,6 @@ class AudioClassificationPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Audio classification is not implemented for TF") + @unittest.skip(reason="Audio classification is not implemented for TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 73376ff2189..35c36aa0e66 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -70,9 +70,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small - self.skipTest("No tokenizer available") - return - # return None, None + self.skipTest(reason="No tokenizer available") speech_recognizer = AutomaticSpeechRecognitionPipeline( model=model, tokenizer=tokenizer, feature_extractor=processor @@ -271,7 +269,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): @require_tf def test_small_model_tf(self): - self.skipTest("Tensorflow not supported yet.") + self.skipTest(reason="Tensorflow not supported yet.") @require_torch def test_torch_small_no_tokenizer_files(self): @@ -1855,7 +1853,7 @@ def 
require_ffmpeg(test_case): subprocess.check_output(["ffmpeg", "-h"], stderr=subprocess.DEVNULL) return test_case except Exception: - return unittest.skip("test requires ffmpeg")(test_case) + return unittest.skip(reason="test requires ffmpeg")(test_case) def bytes_iter(chunk_size, chunks): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 2ef3e6e5ff6..d4dbff21855 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -651,7 +651,7 @@ class PipelineUtilsTest(unittest.TestCase): if len(relevant_auto_classes) == 0: # task has no default logger.debug(f"{task} in {framework} has no default") - return + self.skipTest(f"{task} in {framework} has no default") # by default use first class auto_model_cls = relevant_auto_classes[0] diff --git a/tests/pipelines/test_pipelines_conversational.py b/tests/pipelines/test_pipelines_conversational.py new file mode 100644 index 00000000000..5b6eb514b1a --- /dev/null +++ b/tests/pipelines/test_pipelines_conversational.py @@ -0,0 +1,439 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +from transformers import ( + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallTokenizer, + Conversation, + ConversationalPipeline, + TFAutoModelForCausalLM, + pipeline, +) +from transformers.testing_utils import ( + backend_empty_cache, + is_pipeline_test, + is_torch_available, + require_tf, + require_torch, + slow, + torch_device, +) + +from .test_pipelines_common import ANY + + +@is_pipeline_test +class ConversationalPipelineTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + gc.collect() + if is_torch_available(): + backend_empty_cache(torch_device) + + model_mapping = dict( + list(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) + if MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING + else [] + list(MODEL_FOR_CAUSAL_LM_MAPPING.items()) + if MODEL_FOR_CAUSAL_LM_MAPPING + else [] + ) + tf_model_mapping = dict( + list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) + if TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING + else [] + list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.items()) + if TF_MODEL_FOR_CAUSAL_LM_MAPPING + else [] + ) + + def get_test_pipeline(self, model, tokenizer, processor): + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + return conversation_agent, [Conversation("Hi there!")] + + def run_pipeline_test(self, conversation_agent, _): + # Simple + outputs = conversation_agent(Conversation("Hi there!"), max_new_tokens=5) + self.assertEqual( + outputs, + Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), + ) + + # 
Single list + outputs = conversation_agent([Conversation("Hi there!")], max_new_tokens=5) + self.assertEqual( + outputs, + Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), + ) + + # Batch + conversation_1 = Conversation("Going to the movies tonight - any suggestions?") + conversation_2 = Conversation("What's the last book you have read?") + self.assertEqual(len(conversation_1), 1) + self.assertEqual(len(conversation_2), 1) + + outputs = conversation_agent([conversation_1, conversation_2], max_new_tokens=5) + self.assertEqual(outputs, [conversation_1, conversation_2]) + self.assertEqual( + outputs, + [ + Conversation( + [ + {"role": "user", "content": "Going to the movies tonight - any suggestions?"}, + {"role": "assistant", "content": ANY(str)}, + ], + ), + Conversation( + [ + {"role": "user", "content": "What's the last book you have read?"}, + {"role": "assistant", "content": ANY(str)}, + ] + ), + ], + ) + + # One conversation with history + conversation_2.add_message({"role": "user", "content": "Why do you recommend it?"}) + outputs = conversation_agent(conversation_2, max_new_tokens=5) + self.assertEqual(outputs, conversation_2) + self.assertEqual( + outputs, + Conversation( + [ + {"role": "user", "content": "What's the last book you have read?"}, + {"role": "assistant", "content": ANY(str)}, + {"role": "user", "content": "Why do you recommend it?"}, + {"role": "assistant", "content": ANY(str)}, + ] + ), + ) + + @require_torch + @slow + def test_integration_torch_conversation(self): + # When + conversation_agent = pipeline(task="conversational", device=torch_device) + conversation_1 = Conversation("Going to the movies tonight - any suggestions?") + conversation_2 = Conversation("What's the last book you have read?") + # Then + self.assertEqual(len(conversation_1.past_user_inputs), 0) + self.assertEqual(len(conversation_2.past_user_inputs), 0) + # When + result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) + # Then + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual(len(result[0].past_user_inputs), 1) + self.assertEqual(len(result[1].past_user_inputs), 1) + self.assertEqual(len(result[0].generated_responses), 1) + self.assertEqual(len(result[1].generated_responses), 1) + self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?") + self.assertEqual(result[0].generated_responses[0], "The Big Lebowski") + self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?") + self.assertEqual(result[1].generated_responses[0], "The Last Question") + # When + conversation_2.add_user_input("Why do you recommend it?") + result = conversation_agent(conversation_2, do_sample=False, max_length=1000) + # Then + self.assertEqual(result, conversation_2) + self.assertEqual(len(result.past_user_inputs), 2) + self.assertEqual(len(result.generated_responses), 2) + self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?") + self.assertEqual(result.generated_responses[1], "It's a good book.") + + @require_torch + @slow + def test_integration_torch_conversation_truncated_history(self): + # When + conversation_agent = pipeline(task="conversational", min_length_for_response=24, device=torch_device) + conversation_1 = Conversation("Going to the movies tonight - any suggestions?") + # Then + self.assertEqual(len(conversation_1.past_user_inputs), 0) + # When + result = conversation_agent(conversation_1, do_sample=False, 
max_length=36) + # Then + self.assertEqual(result, conversation_1) + self.assertEqual(len(result.past_user_inputs), 1) + self.assertEqual(len(result.generated_responses), 1) + self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?") + self.assertEqual(result.generated_responses[0], "The Big Lebowski") + # When + conversation_1.add_user_input("Is it an action movie?") + result = conversation_agent(conversation_1, do_sample=False, max_length=36) + # Then + self.assertEqual(result, conversation_1) + self.assertEqual(len(result.past_user_inputs), 2) + self.assertEqual(len(result.generated_responses), 2) + self.assertEqual(result.past_user_inputs[1], "Is it an action movie?") + self.assertEqual(result.generated_responses[1], "It's a comedy.") + + @require_torch + def test_small_model_pt(self): + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + conversation = Conversation("hello") + output = conversation_agent(conversation) + self.assertEqual(output, Conversation(past_user_inputs=["hello"], generated_responses=["Hi"])) + + @require_tf + def test_small_model_tf(self): + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + conversation = Conversation("hello") + output = conversation_agent(conversation) + self.assertEqual(output, Conversation(past_user_inputs=["hello"], generated_responses=["Hi"])) + + @require_torch + @slow + def test_integration_torch_conversation_dialogpt_input_ids(self): + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + conversation_1 = Conversation("hello") + inputs = conversation_agent.preprocess(conversation_1) + self.assertEqual(inputs["input_ids"].tolist(), [[31373, 50256]]) + + conversation_2 = Conversation("how are you ?", past_user_inputs=["hello"], generated_responses=["Hi there!"]) + inputs = conversation_agent.preprocess(conversation_2) + self.assertEqual( + inputs["input_ids"].tolist(), [[31373, 50256, 17250, 612, 0, 50256, 4919, 389, 345, 5633, 50256]] + ) + + @unittest.skip(reason="Model is curently gated") + @require_torch + @slow + def test_integration_torch_conversation_llama2_input_ids(self): + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_default_system_prompt=True) + + conversation = Conversation( + "What is so great about #1?", + past_user_inputs=["I am going to Paris, what should I see?"], + generated_responses=[ + """\ +Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris: + +1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. +2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. +3. 
Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows. + +These are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world.""" + ], + ) + inputs = tokenizer._build_conversation_input_ids(conversation) + EXPECTED_INPUTS_IDS = [ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 29902, 626, 2675, 304, 3681, 29892, 825, 881, 306, 1074, 29973, 518, 29914, 25580, 29962, 3681, 29892, 278, 7483, 310, 3444, 29892, 338, 2998, 363, 967, 380, 27389, 11258, 29892, 1616, 19133, 29879, 29892, 15839, 2982, 22848, 29892, 322, 6017, 7716, 25005, 29889, 2266, 526, 777, 310, 278, 2246, 19650, 1953, 304, 1074, 297, 3681, 29901, 13, 13, 29896, 29889, 450, 382, 2593, 295, 23615, 29901, 450, 9849, 293, 382, 2593, 295, 23615, 338, 697, 310, 278, 1556, 5936, 13902, 2982, 22848, 297, 278, 3186, 322, 16688, 2078, 271, 400, 5086, 8386, 310, 278, 4272, 29889, 13, 29906, 29889, 450, 4562, 12675, 6838, 29901, 450, 4562, 12675, 338, 697, 310, 278, 3186, 29915, 29879, 10150, 322, 1556, 13834, 19133, 29879, 29892, 27261, 385, 21210, 573, 4333, 310, 1616, 322, 24238, 29879, 29892, 3704, 278, 2598, 29874, 29420, 29889, 13, 29941, 29889, 24337, 29899, 29928, 420, 315, 21471, 29901, 910, 9560, 274, 21471, 338, 697, 310, 278, 1556, 13834, 2982, 22848, 297, 3681, 322, 338, 2998, 363, 967, 22883, 293, 11258, 322, 380, 27389, 380, 7114, 12917, 5417, 29889, 13, 13, 1349, 968, 526, 925, 263, 2846, 310, 278, 1784, 19650, 1953, 393, 3681, 756, 304, 5957, 29889, 2973, 577, 1568, 304, 1074, 322, 437, 29892, 372, 29915, 29879, 694, 4997, 393, 3681, 338, 697, 310, 278, 1556, 5972, 6282, 391, 15422, 800, 297, 278, 3186, 29889, 29871, 2, 1, 518, 25580, 29962, 1724, 338, 577, 2107, 1048, 396, 29896, 29973, 518, 29914, 25580, 29962] # fmt: skip + self.assertEqual(inputs, EXPECTED_INPUTS_IDS) + + model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + EXPECTED_TEXT = "what topic you want to focus on and create content around it. This will help you stand out from other creators and attract a specific audience.\n\nStep 2: Set Up Your Channel\nCreate your YouTube account and customize your channel with your branding and logo. Make sure your channel name and profile picture are consistent with your niche.\n\nStep 3: Plan Your Content\nDevelop a content strategy that includes the type of content you want to create, how often you will post, and when you will post. 
Consider creating a content calendar to help you stay organized.\n\nStep 4: Invest in Quality Equipment\nInvest in good quality camera and microphone equipment to ensure your videos look and sound professional. You don't need to break the bank, but investing in good equipment will make a big difference in the quality of your videos.\n\nStep 5: Optimize Your Videos for Search\nUse keywords in your video titles, descriptions, and tags to help people find your videos when they search for topics related to your niche" + conversation = Conversation( + "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 steps?" + ) + result = conversation_agent(conversation) + self.assertEqual(result.generated_responses[-1], EXPECTED_TEXT) + + @require_torch + @slow + def test_integration_torch_conversation_blenderbot_400M_input_ids(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + # test1 + conversation_1 = Conversation("hello") + inputs = conversation_agent.preprocess(conversation_1) + self.assertEqual(inputs["input_ids"].tolist(), [[1710, 86, 2]]) + + # test2 + conversation_1 = Conversation( + "I like lasagne.", + past_user_inputs=["hello"], + generated_responses=[ + " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie." + ], + ) + inputs = conversation_agent.preprocess(conversation_1) + self.assertEqual( + inputs["input_ids"].tolist(), + [ + # This should be compared with the same conversation on ParlAI `safe_interactive` demo. + [ + 1710, # hello + 86, + 228, # Double space + 228, + 946, + 304, + 398, + 6881, + 558, + 964, + 38, + 452, + 315, + 265, + 6252, + 452, + 322, + 968, + 6884, + 3146, + 278, + 306, + 265, + 617, + 87, + 388, + 75, + 341, + 286, + 521, + 21, + 228, # Double space + 228, + 281, # I like lasagne. + 398, + 6881, + 558, + 964, + 21, + 2, # EOS + ], + ], + ) + + @require_torch + @slow + def test_integration_torch_conversation_blenderbot_400M(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + conversation_1 = Conversation("hello") + result = conversation_agent( + conversation_1, + ) + self.assertEqual( + result.generated_responses[0], + # ParlAI implementation output, we have a different one, but it's our + # second best, you can check by using num_return_sequences=10 + # " Hello! How are you? I'm just getting ready to go to work, how about you?", + " Hello! How are you doing today? I just got back from a walk with my dog.", + ) + + conversation_1 = Conversation("Lasagne hello") + result = conversation_agent(conversation_1, encoder_no_repeat_ngram_size=3) + self.assertEqual( + result.generated_responses[0], + " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie.", + ) + + conversation_1 = Conversation( + "Lasagne hello Lasagne is my favorite Italian dish. Do you like lasagne? I like lasagne." + ) + result = conversation_agent( + conversation_1, + encoder_no_repeat_ngram_size=3, + ) + self.assertEqual( + result.generated_responses[0], + " Me too. 
I like how it can be topped with vegetables, meats, and condiments.", + ) + + @require_torch + @slow + def test_integration_torch_conversation_encoder_decoder(self): + # When + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer, device=torch_device) + + conversation_1 = Conversation("My name is Sarah and I live in London") + conversation_2 = Conversation("Going to the movies tonight, What movie would you recommend? ") + # Then + self.assertEqual(len(conversation_1.past_user_inputs), 0) + self.assertEqual(len(conversation_2.past_user_inputs), 0) + # When + result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) + # Then + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual(len(result[0].past_user_inputs), 1) + self.assertEqual(len(result[1].past_user_inputs), 1) + self.assertEqual(len(result[0].generated_responses), 1) + self.assertEqual(len(result[1].generated_responses), 1) + self.assertEqual(result[0].past_user_inputs[0], "My name is Sarah and I live in London") + self.assertEqual( + result[0].generated_responses[0], + "hi sarah, i live in london as well. do you have any plans for the weekend?", + ) + self.assertEqual( + result[1].past_user_inputs[0], "Going to the movies tonight, What movie would you recommend? " + ) + self.assertEqual( + result[1].generated_responses[0], "i don't know... i'm not really sure. what movie are you going to see?" + ) + # When + conversation_1.add_user_input("Not yet, what about you?") + conversation_2.add_user_input("What's your name?") + result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) + # Then + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual(len(result[0].past_user_inputs), 2) + self.assertEqual(len(result[1].past_user_inputs), 2) + self.assertEqual(len(result[0].generated_responses), 2) + self.assertEqual(len(result[1].generated_responses), 2) + self.assertEqual(result[0].past_user_inputs[1], "Not yet, what about you?") + self.assertEqual(result[0].generated_responses[1], "i don't have any plans yet. i'm not sure what to do yet.") + self.assertEqual(result[1].past_user_inputs[1], "What's your name?") + self.assertEqual(result[1].generated_responses[1], "i don't have a name, but i'm going to see a horror movie.") + + @require_torch + @slow + def test_from_pipeline_conversation(self): + model_id = "facebook/blenderbot_small-90M" + + # from model id + conversation_agent_from_model_id = pipeline("conversational", model=model_id, tokenizer=model_id) + + # from model object + model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_id) + tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_id) + conversation_agent_from_model = pipeline("conversational", model=model, tokenizer=tokenizer) + + conversation = Conversation("My name is Sarah and I live in London") + conversation_copy = Conversation("My name is Sarah and I live in London") + + result_model_id = conversation_agent_from_model_id([conversation]) + result_model = conversation_agent_from_model([conversation_copy]) + + # check for equality + self.assertEqual( + result_model_id.generated_responses[0], + "hi sarah, i live in london as well. 
do you have any plans for the weekend?", + ) + self.assertEqual( + result_model_id.generated_responses[0], + result_model.generated_responses[0], + ) diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index abc58ca710b..259ab5ef4c3 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -95,7 +95,7 @@ class DepthEstimationPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Depth estimation is not implemented in TF") + @unittest.skip(reason="Depth estimation is not implemented in TF") def test_small_model_tf(self): pass @@ -115,4 +115,4 @@ class DepthEstimationPipelineTests(unittest.TestCase): @require_torch def test_small_model_pt(self): # This is highly irregular to have no small tests. - self.skipTest("There is not hf-internal-testing tiny model for either GLPN nor DPT") + self.skipTest(reason="There is not hf-internal-testing tiny model for either GLPN nor DPT") diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py index 81febbc8c17..d1fd87e18e3 100644 --- a/tests/pipelines/test_pipelines_document_question_answering.py +++ b/tests/pipelines/test_pipelines_document_question_answering.py @@ -366,6 +366,6 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): self.assertEqual(nested_simplify(outputs, decimals=4), [{"answer": "us-001"}]) @require_tf - @unittest.skip("Document question answering not implemented in TF") + @unittest.skip(reason="Document question answering not implemented in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index 87c5a151175..ff5f8314b65 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -176,15 +176,15 @@ class FeatureExtractionPipelineTests(unittest.TestCase): def get_test_pipeline(self, model, tokenizer, processor): if tokenizer is None: - self.skipTest("No tokenizer") - return + self.skipTest(reason="No tokenizer") elif ( type(model.config) in FEATURE_EXTRACTOR_MAPPING or isinstance(model.config, LxmertConfig) or type(model.config) in IMAGE_PROCESSOR_MAPPING ): - self.skipTest("This is a bimodal model, we need to find a more consistent way to switch on those models.") - return + self.skipTest( + reason="This is a bimodal model, we need to find a more consistent way to switch on those models." + ) elif model.config.is_encoder_decoder: self.skipTest( """encoder_decoder models are trickier for this pipeline. @@ -193,8 +193,6 @@ class FeatureExtractionPipelineTests(unittest.TestCase): For now ignore those. 
""" ) - - return feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor) return feature_extractor, ["This is a test", "This is another test"] diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py index bbf2b6cf3f4..93dacbd15bf 100644 --- a/tests/pipelines/test_pipelines_fill_mask.py +++ b/tests/pipelines/test_pipelines_fill_mask.py @@ -253,7 +253,7 @@ class FillMaskPipelineTests(unittest.TestCase): def get_test_pipeline(self, model, tokenizer, processor): if tokenizer is None or tokenizer.mask_token_id is None: - self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)") + self.skipTest(reason="The provided tokenizer has no mask token, (probably reformer or wav2vec2)") fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) examples = [ diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.py b/tests/pipelines/test_pipelines_image_feature_extraction.py index 1519c7a9780..53af000d6de 100644 --- a/tests/pipelines/test_pipelines_image_feature_extraction.py +++ b/tests/pipelines/test_pipelines_image_feature_extraction.py @@ -159,10 +159,12 @@ class ImageFeatureExtractionPipelineTests(unittest.TestCase): def get_test_pipeline(self, model, tokenizer, processor): if processor is None: - self.skipTest("No image processor") + self.skipTest(reason="No image processor") elif type(model.config) in TOKENIZER_MAPPING: - self.skipTest("This is a bimodal model, we need to find a more consistent way to switch on those models.") + self.skipTest( + reason="This is a bimodal model, we need to find a more consistent way to switch on those models." + ) elif model.config.is_encoder_decoder: self.skipTest( diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 6546df2a1b9..8f2ae47f14b 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -169,7 +169,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Image segmentation not implemented in TF") + @unittest.skip(reason="Image segmentation not implemented in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index c9a44a53548..643ee84e683 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -74,12 +74,12 @@ class MaskGenerationPipelineTests(unittest.TestCase): "./tests/fixtures/tests_samples/COCO/000000039769.png", ] - # TODO: Implement me @Arthur + @unittest.skip(reason="TODO @Arthur: Implement me") def run_pipeline_test(self, mask_generator, examples): pass @require_tf - @unittest.skip("Image segmentation not implemented in TF") + @unittest.skip(reason="Image segmentation not implemented in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index ec4984b76f9..76a6ab807cd 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -103,7 +103,7 @@ class ObjectDetectionPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Object detection not implemented in TF") + @unittest.skip(reason="Object detection not implemented in TF") def test_small_model_tf(self): pass diff --git 
a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 4c91fd46cd9..00ddd77f826 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -404,7 +404,7 @@ class TextGenerationPipelineTests(unittest.TestCase): # TF generation does not support max_new_tokens, and it's impossible # to control long generation with only max_length without # fancy calculation, dismissing tests for now. - return + self.skipTest(reason="TF generation does not support max_new_tokens") # We don't care about infinite range models. # They already work. # Skip this test for XGLM, since it uses sinusoidal positional embeddings which are resized on-the-fly. diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index d23916bad84..392d3b31b4c 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -94,5 +94,6 @@ class VideoClassificationPipelineTests(unittest.TestCase): ) @require_tf + @unittest.skip def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py index 776046e160c..e056adee233 100644 --- a/tests/pipelines/test_pipelines_visual_question_answering.py +++ b/tests/pipelines/test_pipelines_visual_question_answering.py @@ -236,6 +236,6 @@ class VisualQuestionAnsweringPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Visual question answering not implemented in TF") + @unittest.skip(reason="Visual question answering not implemented in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 09b2f56f980..60562fe7aa1 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -40,7 +40,7 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase): [{"score": 0.501, "label": "Sound of a dog"}, {"score": 0.499, "label": "Sound of vaccum cleaner"}], ) - @unittest.skip("No models are available in TF") + @unittest.skip(reason="No models are available in TF") def test_small_model_tf(self): pass @@ -89,6 +89,6 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase): * 5, ) - @unittest.skip("No models are available in TF") + @unittest.skip(reason="No models are available in TF") def test_large_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index 7adae8ee962..5c3208866ee 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -49,7 +49,7 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase): # if tokenizer is None: # # Side effect of no Fast Tokenizer class for these model, so skipping # # But the slow tokenizer test should still run as they're quite small - # self.skipTest("No tokenizer available") + # self.skipTest(reason="No tokenizer available") # return # # return None, None diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index c8b424483fa..065e5c211e6 100644 --- 
a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -74,7 +74,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Zero Shot Object Detection not implemented in TF") + @unittest.skip(reason="Zero Shot Object Detection not implemented in TF") def test_small_model_tf(self): pass @@ -185,7 +185,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): ) @require_tf - @unittest.skip("Zero Shot Object Detection not implemented in TF") + @unittest.skip(reason="Zero Shot Object Detection not implemented in TF") def test_large_model_tf(self): pass diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index aaab7226c87..58678ade57f 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -450,7 +450,7 @@ class AwqFusedTest(unittest.TestCase): outputs = model.generate(**inputs, max_new_tokens=12) self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL) - @unittest.skip("Not enough GPU memory on CI runners") + @unittest.skip(reason="Not enough GPU memory on CI runners") @require_torch_multi_gpu def test_generation_mixtral_fused(self): """ diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index ac17979d175..71a2d7c8157 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -496,7 +496,7 @@ class Bnb4BitTestTraining(Base4bitTest): def test_training(self): if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"): - return + self.skipTest(reason="This test requires bitsandbytes >= 0.37.0") # Step 1: freeze all parameters model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 8043a1201b7..ca3f043c749 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -826,7 +826,7 @@ class MixedInt8TestTraining(BaseMixedInt8Test): def test_training(self): if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"): - return + self.skipTest(reason="This test requires bitsandbytes>=0.37.0") # Step 1: freeze all parameters model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True) diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py index fef5c30018f..d8f4fffb8d2 100644 --- a/tests/quantization/quanto_integration/test_quanto.py +++ b/tests/quantization/quanto_integration/test_quanto.py @@ -332,20 +332,23 @@ class QuantoQuantizationOffloadTest(QuantoQuantizationTest): "lm_head": 0, } - # the execution device is a gpu + @unittest.skip(reason="The execution device is a gpu") def test_generate_quality_cpu(self): pass - # we can't save offloaded values + @unittest.skip(reason="We can't save offloaded values") def test_serialization_bin(self): pass + @unittest.skip def test_serialization_safetensors(self): pass + @unittest.skip def test_compare_with_quanto(self): pass + @unittest.skip def test_load_from_quanto_saved(self): pass @@ -370,7 +373,7 @@ class QuantoQuantizationOffloadTest(QuantoQuantizationTest): ) -@unittest.skip("Skipping test class because serialization is not supported yet") +@unittest.skip(reason="Skipping test class because serialization is 
not supported yet") class QuantoQuantizationSerializationTest(QuantoQuantizationTest): """ Perform the same tests as in QuantoQuantizationTest but with a serialized model. @@ -403,7 +406,7 @@ class QuantoQuantizationSerializationTest(QuantoQuantizationTest): ) -@unittest.skip("Skipping test class because serialization is not supported yet") +@unittest.skip(reason="Skipping test class because serialization is not supported yet") class QuantoQuantizationSerializationCudaTest(QuantoQuantizationTest): """ Perform the same tests as in QuantoQuantizationTest but with model on cuda @@ -422,7 +425,7 @@ class QuantoQuantizationQBitsTensorOffloadTest(QuantoQuantizationOffloadTest): weights = "int4" -@unittest.skip("Skipping test class because serialization is not supported yet") +@unittest.skip(reason="Skipping test class because serialization is not supported yet") class QuantoQuantizationQBitsTensorSerializationTest(QuantoQuantizationSerializationTest): EXPECTED_OUTPUTS = "Hello my name is Nils, I am a student of the University" weights = "int4" diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py index 0b194417bb5..6924bf482f1 100644 --- a/tests/test_cache_utils.py +++ b/tests/test_cache_utils.py @@ -452,6 +452,6 @@ class CacheIntegrationTest(unittest.TestCase): decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True) self.assertListEqual(decoded, EXPECTED_GENERATION) - @unittest.skip("TODO @gante static cache's does not support beam search yet") + @unittest.skip(reason="TODO @gante static cache's does not support beam search yet") def test_static_cache_beam_search(self): pass diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index a36da4012b6..4971b814d31 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -161,10 +161,10 @@ class ImageProcessingTestMixin: ) if not self.test_slow_image_processor or not self.test_fast_image_processor: - self.skipTest("Skipping slow/fast equivalence test") + self.skipTest(reason="Skipping slow/fast equivalence test") if self.image_processing_class is None or self.fast_image_processing_class is None: - self.skipTest("Skipping slow/fast equivalence test as one of the image processors is not defined") + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") image_processor_slow = self.image_processing_class(**self.image_processor_dict) image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) @@ -178,10 +178,10 @@ class ImageProcessingTestMixin: @require_torch def test_fast_is_faster_than_slow(self): if not self.test_slow_image_processor or not self.test_fast_image_processor: - self.skipTest("Skipping speed test") + self.skipTest(reason="Skipping speed test") if self.image_processing_class is None or self.fast_image_processing_class is None: - self.skipTest("Skipping speed test as one of the image processors is not defined") + self.skipTest(reason="Skipping speed test as one of the image processors is not defined") def measure_time(image_processor, image): start = time.time() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index f7f0db79a8d..11c34462ba5 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -298,7 +298,7 @@ class ModelTesterMixin: config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: if model_class._keep_in_fp32_modules is None: - return + 
self.skipTest(reason="Model class has no _keep_in_fp32_modules attribute defined") model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: @@ -392,7 +392,8 @@ class ModelTesterMixin: def test_save_load_fast_init_from_base(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if config.__class__ not in MODEL_MAPPING: - return + self.skipTest(reason="Model class not in MODEL_MAPPING") + base_class = MODEL_MAPPING[config.__class__] if isinstance(base_class, tuple): @@ -522,94 +523,11 @@ class ModelTesterMixin: self.assertEqual(tied_params1, tied_params2) - def test_fast_init_context_manager(self): - # 1. Create a dummy class. Should have buffers as well? To make sure we test __init__ - class MyClass(PreTrainedModel): - config_class = PretrainedConfig - - def __init__(self, config=None): - super().__init__(config if config is not None else PretrainedConfig()) - self.linear = nn.Linear(10, 10, bias=True) - self.embedding = nn.Embedding(10, 10) - self.std = 1 - - def _init_weights(self, module): - if isinstance(module, nn.Linear): - module.weight.data = nn.init.kaiming_uniform_(module.weight.data, np.sqrt(5)) - if module.bias is not None: - module.bias.data.normal_(mean=0.0, std=self.std) - - # 2. Make sure a linear layer's reset params is properly skipped: - with ContextManagers([no_init_weights(True)]): - no_init_instance = MyClass() - - set_seed(0) - expected_bias = torch.tensor( - ([0.2975, 0.2131, -0.1379, -0.0796, -0.3012, -0.0057, -0.2381, -0.2439, -0.0174, 0.0475]) - ) - init_instance = MyClass() - torch.testing.assert_close(init_instance.linear.bias, expected_bias, rtol=1e-3, atol=1e-4) - - set_seed(0) - torch.testing.assert_close( - init_instance.linear.weight, nn.init.kaiming_uniform_(no_init_instance.linear.weight, np.sqrt(5)) - ) - - # 3. 
Make sure weights that are not present use init_weight_ and get expected values - with tempfile.TemporaryDirectory() as tmpdirname: - state_dict = init_instance.state_dict() - del state_dict["linear.weight"] - - init_instance.config.save_pretrained(tmpdirname) - torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) - set_seed(0) - model_fast_init = MyClass.from_pretrained(tmpdirname) - - set_seed(0) - model_slow_init = MyClass.from_pretrained(tmpdirname, _fast_init=False) - - for key in model_fast_init.state_dict().keys(): - max_diff = torch.max(torch.abs(model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key])) - self.assertLessEqual(max_diff.item(), 1e-3, msg=f"{key} not identical") - - def test_fast_init_tied_embeddings(self): - class MyClass(PreTrainedModel): - config_class = PretrainedConfig - _tied_weights_keys = ["output_embeddings.weight"] - - def __init__(self, config=None): - super().__init__(config if config is not None else PretrainedConfig()) - self.input_embeddings = nn.Embedding(10, 10) - self.output_embeddings = nn.Linear(10, 10, bias=False) - self.tie_weights() - - def get_output_embeddings(self): - return self.output_embeddings - - def set_output_embeddings(self, output_embeddings): - self.output_embeddings = output_embeddings - - def get_input_embeddings(self): - return self.input_embeddings - - def set_input_embeddings(self, input_embeddings): - self.input_embeddings = input_embeddings - - def _init_weights(self, module): - if module is self.output_embeddings: - raise ValueError("unnecessarily initialized tied output embedding!") - - model = MyClass() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - # throws if it initializes the tied output_embeddings - MyClass.from_pretrained(tmpdirname) - def test_save_load_fast_init_to_base(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if config.__class__ not in MODEL_MAPPING: - return + self.skipTest(reason="Model class not in MODEL_MAPPING") + base_class = MODEL_MAPPING[config.__class__] if isinstance(base_class, tuple): @@ -664,7 +582,8 @@ class ModelTesterMixin: def test_torch_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if config.__class__ not in MODEL_MAPPING: - return + self.skipTest(reason="Model class not in MODEL_MAPPING") + base_class = MODEL_MAPPING[config.__class__] if isinstance(base_class, tuple): @@ -748,38 +667,6 @@ class ModelTesterMixin: else: check_determinism(first, second) - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and self.has_attentions: - expected_arg_names = ["pixel_values", "output_hidden_states", 
"output_attentions", "return_dict"] - self.assertListEqual(arg_names, expected_arg_names) - elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and not self.has_attentions: - expected_arg_names = ["pixel_values", "output_hidden_states", "return_dict"] - self.assertListEqual(arg_names, expected_arg_names) - else: - expected_arg_names = [model.main_input_name] - self.assertListEqual(arg_names[:1], expected_arg_names) - def test_batching_equivalence(self): """ Tests that the model supports batching and that the output is the nearly the same for the same input in @@ -875,7 +762,7 @@ class ModelTesterMixin: def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not configured to run training tests") for model_class in self.all_model_classes: if ( @@ -914,7 +801,7 @@ class ModelTesterMixin: def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not configured to run training tests") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1095,7 +982,7 @@ class ModelTesterMixin: def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to `False`") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -1157,7 +1044,7 @@ class ModelTesterMixin: if "attention_mask" in inputs: trace_input["attention_mask"] = inputs["attention_mask"] else: - self.skipTest("testing SDPA without attention_mask is not supported") + self.skipTest(reason="testing SDPA without attention_mask is not supported") model(main_input, attention_mask=inputs["attention_mask"]) # example_kwarg_inputs was introduced in torch==2.0, but it is fine here since SDPA has a requirement on torch>=2.1. 
@@ -1369,7 +1256,7 @@ class ModelTesterMixin: def test_headmasking(self): if not self.test_head_masking: - return + self.skipTest(reason="Model does not support head masking") global_rng.seed(42) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1439,7 +1326,7 @@ class ModelTesterMixin: def test_head_pruning(self): if not self.test_pruning: - return + self.skipTest(reason="Pruning is not activated") for model_class in self.all_model_classes: ( @@ -1472,7 +1359,7 @@ class ModelTesterMixin: def test_head_pruning_save_load_from_pretrained(self): if not self.test_pruning: - return + self.skipTest(reason="Pruning is not activated") for model_class in self.all_model_classes: ( @@ -1509,7 +1396,7 @@ class ModelTesterMixin: def test_head_pruning_save_load_from_config_init(self): if not self.test_pruning: - return + self.skipTest(reason="Pruning is not activated") for model_class in self.all_model_classes: ( @@ -1544,7 +1431,7 @@ class ModelTesterMixin: def test_head_pruning_integration(self): if not self.test_pruning: - return + self.skipTest(reason="Pruning is not activated") for model_class in self.all_model_classes: ( @@ -1733,7 +1620,7 @@ class ModelTesterMixin: def test_resize_position_vector_embeddings(self): if not self.test_resize_position_embeddings: - return + self.skipTest(reason="Model does not have position embeddings") ( original_config, @@ -1816,7 +1703,7 @@ class ModelTesterMixin: inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to `False`") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -1916,13 +1803,13 @@ class ModelTesterMixin: inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to `False`") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="Model cannot untied embeddings") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -1994,7 +1881,7 @@ class ModelTesterMixin: def test_correct_missing_keys(self): if not self.test_missing_keys: - return + self.skipTest(reason="test_missing_keys is set to `False`") config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -2022,7 +1909,7 @@ class ModelTesterMixin: def test_tie_model_weights(self): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to `False`") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -2481,8 +2368,7 @@ class ModelTesterMixin: tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning if not hasattr(transformers, tf_model_class_name): - # transformers does not have this model in TF version yet - return + self.skipTest(reason="transformers does not have TF version of this model yet") # Output all for aggressive testing config.output_hidden_states = True @@ -2664,8 +2550,7 @@ class ModelTesterMixin: fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") # Output all for aggressive testing config.output_hidden_states = True @@ -2736,8 +2621,7 @@ class 
ModelTesterMixin: fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") # Output all for aggressive testing config.output_hidden_states = True @@ -2849,7 +2733,7 @@ class ModelTesterMixin: model_forward_args = inspect.signature(model.forward).parameters if "inputs_embeds" not in model_forward_args: - self.skipTest("This model doesn't use `inputs_embeds`") + self.skipTest(reason="This model doesn't use `inputs_embeds`") inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 @@ -2910,7 +2794,7 @@ class ModelTesterMixin: @require_torch_multi_gpu def test_model_parallelization(self): if not self.test_model_parallel: - return + self.skipTest(reason="test_model_parallel is set to False") # a candidate for testing_utils def get_current_gpu_memory_use(): @@ -2972,7 +2856,7 @@ class ModelTesterMixin: @require_torch_multi_gpu def test_model_parallel_equal_results(self): if not self.test_model_parallel: - return + self.skipTest(reason="test_model_parallel is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -3221,7 +3105,7 @@ class ModelTesterMixin: def test_load_with_mismatched_shapes(self): if not self.test_mismatched_shapes: - return + self.skipTest(reason="test_mismatched_shapes is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -3265,7 +3149,7 @@ class ModelTesterMixin: def test_mismatched_shapes_have_properly_initialized_weights(self): if not self.test_mismatched_shapes: - return + self.skipTest(reason="test_mismatched_shapes is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() configs_no_init = _config_zero_init(config) @@ -3383,6 +3267,9 @@ class ModelTesterMixin: @mark.flash_attn_test @slow def test_flash_attn_2_conversion(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -3409,6 +3296,9 @@ class ModelTesterMixin: @slow @is_flaky() def test_flash_attn_2_inference_equivalence(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -3503,6 +3393,9 @@ class ModelTesterMixin: @slow @is_flaky() def test_flash_attn_2_inference_equivalence_right_padding(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -3593,6 +3486,9 @@ class ModelTesterMixin: @slow @is_flaky() def test_flash_attn_2_generate_left_padding(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -3638,6 +3534,9 @@ class ModelTesterMixin: @is_flaky() @slow def 
test_flash_attn_2_generate_padding_right(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -3681,6 +3580,9 @@ class ModelTesterMixin: @require_torch_sdpa @slow def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + if not self.all_model_classes[0]._supports_sdpa: self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") @@ -3979,11 +3881,14 @@ class ModelTesterMixin: @require_torch_gpu @slow def test_sdpa_can_dispatch_on_flash(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + compute_capability = torch.cuda.get_device_capability() major, _ = compute_capability if not torch.version.cuda or major < 8: - self.skipTest("This test requires an NVIDIA GPU with compute capability >= 8.0") + self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0") for model_class in self.all_model_classes: if not model_class._supports_sdpa: @@ -3992,13 +3897,15 @@ class ModelTesterMixin: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() inputs_dict = self._prepare_for_class(inputs_dict, model_class) if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]: - self.skipTest("Llava-like models currently (transformers==4.39.1) requires an attention_mask input") + self.skipTest( + reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input" + ) if config.model_type in ["paligemma"]: self.skipTest( "PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input" ) if config.model_type in ["idefics"]: - self.skipTest("Idefics currently (transformers==4.39.1) requires an image_attention_mask input") + self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input") model = model_class(config) with tempfile.TemporaryDirectory() as tmpdirname: @@ -4020,11 +3927,14 @@ class ModelTesterMixin: @require_torch_gpu @slow def test_sdpa_can_compile_dynamic(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + compute_capability = torch.cuda.get_device_capability() major, _ = compute_capability if not torch.version.cuda or major < 8: - self.skipTest("This test requires an NVIDIA GPU with compute capability >= 8.0") + self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0") for model_class in self.all_model_classes: if not model_class._supports_sdpa: @@ -4060,6 +3970,9 @@ class ModelTesterMixin: @require_torch_sdpa @slow def test_eager_matches_sdpa_generate(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + max_new_tokens = 30 if len(self.all_generative_model_classes) == 0: @@ -4130,6 +4043,9 @@ class ModelTesterMixin: @require_torch_sdpa def test_sdpa_matches_eager_sliding_window(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + WINDOW_ATTENTION_MODELS = ["mistral", "mixtral", "qwen2", "qwen_moe", "starcoder2"] if len(self.all_generative_model_classes) == 0: @@ -4184,6 +4100,9 @@ class ModelTesterMixin: @mark.flash_attn_test @slow 
def test_flash_attn_2_generate_use_cache(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + max_new_tokens = 30 for model_class in self.all_generative_model_classes: @@ -4229,6 +4148,9 @@ class ModelTesterMixin: @mark.flash_attn_test @slow def test_flash_attn_2_fp32_ln(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -4284,8 +4206,7 @@ class ModelTesterMixin: tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning if not hasattr(transformers, tf_model_class_name): - # transformers does not have this model in TF version yet - return + self.skipTest(reason="transformers does not have this model in TF version yet") tf_model_class = getattr(transformers, tf_model_class_name) @@ -4309,8 +4230,7 @@ class ModelTesterMixin: flax_model_class_name = "Flax" + model_class.__name__ # Add the "Flax at the beginning if not hasattr(transformers, flax_model_class_name): - # transformers does not have this model in Flax version yet - return + self.skipTest(reason="transformers does not have this model in Flax version yet") flax_model_class = getattr(transformers, flax_model_class_name) @@ -4331,6 +4251,9 @@ class ModelTesterMixin: @mark.flash_attn_test @slow def test_flash_attn_2_from_config(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") @@ -4407,8 +4330,13 @@ class ModelTesterMixin: return input_ids, position_ids, input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix def test_custom_4d_attention_mask(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + if len(self.all_generative_model_classes) == 0: - self.skipTest("Model architecture has no generative classes, and thus not necessarily supporting 4D masks") + self.skipTest( + reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" + ) for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: @@ -4453,7 +4381,7 @@ class ModelTesterMixin: @require_read_token def test_torch_compile(self): if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") if not hasattr(self, "_torch_compile_test_ckpt"): self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 4c760715743..f5b30d50339 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -1354,7 +1354,7 @@ class ModelUtilsTest(TestCasePlus): self.assertIn("You may ignore this warning if your `pad_token_id`", cl.out) if not is_torchdynamo_available(): - return + self.skipTest(reason="torchdynamo is not available") with self.subTest("Ensure that the warning code is skipped when compiling with torchdynamo."): logger.warning_once.cache_clear() from torch._dynamo import config, testing @@ -1631,7 +1631,7 @@ class 
ModelOnTheFlyConversionTester(unittest.TestCase): self.assertEqual(discussion.author, "SFconvertbot") self.assertEqual(discussion.title, "Adding `safetensors` variant of this model") - @unittest.skip("Edge case, should work once the Space is updated`") + @unittest.skip(reason="Edge case, should work once the Space is updated`") def test_safetensors_on_the_fly_wrong_user_opened_pr(self): config = BertConfig( vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 @@ -1760,7 +1760,7 @@ class ModelPushToHubTester(unittest.TestCase): except HTTPError: pass - @unittest.skip("This test is flaky") + @unittest.skip(reason="This test is flaky") def test_push_to_hub(self): config = BertConfig( vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 @@ -1800,7 +1800,7 @@ The commit description supports markdown synthax see: ) self.assertEqual(commit_details.commit_description, COMMIT_DESCRIPTION) - @unittest.skip("This test is flaky") + @unittest.skip(reason="This test is flaky") def test_push_to_hub_in_organization(self): config = BertConfig( vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 @@ -2197,7 +2197,7 @@ class TestAttentionImplementation(unittest.TestCase): def test_not_available_flash(self): if is_flash_attn_2_available(): - self.skipTest("Please uninstall flash-attn package to run test_not_available_flash") + self.skipTest(reason="Please uninstall flash-attn package to run test_not_available_flash") with self.assertRaises(ImportError) as cm: _ = AutoModel.from_pretrained( @@ -2208,7 +2208,7 @@ class TestAttentionImplementation(unittest.TestCase): def test_not_available_flash_with_config(self): if is_flash_attn_2_available(): - self.skipTest("Please uninstall flash-attn package to run test_not_available_flash") + self.skipTest(reason="Please uninstall flash-attn package to run test_not_available_flash") config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-GPTBigCodeModel") @@ -2223,7 +2223,7 @@ class TestAttentionImplementation(unittest.TestCase): def test_not_available_sdpa(self): if is_torch_sdpa_available(): - self.skipTest("This test requires torch<=2.0") + self.skipTest(reason="This test requires torch<=2.0") with self.assertRaises(ImportError) as cm: _ = AutoModel.from_pretrained( diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index e7c678cffb7..f2292510f71 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -248,7 +248,7 @@ class PipelineTesterMixin: f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not load the " f"processor from `{repo_id}` with `{processor_name}`." ) - return + self.skipTest(f"Could not load the processor from {repo_id} with {processor_name}.") # TODO: Maybe not upload such problematic tiny models to Hub. if tokenizer is None and processor is None: @@ -256,7 +256,7 @@ class PipelineTesterMixin: f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " f"any tokenizer / processor from `{repo_id}`." ) - return + self.skipTest(f"Could not find or load any tokenizer / processor from {repo_id}.") # TODO: We should check if a model file is on the Hub repo. instead. try: @@ -266,7 +266,7 @@ class PipelineTesterMixin: f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " f"the model from `{repo_id}` with `{model_architecture}`." 
) - return + self.skipTest(f"Could not find or load the model from {repo_id} with {model_architecture}.") pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor): @@ -275,7 +275,9 @@ class PipelineTesterMixin: f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " f"`{tokenizer_name}` | processor `{processor_name}`." ) - return + self.skipTest( + f"Test is known to fail for: model `{model_architecture.__name__}` | tokenizer `{tokenizer_name}` | processor `{processor_name}`." + ) # validate validate_test_components(self, task, model, tokenizer, processor) @@ -295,7 +297,7 @@ class PipelineTesterMixin: f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not get the " "pipeline for testing." ) - return + self.skipTest(reason="Could not get the pipeline for testing.") task_test.run_pipeline_test(pipeline, examples) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 682565256b8..60ed58c4005 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -417,7 +417,7 @@ class TokenizerTesterMixin: def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): """Test ``_tokenize`` and ``convert_tokens_to_string``.""" if not self.test_sentencepiece: - return + self.skipTest(reason="test_sentencepiece is set to False") tokenizer = self.get_tokenizer() text = "This is text to test the tokenizer." @@ -449,7 +449,7 @@ class TokenizerTesterMixin: def test_sentencepiece_tokenize_and_decode(self): if not self.test_sentencepiece: - return + self.skipTest(reason="test_sentencepiece is set to False") text = "This is text to test the tokenizer." if self.test_rust_tokenizer: @@ -466,7 +466,7 @@ class TokenizerTesterMixin: def test_subword_regularization_tokenizer(self) -> None: if not self.test_sentencepiece: - return + self.skipTest(reason="test_sentencepiece is set to False") # Subword regularization is only available for the slow tokenizer. sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} @@ -484,7 +484,7 @@ class TokenizerTesterMixin: def test_pickle_subword_regularization_tokenizer(self) -> None: if not self.test_sentencepiece: - return + self.skipTest(reason="test_sentencepiece is set to False") """Google pickle __getstate__ __setstate__ if you are struggling with this.""" # Subword regularization is only available for the slow tokenizer. @@ -506,7 +506,7 @@ class TokenizerTesterMixin: def test_save_sentencepiece_tokenizer(self) -> None: if not self.test_sentencepiece or not self.test_slow_tokenizer: - return + self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") # We want to verify that we will be able to save the tokenizer even if the original files that were used to # build the tokenizer have been deleted in the meantime. text = "This is text to test the tokenizer." 
@@ -545,7 +545,7 @@ class TokenizerTesterMixin: def test_rust_tokenizer_signature(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") signature = inspect.signature(self.rust_tokenizer_class.__init__) @@ -554,7 +554,7 @@ class TokenizerTesterMixin: def test_tokenizer_slow_store_full_signature(self): if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") signature = inspect.signature(self.tokenizer_class.__init__) tokenizer = self.get_tokenizer() @@ -565,7 +565,7 @@ class TokenizerTesterMixin: def test_tokenizer_fast_store_full_signature(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") signature = inspect.signature(self.rust_tokenizer_class.__init__) tokenizer = self.get_rust_tokenizer() @@ -580,11 +580,11 @@ class TokenizerTesterMixin: def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1973,7 +1973,7 @@ class TokenizerTesterMixin: for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) @@ -2007,9 +2007,9 @@ class TokenizerTesterMixin: for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") if "attention_mask" not in tokenizer.model_input_names: - self.skipTest("This model does not use attention mask.") + self.skipTest(reason="This model does not use attention mask.") features = [ {"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]}, @@ -2126,7 +2126,7 @@ class TokenizerTesterMixin: def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") sequence = "This is a text" @@ -2146,7 +2146,7 @@ class TokenizerTesterMixin: ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() # check correct behaviour if no pad_token_id exists and add it eventually @@ -2295,8 +2295,8 @@ class TokenizerTesterMixin: @require_tokenizers def test_added_token_are_matched_longest_first(self): if not self.test_slow_tokenizer: - self.skipTest("This test is only for slow tokenizers") - return + self.skipTest(reason="This test is only for slow tokenizers") + tokenizers = self.get_tokenizers(fast=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -2305,7 +2305,7 @@ class TokenizerTesterMixin: tokenizer.add_tokens([AddedToken("extra_id_100")]) except Exception: # Canine cannot add tokens which are not codepoints - self.skipTest("Cannot add those Added tokens") + self.skipTest(reason="Cannot add those Added tokens") # XXX: This used to split on `extra_id_1` first we're matching # longest first now. 
@@ -2588,13 +2588,13 @@ class TokenizerTesterMixin: for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder model or has no pad token id set") model = model_class(config) @@ -2637,13 +2637,13 @@ class TokenizerTesterMixin: for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder model or has no pad token id set") model = model_class(config) @@ -2672,13 +2672,13 @@ class TokenizerTesterMixin: for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder model or has no pad token id set") # Build sequence first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] @@ -2712,7 +2712,7 @@ class TokenizerTesterMixin: @require_torch def test_prepare_seq2seq_batch(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -2740,7 +2740,7 @@ class TokenizerTesterMixin: src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError calling prepare_seq2seq_batch") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 10) # max_target_length will default to max_length if not specified @@ -3008,7 +3008,7 @@ class TokenizerTesterMixin: def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3049,7 +3049,7 @@ class TokenizerTesterMixin: def test_num_special_tokens_to_add_equal(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3067,7 +3067,7 @@ class TokenizerTesterMixin: def test_max_length_equal(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + 
self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3081,7 +3081,7 @@ class TokenizerTesterMixin: def test_special_tokens_map_equal(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3177,10 +3177,10 @@ class TokenizerTesterMixin: elif is_flax_available(): returned_tensor = "jax" else: - return + self.skipTest(reason="No expected framework from PT, TF or JAX found") if not tokenizer.pad_token or tokenizer.pad_token_id < 0: - return + self.skipTest(reason="This tokenizer has no padding token set, or pad_token_id < 0") tokens = tokenizer.encode_plus( "HuggingFace is solving NLP one commit at a time", @@ -3225,7 +3225,7 @@ class TokenizerTesterMixin: def test_compare_pretokenized_inputs(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3307,7 +3307,7 @@ class TokenizerTesterMixin: def test_create_token_type_ids(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3329,7 +3329,7 @@ class TokenizerTesterMixin: def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3374,7 +3374,7 @@ class TokenizerTesterMixin: def test_padding(self, max_length=50): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3600,7 +3600,7 @@ class TokenizerTesterMixin: def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3638,7 +3638,7 @@ class TokenizerTesterMixin: def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: 
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3713,7 +3713,7 @@ class TokenizerTesterMixin: def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3781,7 +3781,7 @@ class TokenizerTesterMixin: def test_compare_prepare_for_model(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -3884,7 +3884,7 @@ class TokenizerTesterMixin: def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -3919,7 +3919,7 @@ class TokenizerTesterMixin: def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -4092,7 +4092,7 @@ class TokenizerTesterMixin: def test_save_slow_from_fast_and_reload_fast(self): if not self.test_slow_tokenizer or not self.test_rust_tokenizer: # we need both slow and fast versions - return + self.skipTest(reason="test_rust_tokenizer or test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -4166,7 +4166,7 @@ class TokenizerTesterMixin: def test_split_special_tokens(self): if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") # Tests the expected appearance (or absence) of special token in encoded output, # explicit values are not tested because tokenization is model dependent and can change for tokenizer, pretrained_name, kwargs in self.tokenizers_list: diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py index 894567d93df..5b2f2021565 100644 --- a/tests/test_tokenization_utils.py +++ b/tests/test_tokenization_utils.py @@ -96,7 +96,7 @@ class TokenizerUtilTester(unittest.TestCase): # the current folder and have the right name. if os.path.isfile("tokenizer.json"): # We skip the test if the user has a `tokenizer.json` in this folder to avoid deleting it. 
- return + self.skipTest(reason="Skipping test as there is a `tokenizer.json` file in the current folder.") try: with open("tokenizer.json", "wb") as f: http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f) diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py index ac073529e25..d5c6444de4e 100644 --- a/tests/tokenization/test_tokenization_fast.py +++ b/tests/tokenization/test_tokenization_fast.py @@ -47,9 +47,10 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0]) tokenizer.save_pretrained(self.tmpdirname) + @unittest.skip( + "We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model" + ) def test_tokenizer_mismatch_warning(self): - # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any - # model pass @unittest.skip( @@ -70,13 +71,12 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase): def test_additional_special_tokens_serialization(self): pass + @unittest.skip(reason="PreTrainedTokenizerFast is the only tokenizer that is not linked to any model") def test_prepare_for_model(self): - # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any - # model pass + @unittest.skip(reason="PreTrainedTokenizerFast doesn't have tokenizer_file in its signature") def test_rust_tokenizer_signature(self): - # PreTrainedTokenizerFast doesn't have tokenizer_file in its signature pass def test_training_new_tokenizer(self): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index dba4b97515f..2c6793e39ca 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -3066,7 +3066,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss) torchdynamo.reset() - @unittest.skip("torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.") + @unittest.skip(reason="torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.") @require_torch_non_multi_gpu @require_torchdynamo def test_torchdynamo_memory(self): @@ -3668,7 +3668,7 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): def test_push_to_hub_with_saves_each_n_steps(self): num_gpus = max(1, backend_device_count(torch_device)) if num_gpus > 2: - return + self.skipTest(reason="More than 2 GPUs available") with tempfile.TemporaryDirectory() as tmp_dir: trainer = get_regression_trainer( diff --git a/tests/utils/test_doc_samples.py b/tests/utils/test_doc_samples.py index d5c5ac21aa5..96251aeed00 100644 --- a/tests/utils/test_doc_samples.py +++ b/tests/utils/test_doc_samples.py @@ -26,7 +26,7 @@ from transformers.testing_utils import require_tf, require_torch, slow logger = logging.getLogger() -@unittest.skip("Temporarily disable the doc tests.") +@unittest.skip(reason="Temporarily disable the doc tests.") @require_torch @require_tf @slow diff --git a/tests/utils/test_model_output.py b/tests/utils/test_model_output.py index b1fc3dd0131..b484c0e1694 100644 --- a/tests/utils/test_model_output.py +++ b/tests/utils/test_model_output.py @@ -159,11 +159,11 @@ class ModelOutputTester(unittest.TestCase): ) # TODO: @ydshieh - @unittest.skip("CPU OOM") + @unittest.skip(reason="CPU OOM") @require_torch def test_export_serialization(self): 
if not is_torch_greater_or_equal_than_2_2: - return + self.skipTest(reason="Export serialization requires torch >= 2.2.0") model_cls = AlbertForMaskedLM model_config = model_cls.config_class()