Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 05:10:06 +06:00)
CI: avoid human error, automatically infer generative models (#33212)
* tmp commit
* move tests to the right class
* remove ALL all_generative_model_classes = ...
* skip tf roberta
* skip InstructBlipForConditionalGenerationDecoderOnlyTest
* videollava
* reduce diff
* reduce diff
* remove on vlms
* fix a few more
* manual rebase bits
* more manual rebase
* remove all manual generative model class test entries
* fix up to ernie
* a few more removals
* handle remaining cases
* recurrent gemma
* it's better here
* make fixup
* tf idefics is broken
* tf bert + generate is broken
* don't touch tf :()
* don't touch tf :(
* make fixup
* better comments for test skips
* revert tf changes
* remove empty line removal
* one more
* missing one
Parent commit: 06231fdfc7
Commit: 62c7ea0201
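Note on the approach: instead of every test file hand-maintaining an `all_generative_model_classes` tuple, the generation test mixins can derive that tuple from `all_model_classes` by asking each class whether it supports `generate()`. A minimal sketch of the idea follows; it is simplified and not the exact code merged in this commit, though `can_generate()` itself is an existing `PreTrainedModel` classmethod.

# Simplified sketch of the inference introduced for the generation test mixins.
class GenerationTesterMixin:
    all_model_classes = ()  # each model test class defines this

    @property
    def all_generative_model_classes(self):
        # keep only the model classes that report generation support
        return tuple(cls for cls in self.all_model_classes if cls.can_generate())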
@@ -1507,6 +1507,14 @@ class BertForMaskedLM(BertPreTrainedModel):
return {"input_ids": input_ids, "attention_mask": attention_mask}
@classmethod
def can_generate(cls) -> bool:
"""
Legacy correction: BertForMaskedLM can't call `generate()` from GenerationMixin.
Remove after v4.50, when we stop making `PreTrainedModel` inherit from `GenerationMixin`.
"""
return False
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top.""",

@@ -1325,6 +1325,14 @@ class ErnieForMaskedLM(ErniePreTrainedModel):
return {"input_ids": input_ids, "attention_mask": attention_mask}
@classmethod
def can_generate(cls) -> bool:
"""
Legacy correction: ErnieForMaskedLM can't call `generate()` from GenerationMixin.
Remove after v4.50, when we stop making `PreTrainedModel` inherit from `GenerationMixin`.
"""
return False
@add_start_docstrings(
"""Ernie Model with a `next sentence prediction (classification)` head on top.""",
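For context on the two hunks above: these masked-LM heads define `prepare_inputs_for_generation`, which would otherwise be picked up as "generative" by the default `can_generate()` heuristic, so they now opt out explicitly. An illustrative check (assuming torch and transformers are installed; the second value is the expected one for the causal-LM head, not something asserted by this diff):

from transformers import BertForMaskedLM, BertLMHeadModel

print(BertForMaskedLM.can_generate())  # False after this change: masked LM, no autoregressive generation
print(BertLMHeadModel.can_generate())  # expected True: the causal-LM head supports generate()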
@@ -66,7 +66,6 @@ def random_attention_mask(shape, rng=None):
@require_flax
class FlaxGenerationTesterMixin:
model_tester = None
all_generative_model_classes = ()
def _get_input_ids_and_config(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()

@@ -28,7 +28,7 @@ import pytest
from packaging import version
from parameterized import parameterized
from transformers import AutoConfig, is_torch_available, pipeline
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, is_torch_available, pipeline
from transformers.testing_utils import (
is_flaky,
require_accelerate,

@@ -61,8 +61,6 @@ if is_torch_available():
AutoModelForSeq2SeqLM,
AutoModelForSpeechSeq2Seq,
AutoModelForVision2Seq,
AutoProcessor,
AutoTokenizer,
BartForConditionalGeneration,
BartTokenizer,
GPT2LMHeadModel,

@@ -119,7 +117,6 @@ from transformers.utils import is_sklearn_available
class GenerationTesterMixin:
input_name = "input_ids"
model_tester = None
all_generative_model_classes = ()
max_new_tokens = 3
def prepare_config_and_inputs_for_generate(self, batch_size=2):
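With the mixin change above, a model test class only declares `all_model_classes` and the generative subset is inferred; the removals in the hunks below are the mechanical consequence. A hypothetical post-change test class (all names here are made up for illustration) would look like this:

@require_torch
class MyModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (MyModel, MyModelForCausalLM) if is_torch_available() else ()
    # no all_generative_model_classes line: (MyModelForCausalLM,) is inferred via can_generate()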
@@ -189,7 +189,6 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
"""
all_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True

@@ -205,7 +205,6 @@ class AutoformerModelTester:
@require_torch
class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (AutoformerModel, AutoformerForPrediction) if is_torch_available() else ()
all_generative_model_classes = (AutoformerForPrediction,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": AutoformerModel} if is_torch_available() else {}
test_pruning = False
test_head_masking = False

@@ -257,15 +257,7 @@ class BambaModelTester:
@require_torch
class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
BambaModel,
BambaForCausalLM,
)
if is_torch_available()
else ()
)
all_generative_model_classes = (BambaForCausalLM,) if is_torch_available() else ()
all_model_classes = (BambaModel, BambaForCausalLM) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BambaModel,

@@ -22,6 +22,7 @@ import unittest
import pytest
from transformers import (
BarkCausalModel,
BarkCoarseConfig,
BarkConfig,
BarkFineConfig,

@@ -53,7 +54,6 @@ if is_torch_available():
import torch
from transformers import (
BarkCausalModel,
BarkCoarseModel,
BarkFineModel,
BarkModel,

@@ -527,6 +527,8 @@ class BarkModelTester:
@require_torch
class BarkSemanticModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BarkSemanticModel,) if is_torch_available() else ()
# `BarkSemanticModel` inherits from `BarkCausalModel`, but requires an advanced generation config.
# `BarkCausalModel` does not, so we run generation tests there.
all_generative_model_classes = (BarkCausalModel,) if is_torch_available() else ()
is_encoder_decoder = False

@@ -614,8 +616,9 @@ class BarkSemanticModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Te
@require_torch
class BarkCoarseModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
# Same tester as BarkSemanticModelTest, except for model_class and config_class
all_model_classes = (BarkCoarseModel,) if is_torch_available() else ()
# `BarkCoarseModel` inherits from `BarkCausalModel`, but requires an advanced generation config.
# `BarkCausalModel` does not, so we run generation tests there.
all_generative_model_classes = (BarkCausalModel,) if is_torch_available() else ()
is_encoder_decoder = False

@@ -419,7 +419,6 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
if is_torch_available()
else ()
)
all_generative_model_classes = (BartForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BartModel,

@@ -1502,7 +1501,6 @@ class BartStandaloneDecoderModelTester:
@require_torch
class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BartDecoder, BartForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (BartForCausalLM,) if is_torch_available() else ()
fx_comptatible = True
test_pruning = False
is_encoder_decoder = False

@@ -336,7 +336,6 @@ class FlaxBartModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationT
if is_flax_available()
else ()
)
all_generative_model_classes = (FlaxBartForConditionalGeneration,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxBartModelTester(self)

@@ -451,7 +451,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
if is_torch_available()
else ()
)
all_generative_model_classes = (BertLMHeadModel,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BertModel,

@@ -243,7 +243,6 @@ class BertGenerationEncoderTester:
@require_torch
class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_torch_available() else ()
all_generative_model_classes = (BertGenerationDecoder,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": BertGenerationEncoder, "text-generation": BertGenerationDecoder}
if is_torch_available()

@@ -451,7 +451,6 @@ class BigBirdModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
if is_torch_available()
else ()
)
all_generative_model_classes = (BigBirdForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BigBirdModel,

@@ -250,7 +250,6 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
if is_torch_available()
else ()
)
all_generative_model_classes = (BigBirdPegasusForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BigBirdPegasusModel,

@@ -792,7 +791,6 @@ class BigBirdPegasusStandaloneDecoderModelTester:
@require_torch
class BigBirdPegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BigBirdPegasusDecoder, BigBirdPegasusForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (BigBirdPegasusForCausalLM,) if is_torch_available() else ()
test_pruning = False
is_encoder_decoder = False

@@ -284,7 +284,6 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
if is_torch_available()
else ()
)
all_generative_model_classes = (BioGptForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BioGptModel,

@@ -226,7 +226,6 @@ class BlenderbotModelTester:
@require_torch
class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotModel, BlenderbotForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (BlenderbotForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BlenderbotModel,

@@ -533,7 +532,6 @@ class BlenderbotStandaloneDecoderModelTester:
@require_torch
class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotDecoder, BlenderbotForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (BlenderbotForCausalLM,) if is_torch_available() else ()
test_pruning = False
is_encoder_decoder = False

@@ -319,7 +319,6 @@ class FlaxBlenderbotModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGener
if is_flax_available()
else ()
)
all_generative_model_classes = (FlaxBlenderbotForConditionalGeneration,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxBlenderbotModelTester(self)

@@ -217,7 +217,6 @@ class BlenderbotSmallModelTester:
@require_torch
class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (BlenderbotSmallForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BlenderbotSmallModel,

@@ -542,7 +541,6 @@ class BlenderbotSmallStandaloneDecoderModelTester:
@require_torch
class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotSmallDecoder, BlenderbotSmallForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (BlenderbotSmallForCausalLM,) if is_torch_available() else ()
test_pruning = False
is_encoder_decoder = False

@@ -318,7 +318,6 @@ class FlaxBlenderbotSmallModelTest(FlaxModelTesterMixin, unittest.TestCase, Flax
if is_flax_available()
else ()
)
all_generative_model_classes = (FlaxBlenderbotSmallForConditionalGeneration,) if is_flax_available() else ()
def is_pipeline_test_to_skip(
self,

@@ -472,7 +472,6 @@ class Blip2ForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False

@@ -995,6 +994,8 @@ class Blip2ModelTester:
@require_torch
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else ()
# Doesn't run generation tests. TODO: fix generation tests for Blip2ForConditionalGeneration
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": Blip2Model,
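As the Blip2ModelTest hunk above shows, a test class can still opt out of generation tests by pinning the attribute: a plain class attribute set on the test class takes precedence over the value the mixin would infer. Minimal sketch with placeholder names:

class SomeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    all_model_classes = (SomeModelForConditionalGeneration,) if is_torch_available() else ()
    # Doesn't run generation tests; this explicit () overrides the inferred value
    all_generative_model_classes = ()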
@@ -328,7 +328,6 @@ class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
else ()
)
all_generative_model_classes = (BloomForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BloomModel,

@@ -171,7 +171,6 @@ class FlaxBloomModelTester:
@require_flax
class FlaxBloomModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin):
all_model_classes = (FlaxBloomModel, FlaxBloomForCausalLM) if is_flax_available() else ()
all_generative_model_classes = () if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxBloomModelTester(self)

@@ -199,7 +198,6 @@ class FlaxBloomModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGeneration
@require_flax
class FlaxBloomGenerationTest(unittest.TestCase):
all_model_classes = (FlaxBloomForCausalLM,) if is_flax_available() else ()
all_generative_model_classes = () if is_flax_available() else ()
def setUp(self):
self.model_id = "bigscience/bloom-560m"

@@ -285,7 +285,6 @@ class BrosModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
if is_torch_available()
else ()
)
all_generative_model_classes = () if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": BrosModel, "token-classification": BrosForTokenClassification}
if is_torch_available()

@@ -271,7 +271,6 @@ class ChameleonModelTester:
@require_torch
class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (ChameleonForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": ChameleonModel,

@@ -281,7 +281,6 @@ class ClvpDecoderTester:
@require_torch
class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (ClvpForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}
test_pruning = False

@@ -322,7 +322,6 @@ class CodeGenModelTester:
@require_torch
class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (CodeGenForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": CodeGenModel, "text-generation": CodeGenForCausalLM} if is_torch_available() else {}
)

@@ -274,7 +274,6 @@ class CohereModelTester:
@require_torch
class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (CohereModel, CohereForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (CohereForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": CohereModel,

@@ -54,7 +54,6 @@ class Cohere2ModelTester(CohereModelTester):
@require_torch
class Cohere2ModelTest(CohereModelTest, unittest.TestCase):
all_model_classes = (Cohere2Model, Cohere2ForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (Cohere2ForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": Cohere2Model,

@@ -193,7 +193,6 @@ class CTRLModelTester:
@require_torch
class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) if is_torch_available() else ()
all_generative_model_classes = (CTRLLMHeadModel,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": CTRLModel,

@@ -373,7 +373,6 @@ class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes
if is_torch_available()
else ()
)
all_generative_model_classes = (Data2VecTextForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": Data2VecTextModel,

@@ -322,7 +322,6 @@ class DbrxModelTester:
@require_torch
class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (DbrxModel, DbrxForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (DbrxForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = {"text-generation": DbrxForCausalLM} if is_torch_available() else {}
test_headmasking = False
test_pruning = False

@@ -127,7 +127,6 @@ class DecisionTransformerModelTester:
@require_torch
class DecisionTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (DecisionTransformerModel,) if is_torch_available() else ()
all_generative_model_classes = ()
pipeline_model_mapping = {"feature-extraction": DecisionTransformerModel} if is_torch_available() else {}
# Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids

@@ -296,7 +296,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
if is_torch_available()
else ()
)
all_generative_model_classes = (DiffLlamaForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": DiffLlamaModel,

@@ -124,7 +124,6 @@ class Emu3Text2TextModelTester:
@require_torch
class Emu3Text2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Emu3ForCausalLM,) if is_torch_available() else ()
all_generative_model_classes = (Emu3ForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"text-generation": Emu3ForCausalLM,

@@ -312,7 +311,6 @@ class Emu3Vision2TextModelTester:
@require_torch
class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Emu3ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Emu3ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {}
test_headmasking = False
test_pruning = False

@@ -442,7 +442,6 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if is_torch_available()
else ()
)
all_generative_model_classes = (ErnieForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": ErnieModel,

@@ -195,7 +195,6 @@ class EsmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
if is_torch_available()
else ()
)
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": EsmModel,

@@ -169,7 +169,6 @@ class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
test_mismatched_shapes = False
all_model_classes = (EsmForProteinFolding,) if is_torch_available() else ()
all_generative_model_classes = ()
pipeline_model_mapping = {} if is_torch_available() else {}
test_sequence_classification_problem_types = False

@@ -290,7 +290,6 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
if is_torch_available()
else ()
)
all_generative_model_classes = (FalconForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": FalconModel,

@@ -247,7 +247,6 @@ class FalconMambaModelTester:
# Copied from transformers.tests.models.mamba.MambaModelTest with Mamba->Falcon,mamba->falcon_mamba,FalconMambaCache->MambaCache
class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (FalconMambaModel, FalconMambaForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (FalconMambaForCausalLM,) if is_torch_available() else ()
has_attentions = False # FalconMamba does not support attentions
fx_compatible = False # FIXME let's try to support this @ArthurZucker
test_torchscript = False # FIXME let's try to support this @ArthurZucker

@@ -163,7 +163,6 @@ def prepare_fsmt_inputs_dict(
@require_torch
class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (FSMTModel, FSMTForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (FSMTForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": FSMTModel,

@@ -268,7 +268,6 @@ class FuyuModelTester:
@require_torch
class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
all_generative_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
)

@@ -176,7 +176,6 @@ class FlaxGemmaModelTester:
@require_flax
class FlaxGemmaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxGemmaModel, FlaxGemmaForCausalLM) if is_flax_available() else ()
all_generative_model_classes = (FlaxGemmaForCausalLM,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxGemmaModelTester(self)

@@ -300,7 +300,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if is_torch_available()
else ()
)
all_generative_model_classes = (GemmaForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GemmaModel,

@@ -63,7 +63,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase):
if is_torch_available()
else ()
)
all_generative_model_classes = (Gemma2ForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": Gemma2Model,

@@ -399,7 +399,6 @@ class GitModelTester:
@require_torch
class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GitModel,

@@ -286,7 +286,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
if is_torch_available()
else ()
)
all_generative_model_classes = (GlmForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GlmModel,
@@ -169,7 +169,6 @@ class GotOcr2VisionText2TextModelTester:
@require_torch
class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (GotOcr2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (GotOcr2ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"image-to-text": GotOcr2ForConditionalGeneration,

@@ -211,7 +211,6 @@ class FlaxGPT2ModelTester:
@require_flax
class FlaxGPT2ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else ()
all_generative_model_classes = (FlaxGPT2LMHeadModel,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxGPT2ModelTester(self)

@@ -492,7 +492,6 @@ class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
if is_torch_available()
else ()
)
all_generative_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GPT2Model,

@@ -390,7 +390,6 @@ class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
if is_torch_available()
else ()
)
all_generative_model_classes = (GPTBigCodeForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GPTBigCodeModel,

@@ -183,7 +183,6 @@ class FlaxGPTNeoModelTester:
@require_flax
class FlaxGPTNeoModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else ()
all_generative_model_classes = (FlaxGPTNeoForCausalLM,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxGPTNeoModelTester(self)

@@ -376,7 +376,6 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
if is_torch_available()
else ()
)
all_generative_model_classes = (GPTNeoForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GPTNeoModel,

@@ -274,7 +274,6 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
if is_torch_available()
else ()
)
all_generative_model_classes = (GPTNeoXForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GPTNeoXModel,

@@ -198,7 +198,6 @@ class GPTNeoXJapaneseModelTester:
@require_torch
class GPTNeoXModelJapaneseTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (GPTNeoXJapaneseModel, GPTNeoXJapaneseForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (GPTNeoXJapaneseForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": GPTNeoXJapaneseModel, "text-generation": GPTNeoXJapaneseForCausalLM}
if is_torch_available()

@@ -180,7 +180,6 @@ class FlaxGPTJModelTester:
@require_flax
class FlaxGPTJModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else ()
all_generative_model_classes = (FlaxGPTJForCausalLM,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxGPTJModelTester(self)

@@ -341,7 +341,6 @@ class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
if is_torch_available()
else ()
)
all_generative_model_classes = (GPTJForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GPTJModel,

@@ -281,7 +281,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
if is_torch_available()
else ()
)
all_generative_model_classes = (GraniteForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GraniteModel,

@@ -280,7 +280,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
if is_torch_available()
else ()
)
all_generative_model_classes = (GraniteMoeForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": GraniteMoeModel,

@@ -55,7 +55,6 @@ class HeliumModelTest(GemmaModelTest, unittest.TestCase):
if is_torch_available()
else ()
)
all_generative_model_classes = (HeliumForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": HeliumModel,

@@ -593,7 +593,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
@require_torch
class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else ()
all_generative_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else ()
def setUp(self):
self.model_tester = IdeficsModelTester(

@@ -369,7 +369,6 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
"""
all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False

@@ -327,7 +327,6 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest
"""
all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else ()
fx_compatible = False
test_pruning = False

@@ -230,7 +230,6 @@ class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
all_model_classes = (
(ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else ()
)
all_generative_model_classes = (ImageGPTForCausalImageModeling,) if is_torch_available() else ()
pipeline_model_mapping = (
{"image-feature-extraction": ImageGPTModel, "image-classification": ImageGPTForImageClassification}
if is_torch_available()

@@ -190,7 +190,6 @@ class InformerModelTester:
@require_torch
class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else ()
all_generative_model_classes = (InformerForPrediction,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": InformerModel} if is_torch_available() else {}
is_encoder_decoder = True
test_pruning = False

@@ -477,7 +477,6 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
fx_compatible = False
test_head_masking = False

@@ -496,7 +496,6 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False

@@ -327,7 +327,6 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if is_torch_available()
else ()
)
all_generative_model_classes = (JambaForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": JambaModel,

@@ -280,7 +280,6 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
all_model_classes = (
(JetMoeModel, JetMoeForCausalLM, JetMoeForSequenceClassification) if is_torch_available() else ()
)
all_generative_model_classes = (JetMoeForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": JetMoeModel,

@@ -259,7 +259,6 @@ class Kosmos2ModelTester:
@require_torch
class Kosmos2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": Kosmos2Model,

@@ -281,7 +281,6 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
if is_torch_available()
else ()
)
all_generative_model_classes = (LEDForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": LEDModel,

@@ -176,7 +176,6 @@ class FlaxLlamaModelTester:
@require_flax
class FlaxLlamaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else ()
all_generative_model_classes = (FlaxLlamaForCausalLM,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxLlamaModelTester(self)

@@ -289,7 +289,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if is_torch_available()
else ()
)
all_generative_model_classes = (LlamaForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": LlamaModel,

@@ -181,7 +181,6 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
"""
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
if is_torch_available()

@@ -215,7 +215,6 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
"""
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False

@@ -231,7 +231,6 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
"""
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True

@@ -215,7 +215,6 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
"""
all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
)
@@ -237,7 +237,6 @@ class FlaxLongT5ModelTester:
@require_flax
class FlaxLongT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxLongT5Model, FlaxLongT5ForConditionalGeneration) if is_flax_available() else ()
all_generative_model_classes = (FlaxLongT5ForConditionalGeneration,) if is_flax_available() else ()
is_encoder_decoder = True
def setUp(self):

@@ -502,7 +502,6 @@ class LongT5ModelTester:
@require_torch
class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (LongT5Model, LongT5ForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (LongT5ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": LongT5Model,

@@ -240,7 +240,6 @@ class M2M100ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
if is_torch_available()
else ()
)
all_generative_model_classes = (M2M100ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": M2M100Model,

@@ -239,7 +239,6 @@ class MambaModelTester:
@require_torch
class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MambaModel, MambaForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (MambaForCausalLM,) if is_torch_available() else ()
has_attentions = False # Mamba does not support attentions
fx_compatible = False # FIXME let's try to support this @ArthurZucker
test_torchscript = False # FIXME let's try to support this @ArthurZucker

@@ -214,7 +214,6 @@ class Mamba2ModelTester:
@require_torch
class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Mamba2Model, Mamba2ForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (Mamba2ForCausalLM,) if is_torch_available() else ()
has_attentions = False # Mamba does not support attentions
fx_compatible = False # FIXME let's try to support this @molbap
test_torchscript = False # FIXME I think this should be doable @molbap @ArthurZucker

@@ -231,7 +231,6 @@ class FlaxMarianModelTester:
class FlaxMarianModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin):
is_encoder_decoder = True
all_model_classes = (FlaxMarianModel, FlaxMarianMTModel) if is_flax_available() else ()
all_generative_model_classes = (FlaxMarianMTModel,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxMarianModelTester(self)

@@ -237,7 +237,6 @@ class MarianModelTester:
@require_torch
class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MarianModel, MarianMTModel) if is_torch_available() else ()
all_generative_model_classes = (MarianMTModel,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MarianModel,

@@ -871,7 +870,6 @@ class MarianStandaloneDecoderModelTester:
@require_torch
class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (MarianDecoder, MarianForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (MarianForCausalLM,) if is_torch_available() else ()
test_pruning = False
is_encoder_decoder = False

@@ -342,7 +342,6 @@ class FlaxMBartModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGeneration
if is_flax_available()
else ()
)
all_generative_model_classes = (FlaxMBartForConditionalGeneration,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxMBartModelTester(self)

@@ -229,7 +229,6 @@ class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
if is_torch_available()
else ()
)
all_generative_model_classes = (MBartForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MBartModel,

@@ -727,7 +726,6 @@ class MBartStandaloneDecoderModelTester:
@require_torch
class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (MBartDecoder, MBartForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (MBartForCausalLM,) if is_torch_available() else ()
test_pruning = False
is_encoder_decoder = False

@@ -187,7 +187,6 @@ class FlaxMistralModelTester:
@require_flax
class FlaxMistralModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
all_model_classes = (FlaxMistralModel, FlaxMistralForCausalLM) if is_flax_available() else ()
all_generative_model_classes = (FlaxMistralForCausalLM,) if is_flax_available() else ()
def setUp(self):
self.model_tester = FlaxMistralModelTester(self)

@@ -301,7 +301,6 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
if is_torch_available()
else ()
)
all_generative_model_classes = (MistralForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MistralModel,

@@ -300,7 +300,6 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
if is_torch_available()
else ()
)
all_generative_model_classes = (MixtralForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MixtralModel,

@@ -124,7 +124,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte
"""
all_model_classes = (MllamaForCausalLM,) if is_torch_available() else ()
all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False

@@ -264,7 +263,6 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
"""
all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
test_pruning = False
test_head_masking = False

@@ -229,7 +229,6 @@ class ModernBertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
if is_torch_available()
else ()
)
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": ModernBertModel,

@@ -170,6 +170,8 @@ class MoonshineModelTester:
@require_torch
class MoonshineModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MoonshineModel, MoonshineForConditionalGeneration) if is_torch_available() else ()
# Doesn't run generation tests. TODO (eustache): remove this line and then make CI green
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"automatic-speech-recognition": MoonshineForConditionalGeneration,

@@ -152,9 +152,6 @@ class MoshiDecoderTester:
@require_torch
class MoshiDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MoshiModel, MoshiForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (
(MoshiForCausalLM,) if is_torch_available() else ()
) # we don't want to run all the generation tests, only a specific subset
test_pruning = False
test_resize_embeddings = True
test_head_masking = False

@@ -528,7 +525,6 @@ class MoshiTester:
@require_torch
class MoshiTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (MoshiForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (MoshiForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False # training is not supported yet for Moshi
test_headmasking = False
test_resize_embeddings = False

@@ -354,7 +354,6 @@ class MptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
else ()
)
all_generative_model_classes = (MptForCausalLM,) if is_torch_available() else ()
fx_compatible = False
test_missing_keys = False
test_pruning = False

@@ -298,7 +298,6 @@ class MraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
test_torchscript = False
has_attentions = False
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": MraModel,

@@ -553,7 +553,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
if is_torch_available()
else ()
)
all_generative_model_classes = (MT5ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MT5Model,

@@ -176,6 +176,8 @@ class MusicgenDecoderTester:
@require_torch
class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MusicgenModel, MusicgenForCausalLM) if is_torch_available() else ()
# Doesn't run generation tests. See `greedy_sample_model_classes` below
all_generative_model_classes = ()
greedy_sample_model_classes = (
(MusicgenForCausalLM,) if is_torch_available() else ()
) # we don't want to run all the generation tests, only a specific subset

@@ -801,6 +803,8 @@ class MusicgenTester:
@require_torch
class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MusicgenForConditionalGeneration,) if is_torch_available() else ()
# Doesn't run generation tests. See `greedy_sample_model_classes` below
all_generative_model_classes = ()
greedy_sample_model_classes = (MusicgenForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"text-to-audio": MusicgenForConditionalGeneration} if is_torch_available() else {}
test_pruning = False # training is not supported yet for MusicGen

@@ -182,6 +182,8 @@ class MusicgenMelodyDecoderTester:
@require_torch
class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (MusicgenMelodyModel, MusicgenMelodyForCausalLM) if is_torch_available() else ()
# Doesn't run generation tests. See `greedy_sample_model_classes` below
all_generative_model_classes = ()
greedy_sample_model_classes = (
(MusicgenMelodyForCausalLM,) if is_torch_available() else ()
) # the model uses a custom generation method so we only run a specific subset of the generation tests

@@ -820,6 +822,8 @@ class MusicgenMelodyTester:
# Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenTest with Musicgen->MusicgenMelody, musicgen->musicgen_melody, EncoderDecoder->DecoderOnly, input_values->input_features
class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_torch_available() else ()
# Doesn't run generation tests. See `greedy_sample_model_classes` below
all_generative_model_classes = ()
greedy_sample_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"text-to-audio": MusicgenMelodyForConditionalGeneration} if is_torch_available() else {}
test_pruning = False # training is not supported yet for MusicGen

@@ -418,7 +418,6 @@ class MvpModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
if is_torch_available()
else ()
)
all_generative_model_classes = (MvpForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": MvpModel,

@@ -803,7 +802,6 @@ class MvpStandaloneDecoderModelTester:
@require_torch
class MvpStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (MvpDecoder, MvpForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (MvpForCausalLM,) if is_torch_available() else ()
fx_comptatible = True
test_pruning = False
is_encoder_decoder = False

@@ -75,7 +75,6 @@ class NemotronModelTest(GemmaModelTest):
if is_torch_available()
else ()
)
all_generative_model_classes = (NemotronForCausalLM,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": NemotronModel,

@@ -247,7 +247,6 @@ class NllbMoeModelTester:
@require_torch
class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (NllbMoeModel, NllbMoeForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (NllbMoeForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": NllbMoeModel,
Some files were not shown because too many files have changed in this diff.