Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-03 12:50:06 +06:00

Merge 9da67e460b into ebfbcd42da
Commit f18157c6b9
@@ -225,6 +225,7 @@ class MllamaVisionAttention(nn.Module):
        self.head_dim = config.hidden_size // config.attention_heads
        self.scaling = self.head_dim**-0.5
        self.num_key_value_groups = 1
        self.is_causal = False

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=False)
@@ -584,6 +585,7 @@ class MllamaTextSelfAttention(nn.Module):
        self.scaling = self.head_dim**-0.5
        self.rope_theta = config.rope_theta
        self.layer_idx = layer_idx
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
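Both attention modules define `scaling = head_dim**-0.5` and an `is_causal` flag (False for the vision tower, True for the text decoder). As a rough, standalone illustration of how such attributes typically feed into PyTorch's scaled dot-product attention (not part of the diff; all shapes and values below are assumed for the example):

import torch
import torch.nn.functional as F

# Assumed shapes for illustration only: (batch, num_heads, seq_len, head_dim)
batch, num_heads, seq_len, head_dim = 1, 16, 8, 80
scaling = head_dim**-0.5  # same formula as self.scaling in the diff
is_causal = True          # True for text self-attention, False for the vision attention

query = torch.randn(batch, num_heads, seq_len, head_dim)
key = torch.randn(batch, num_heads, seq_len, head_dim)
value = torch.randn(batch, num_heads, seq_len, head_dim)

# Passing scale explicitly lets the module-level scaling attribute control the softmax temperature.
attn_output = F.scaled_dot_product_attention(query, key, value, is_causal=is_causal, scale=scaling)
print(attn_output.shape)  # torch.Size([1, 16, 8, 80])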
@@ -1028,6 +1030,7 @@ class MllamaPreTrainedModel(PreTrainedModel):
class MllamaVisionModel(MllamaPreTrainedModel):
    config_class = MllamaVisionConfig
    base_model_prefix = "vision_model"
    _supports_flash_attn_2 = False  # the vision model always adds a 4D attn mask which is not supported by FA2

    def __init__(self, config: MllamaVisionConfig):
        super().__init__(config)
@@ -1617,6 +1620,7 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin):
class MllamaModel(MllamaPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
    _supports_quantized_cache = False  # quant cache not supported in encoder-decoder setting
    _supports_flash_attn_2 = False  # the vision model does not support FA2

    def __init__(self, config: MllamaConfig):
        super().__init__(config)
@@ -1778,6 +1782,7 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
    }
    _supports_quantized_cache = False  # quant cache not supported in encoder-decoder setting
    _tied_weights_keys = ["lm_head.weight"]
    _supports_flash_attn_2 = False  # the vision model does not support FA2

    def __init__(self, config: MllamaConfig):
        super().__init__(config)
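All three wrapper classes set `_supports_flash_attn_2 = False`, so requesting FlashAttention-2 at load time should be rejected. A minimal user-side sketch of that behavior follows; the checkpoint name is an assumption for illustration, and the exact error message is not guaranteed:

from transformers import MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # assumed checkpoint, for illustration only

try:
    # The vision tower does not support FA2, so this request is expected to fail during loading.
    model = MllamaForConditionalGeneration.from_pretrained(model_id, attn_implementation="flash_attention_2")
except ValueError as err:
    print(f"FlashAttention-2 rejected as expected: {err}")

# The default attention implementation loads normally.
model = MllamaForConditionalGeneration.from_pretrained(model_id)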
@@ -535,6 +535,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 8): 'If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.',
                ("rocm", (9, 5)): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
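The test keys each expected string by a (device type, version) pair and resolves one at runtime with `get_expectation()`. The following standalone sketch shows the idea of that lookup; it is an illustration, not the actual `Expectations` implementation from `transformers.testing_utils`:

import torch

def pick_expectation(expectations, device_type, version):
    # Prefer an exact (device, version) match, then fall back to a device-wide default keyed with None.
    if (device_type, version) in expectations:
        return expectations[(device_type, version)]
    return expectations.get((device_type, None))

expectations = {
    ("cuda", 7): "expected text for sm_7x GPUs",
    ("cuda", 8): "expected text for sm_8x GPUs",
    ("rocm", (9, 5)): "expected text for gfx95x GPUs",
}

if torch.cuda.is_available():
    major = torch.cuda.get_device_capability()[0]  # e.g. 8 on an A100
    print(pick_expectation(expectations, "cuda", major))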
@@ -582,6 +583,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
                ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy tapestry\n Threads of joy and sorrow\nWeft of memories",
                ("cuda", 7): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
                ("cuda", 8): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
                ("rocm", (9, 5)): "If I had to write a haiku about my cat, I would write:\nWhiskers twitching bright\nMoonlight dancing on her fur\nFurry little",
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -621,6 +623,8 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
                ("xpu", 3): torch.tensor([9.1562, 8.9141, 5.0664, 1.6855, 3.2324], dtype=actual_logits.dtype),
                ("cuda", 7): torch.tensor([9.0781, 8.8750, 5.0781, 1.6221, 3.2207], dtype=actual_logits.dtype),
                ("cuda", 8): torch.tensor([9.0703, 8.8750, 5.0781, 1.6279, 3.2207], dtype=actual_logits.dtype),
                # NOTE: rocm logits are quite a bit off, we should investigate. Generation makes sense though.
                ("rocm", (9, 5)): torch.tensor([9.3359, 9.1641, 5.3867, 2.2090, 3.3379], dtype=actual_logits.dtype),
            }
        )
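Because the reference logits differ slightly per device (and, per the NOTE, drift further on ROCm), any comparison has to tolerate small numerical differences. A hedged sketch of such a check; the tensors and tolerances below are placeholders, not the values or thresholds used by the test suite:

import torch

actual_logits = torch.tensor([9.0781, 8.8750, 5.0781, 1.6221, 3.2207])    # placeholder values
expected_logits = torch.tensor([9.0703, 8.8750, 5.0781, 1.6279, 3.2207])  # placeholder values

# Allow small absolute/relative drift between GPU generations; tolerances are illustrative.
torch.testing.assert_close(actual_logits, expected_logits, atol=2e-2, rtol=1e-2)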
@@ -666,6 +670,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
                ("cuda", 8): 'If I had to write a haiku for this one, it would be:.\\nA dock in the lake.\\nA mountain in the distance.\\nA long exposure.',
                ("rocm", (9, 5)): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -683,6 +688,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
                ("xpu", 3): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
                ("cuda", 7): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
                ("cuda", 8): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
                ("rocm", (9, 5)): "This image shows\nThe image depicts a person named I'm not able to provide that information. I'm not able to provide that information.",
            }
        )  # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -743,9 +749,12 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         generated_output = output[0][prompt_len:]
         decoded_output = processor.decode(generated_output, skip_special_tokens=False)

-        # model should response about "stop sign", however it responses about "dock"
+        # On NVIDIA, the model should respond about "stop sign", however it responds about "dock"
         # this happens only in quantized version, bfloat16 works fine
-        expected_output = "This image shows a long wooden dock extending out into a lake. The dock is made of wooden planks and has a railing"
+        expected_output = Expectations({
+            ("cuda", None): "This image shows a long wooden dock extending out into a lake. The dock is made of wooden planks and has a railing",
+            ("rocm", (9, 5)): "The image shows a long, red, octagonal stop sign with the word \"STOP\" in white letters. The sign is",
+        }).get_expectation()  # fmt: skip

         self.assertEqual(
             decoded_output,
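The comment above notes that the "dock" answer only appears with the quantized model, while bfloat16 behaves as expected. A hedged sketch of the two loading paths being contrasted; the checkpoint name is an assumption, and bitsandbytes must be installed for the 4-bit path:

import torch
from transformers import BitsAndBytesConfig, MllamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # assumed checkpoint, for illustration only

# Quantized load: the configuration where the "dock" vs "stop sign" drift was observed.
quantized_model = MllamaForConditionalGeneration.from_pretrained(
    model_id, quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)

# bfloat16 load: per the comment in the diff, generation behaves as expected here.
bf16_model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16)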