chore(pixtral): emit block attention mask when using flash attention (#38741)

* chore(pixtral): emit block attention mask when using flash attention

Since flash_attention_2 relies solely on position_ids, skipping generation of the block attention mask avoids unnecessary memory usage and prevents OOM on large inputs.
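The memory argument above can be sketched as follows. This is an illustrative comparison, not the actual Pixtral code: `block_attention_mask` and `packed_position_ids` are hypothetical helpers showing why a dense block-diagonal mask is redundant once packed sequences are described by position ids alone.

```python
# Sketch (assumed helpers, not the Pixtral implementation): a dense block
# attention mask costs O(total^2) memory, while the position_ids that
# flash_attention_2 relies on cost only O(total).
import torch


def block_attention_mask(seq_lens):
    """Additive block-diagonal mask of shape (total, total).

    Positions in the same packed sequence get 0.0; cross-sequence
    positions get -inf. Memory grows quadratically with total length,
    which is the allocation the commit avoids on the flash path.
    """
    total = sum(seq_lens)
    mask = torch.full((total, total), float("-inf"))
    start = 0
    for n in seq_lens:
        mask[start:start + n, start:start + n] = 0.0
        start += n
    return mask


def packed_position_ids(seq_lens):
    """Per-sequence position ids for the packed batch.

    Positions reset to 0 at every sequence boundary, which is enough
    information to recover the block structure without a dense mask.
    """
    return torch.cat([torch.arange(n) for n in seq_lens])


mask = block_attention_mask([2, 3])   # 5x5 tensor, two zero blocks on the diagonal
pos = packed_position_ids([2, 3])     # tensor([0, 1, 0, 1, 2])
```

For two packed sequences of lengths 2 and 3, the mask already holds 25 entries; for a large packed image batch the quadratic term is what triggers the OOM the commit message describes, while the position-id vector stays linear in the total patch count.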

* remove unnecessary attention_mask assignment
Dongruixuan Li 2025-06-11 14:55:23 -04:00 committed by GitHub
parent 60d4b35b20
commit 1dcb022e8f


@@ -214,7 +214,6 @@ class PixtralAttention(nn.Module):
         # Since we use packing, if flash_attention_2 is selected we rely on position_ids
         if self.config._attn_implementation == "flash_attention_2":
             kwargs["position_ids"] = kwargs["position_ids"].to(hidden_states.device, non_blocking=True)
-            attention_mask = None
         attn_output, attn_weights = attention_interface(
             self,
@@ -508,9 +507,13 @@ class PixtralVisionModel(PixtralPreTrainedModel):
         position_embeddings = self.patch_positional_embedding(patch_embeds, position_ids)
-        attention_mask = generate_block_attention_mask(
-            [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
-        )
+        if self.config._attn_implementation == "flash_attention_2":
+            # We only rely on position_ids when using flash_attention_2
+            attention_mask = None
+        else:
+            attention_mask = generate_block_attention_mask(
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
+            )
         return self.transformer(
             patch_embeds,