LLaVa: add cache class attribute (#32278)

cache class flag
Raushan Turganbay 2024-08-01 09:48:03 +05:00 committed by GitHub
parent 14ee2326e5
commit 453e74884f
6 changed files with 6 additions and 0 deletions
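What the flag does: `_supports_cache_class = True` tells the generation utilities that the model accepts the newer `Cache` objects (e.g. `DynamicCache` from `transformers.cache_utils`) in place of the legacy tuple-of-tuples `past_key_values`. A minimal usage sketch of what this unlocks, assuming the `llava-hf/llava-1.5-7b-hf` checkpoint and a placeholder image/prompt (neither comes from this commit):

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers.cache_utils import DynamicCache

model_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint, for illustration only
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

image = Image.new("RGB", (336, 336))  # placeholder image
prompt = "USER: <image>\nWhat is in the picture? ASSISTANT:"
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

# With `_supports_cache_class = True`, a Cache object can be passed as
# `past_key_values`; it is updated in place across decoding steps.
out = model.generate(**inputs, past_key_values=DynamicCache(), max_new_tokens=20)
print(processor.decode(out[0], skip_special_tokens=True))

The same one-line change is applied to all six models below; static (compilable) caches are gated by a separate `_supports_static_cache` flag, which this commit does not touch.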

src/transformers/models/llava/modeling_llava.py

@@ -126,6 +126,7 @@ class LlavaPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["LlavaVisionAttention"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         # important: this ported version of Llava isn't meant for training from scratch - only

src/transformers/models/llava_next/modeling_llava_next.py

@@ -232,6 +232,7 @@ class LlavaNextPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["LlavaNextVisionAttention"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         # important: this ported version of LlavaNext isn't meant for training from scratch - only

src/transformers/models/llava_next_video/modeling_llava_next_video.py

@@ -272,6 +272,7 @@ class LlavaNextVideoPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["LlavaNextVideoVisionAttention"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         # important: this ported version of LlavaNextVideo isn't meant for training from scratch - only

src/transformers/models/paligemma/modeling_paligemma.py

@@ -127,6 +127,7 @@ class PaliGemmaPreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = False
     _supports_sdpa = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         # important: this ported version of PaliGemma isn't meant for training from scratch - only

src/transformers/models/video_llava/modeling_video_llava.py

@@ -126,6 +126,7 @@ class VideoLlavaPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["VideoLlavaVisionAttention"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         std = (

src/transformers/models/vipllava/modeling_vipllava.py

@@ -135,6 +135,7 @@ class VipLlavaPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["VipLlavaVisionAttention"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         # important: this ported version of VipLlava isn't meant for training from scratch - only