diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 1eb50ee4ad7..72853d4ca4d 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -14,6 +14,8 @@ ARG PYTORCH='2.6.0'
 ARG INTEL_TORCH_EXT='2.3.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu121'
+# Disable kernel mapping for now until all tests pass
+ENV DISABLE_KERNEL_MAPPING=1
 
 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index d64e2746d49..fdb825cad37 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -228,7 +228,6 @@ class AriaProjector(nn.Module):
         return out
 
 
-@use_kernel_forward_from_hub("MLP")
 class AriaSharedExpertsMLP(nn.Module):
     """
     Shared Expert MLP for shared experts.
diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py
index 0cf23edb751..8fd2483bcd6 100644
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@@ -882,7 +882,6 @@ class BambaMixer(nn.Module):
         return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
 
 
-@use_kernel_forward_from_hub("MLP")
 class BambaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index fd888c38d7f..8cbb7128c73 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -36,7 +36,6 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -118,7 +117,6 @@ class CohereRotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-@use_kernel_forward_from_hub("MLP")
 class CohereMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py
index e419379969d..18a3a50ac15 100644
--- a/src/transformers/models/cohere2/modeling_cohere2.py
+++ b/src/transformers/models/cohere2/modeling_cohere2.py
@@ -28,7 +28,6 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, HybridCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
@@ -268,7 +267,6 @@ class Cohere2Attention(nn.Module):
         return attn_output, attn_weights
 
 
-@use_kernel_forward_from_hub("MLP")
 class Cohere2MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py
index ed536cbebaf..e7fecb4be6a 100644
--- a/src/transformers/models/diffllama/modeling_diffllama.py
+++ b/src/transformers/models/diffllama/modeling_diffllama.py
@@ -74,7 +74,6 @@ _CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
 _CONFIG_FOR_DOC = "DiffLlamaConfig"
 
 
-@use_kernel_forward_from_hub("MLP")
 class DiffLlamaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 4646b9f9bde..fcc55b67d15 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -84,7 +84,6 @@ class Emu3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
-@use_kernel_forward_from_hub("MLP")
 class Emu3MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 679bc086985..40497433284 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -27,7 +27,6 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import (
@@ -85,7 +84,6 @@ class GemmaRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.eps}"
 
 
-@use_kernel_forward_from_hub("MLP")
 class GemmaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index c7040de011b..144a94ef33e 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -28,7 +28,6 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, HybridCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -78,7 +77,6 @@ class Gemma2RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.eps}"
 
 
-@use_kernel_forward_from_hub("MLP")
 class Gemma2MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 23f28281a1d..0988e2692aa 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -31,7 +31,6 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, HybridCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
@@ -107,7 +106,6 @@ class Gemma3TextScaledWordEmbedding(nn.Embedding):
         return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
 
 
-@use_kernel_forward_from_hub("MLP")
 class Gemma3MLP(nn.Module):
     def __init__(self, config: Gemma3TextConfig):
         super().__init__()
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index 6f15f9ca095..80d3ad696dc 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -228,7 +228,6 @@ class GraniteRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
-@use_kernel_forward_from_hub("MLP")
 class GraniteMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py
index 2597ce27fa9..d565af9e27f 100644
--- a/src/transformers/models/helium/modeling_helium.py
+++ b/src/transformers/models/helium/modeling_helium.py
@@ -29,7 +29,6 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import (
@@ -118,7 +117,6 @@ class HeliumRotaryEmbedding(nn.Module):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-@use_kernel_forward_from_hub("MLP")
 class HeliumMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index e8dd1395266..d36fb1b6a47 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -160,7 +160,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-@use_kernel_forward_from_hub("MLP")
 class LlamaMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index 8f1b416d5b1..7f88b8d8570 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -45,7 +45,6 @@ _CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"
 _CONFIG_FOR_DOC = "MistralConfig"
 
 
-@use_kernel_forward_from_hub("MLP")
 class MistralMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 8b8783d1ad8..5b6ca9f4b35 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -14,7 +14,6 @@ import torch.nn.functional as F
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -58,7 +57,6 @@ class OlmoLayerNorm(nn.Module):
         )
 
 
-@use_kernel_forward_from_hub("MLP")
 class OlmoMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py
index bcf990ccda6..4046dc58267 100644
--- a/src/transformers/models/olmo2/modeling_olmo2.py
+++ b/src/transformers/models/olmo2/modeling_olmo2.py
@@ -218,7 +218,6 @@ class Olmo2Attention(nn.Module):
         return attn_output, attn_weights
 
 
-@use_kernel_forward_from_hub("MLP")
 class Olmo2MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index d3180b35b3a..7b62632bd8e 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -45,7 +45,6 @@ _CHECKPOINT_FOR_DOC = "meta-qwen2/Qwen2-2-7b-hf"
 _CONFIG_FOR_DOC = "Qwen2Config"
 
 
-@use_kernel_forward_from_hub("MLP")
 class Qwen2MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/src/transformers/models/qwen3/modeling_qwen3.py b/src/transformers/models/qwen3/modeling_qwen3.py
index 5852470d1c2..15773b4516a 100644
--- a/src/transformers/models/qwen3/modeling_qwen3.py
+++ b/src/transformers/models/qwen3/modeling_qwen3.py
@@ -81,7 +81,6 @@ class Qwen3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
-@use_kernel_forward_from_hub("MLP")
 class Qwen3MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
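Every model hunk above reverts the same pattern: a class decorator that registers a module so its forward can be served by a Hub-provided "MLP" kernel, while the Dockerfile hunk sets DISABLE_KERNEL_MAPPING=1 in the CI image to keep kernel mapping off until the tests pass. Below is a minimal sketch of the pattern being removed; it is not code from this diff, ExampleMLP and config are illustrative names, and it assumes use_kernel_forward_from_hub remains importable from transformers.integrations (as the deleted import lines indicate).

# Illustrative sketch of the reverted pattern, not code from this PR.
import torch.nn as nn

from transformers.activations import ACT2FN
from transformers.integrations import use_kernel_forward_from_hub


@use_kernel_forward_from_hub("MLP")  # the decorator deleted throughout this diff
class ExampleMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Standard gated MLP projections, matching the classes touched above.
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # Eager path: down_proj(act(gate_proj(x)) * up_proj(x)); with kernel mapping
        # enabled, the decorator allows this forward to be replaced by a Hub kernel.
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))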