diff --git a/src/transformers/models/mamba2/configuration_mamba2.py b/src/transformers/models/mamba2/configuration_mamba2.py
index e3dcb63011d..7a690dceb1c 100644
--- a/src/transformers/models/mamba2/configuration_mamba2.py
+++ b/src/transformers/models/mamba2/configuration_mamba2.py
@@ -83,8 +83,6 @@ class Mamba2Config(PretrainedConfig):
             Whether or not to rescale `out_proj` weights when initializing.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the cache should be used.
-        norm_before_gate (`bool`, *optional*, defaults to `True`):
-            Option of cuda kernels -whether to normalize before the gate or not.
         rms_norm (`bool`, *optional*, defaults to `True`):
             Whether to use RMS norm or not.
         chunk_size (`int`, *optional*, defaults to 256):
@@ -137,7 +135,6 @@ class Mamba2Config(PretrainedConfig):
         time_step_limit=(0.0, float("inf")),
         rescale_prenorm_residual=False,
         use_cache=True,
-        norm_before_gate=True,
         rms_norm=True,
         chunk_size=256,
         tie_word_embeddings=False,
@@ -168,7 +165,6 @@ class Mamba2Config(PretrainedConfig):
         self.n_groups = n_groups
         self.num_heads = num_heads
         self.head_dim = head_dim
-        self.norm_before_gate = norm_before_gate
         self.rms_norm = rms_norm
         self.state_size = state_size
         self.chunk_size = chunk_size
diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py
index bf993ad2f31..69390ea9ad2 100644
--- a/src/transformers/models/mamba2/modeling_mamba2.py
+++ b/src/transformers/models/mamba2/modeling_mamba2.py
@@ -208,7 +208,6 @@ class Mamba2Mixer(nn.Module):
         self.activation = config.hidden_act
         self.act = ACT2FN[config.hidden_act]
 
-        self.norm_before_gate = config.norm_before_gate
         self.layer_norm_epsilon = config.layer_norm_epsilon
         self.rms_norm = config.rms_norm
@@ -347,7 +346,7 @@ class Mamba2Mixer(nn.Module):
             outproj_bias=self.out_proj.bias,
             headdim=self.head_dim,
             ngroups=self.n_groups,
-            norm_before_gate=self.norm_before_gate,
+            norm_before_gate=False,
             return_final_states=True,
             **dt_limit_kwargs,
         )
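
For reference, hardcoding `norm_before_gate=False` in the fused kernel call above matches the gated RMSNorm ordering used by the non-kernel path, where the SiLU-activated gate is applied before normalization. A minimal illustrative sketch of that ordering (an approximation for explanation only, not the actual `MambaRMSNormGated` module):

```python
import torch
import torch.nn.functional as F

def gated_rms_norm(hidden_states, gate, weight, eps=1e-6):
    # norm_before_gate=False: multiply by the SiLU-activated gate first,
    # then RMS-normalize the gated activations.
    hidden_states = hidden_states * F.silu(gate)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + eps)
    return weight * hidden_states
```

With `norm_before_gate=True` the normalization would instead be applied to `hidden_states` before multiplying by `F.silu(gate)`; since this implementation only ever uses the gate-first variant, the flag is removed from the config rather than exposed as an option.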