diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py
index 74976bdd340..98d9064dc1a 100644
--- a/src/transformers/models/gemma2/configuration_gemma2.py
+++ b/src/transformers/models/gemma2/configuration_gemma2.py
@@ -36,15 +36,15 @@ class Gemma2Config(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`Gemma2Model`]
-        hidden_size (`int`, *optional*, defaults to 3072):
+        hidden_size (`int`, *optional*, defaults to 2304):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 24576):
+        intermediate_size (`int`, *optional*, defaults to 9216):
             Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
+        num_hidden_layers (`int`, *optional*, defaults to 26):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
+        num_key_value_heads (`int`, *optional*, defaults to 4):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -80,7 +80,7 @@ class Gemma2Config(PretrainedConfig):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
         sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
             size of the sliding window.
         final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -103,11 +103,11 @@ class Gemma2Config(PretrainedConfig):
     def __init__(
         self,
         vocab_size=256000,
-        hidden_size=3072,
-        intermediate_size=24576,
-        num_hidden_layers=28,
-        num_attention_heads=16,
-        num_key_value_heads=16,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
         head_dim=256,
         hidden_activation="gelu_pytorch_tanh",
         max_position_embeddings=8192,
@@ -121,7 +121,7 @@ class Gemma2Config(PretrainedConfig):
         rope_theta=10000.0,
         attention_bias=False,
         attention_dropout=0.0,
-        query_pre_attn_scalar=224,
+        query_pre_attn_scalar=256,
         sliding_window=4096,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index 7ddb1c9f4c9..dacaca1c7ef 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -67,15 +67,15 @@ class Gemma2Config(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`Gemma2Model`]
-        hidden_size (`int`, *optional*, defaults to 3072):
+        hidden_size (`int`, *optional*, defaults to 2304):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 24576):
+        intermediate_size (`int`, *optional*, defaults to 9216):
             Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 28):
+        num_hidden_layers (`int`, *optional*, defaults to 26):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 16):
+        num_key_value_heads (`int`, *optional*, defaults to 4):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@@ -111,7 +111,7 @@ class Gemma2Config(PretrainedConfig):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores
+        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
         sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
             size of the sliding window.
         final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
@@ -134,11 +134,11 @@ class Gemma2Config(PretrainedConfig):
     def __init__(
         self,
         vocab_size=256000,
-        hidden_size=3072,
-        intermediate_size=24576,
-        num_hidden_layers=28,
-        num_attention_heads=16,
-        num_key_value_heads=16,
+        hidden_size=2304,
+        intermediate_size=9216,
+        num_hidden_layers=26,
+        num_attention_heads=8,
+        num_key_value_heads=4,
         head_dim=256,
         hidden_activation="gelu_pytorch_tanh",
         max_position_embeddings=8192,
@@ -152,7 +152,7 @@ class Gemma2Config(PretrainedConfig):
         rope_theta=10000.0,
         attention_bias=False,
         attention_dropout=0.0,
-        query_pre_attn_scalar=224,
+        query_pre_attn_scalar=256,
         sliding_window=4096,
         final_logit_softcapping=30.0,
         attn_logit_softcapping=50.0,
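
With these defaults, a bare `Gemma2Config()` appears to line up with the smallest Gemma 2 architecture (26 layers, hidden size 2304, 8 query heads sharing 4 key/value heads), and `query_pre_attn_scalar` now matches the default `head_dim` of 256. A minimal sanity check, assuming a transformers build that includes this patch; the printed values are simply the new defaults from the diff above:

from transformers import Gemma2Config

config = Gemma2Config()  # every argument left at its new default

# The new defaults describe the 2B-sized Gemma 2 layout.
print(config.hidden_size)            # 2304
print(config.intermediate_size)      # 9216
print(config.num_hidden_layers)      # 26
print(config.num_attention_heads)    # 8
print(config.num_key_value_heads)    # 4  -> grouped-query attention (8 query heads, 4 KV heads)

# query_pre_attn_scalar now equals head_dim, so the attention scores are
# effectively scaled by 1/sqrt(head_dim) out of the box.
print(config.query_pre_attn_scalar)  # 256
print(config.head_dim)               # 256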