diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py
index 89b2f5fb648..6e82a59935d 100644
--- a/src/transformers/models/rwkv/configuration_rwkv.py
+++ b/src/transformers/models/rwkv/configuration_rwkv.py
@@ -61,7 +61,7 @@ class RwkvConfig(PretrainedConfig):
             Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
         intermediate_size (`int`, *optional*):
             Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
             The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
@@ -69,7 +69,7 @@ class RwkvConfig(PretrainedConfig):
         eos_token_id (`int`, *optional*, defaults to 0):
             The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
             GPTNeoX.
-        rescale_every (`int`, *optional*, default to 6):
+        rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index 4a8d53fed96..7b8a2ad2ad5 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -471,7 +471,6 @@ OBJECTS_TO_IGNORE = [
     "RobertaPreLayerNormConfig",
     "RobertaPreLayerNormModel",
     "RobertaTokenizerFast",
-    "RwkvConfig",
     "SEWConfig",
     "SEWDConfig",
     "SEWDForCTC",
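
For reference, a minimal sketch (assuming a `transformers` version that ships the RWKV model) showing that the corrected docstring entries now match the keyword arguments `RwkvConfig` actually accepts; the names and defaults below are taken from the diff above, nothing else is implied about the API.

```python
# Minimal sketch, not part of the diff: check that the documented argument
# names line up with the real RwkvConfig keyword arguments.
from transformers import RwkvConfig

config = RwkvConfig(
    layer_norm_epsilon=1e-05,  # documented as `layer_norm_epsilon`, not `layer_norm_eps`
    rescale_every=6,           # at inference, hidden states are divided by 2 every 6 layers
)
print(config.layer_norm_epsilon)  # 1e-05
print(config.rescale_every)       # 6
```

With `RwkvConfig` removed from `OBJECTS_TO_IGNORE`, `utils/check_docstrings.py` now validates this docstring against the actual signature, so the argument names and defaults have to stay in sync.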