Mirror of https://github.com/huggingface/transformers.git
clean up unused classifier_dropout in config (#20596)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
commit 4430b91298
parent eefae413d1
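The change removes a `classifier_dropout` argument that, per the commit title, none of these configs actually use: it disappears from the docstrings, the `__init__` signatures, and the attribute assignments in each file below. A minimal sketch of how this surfaces to users, assuming a transformers install that already contains this commit (the `inspect` check is illustrative and not part of the change):

import inspect

from transformers import BlenderbotConfig, MarianConfig, PegasusConfig

# After this cleanup, classifier_dropout should no longer appear as an
# explicit constructor parameter of the affected configs.
for cfg_cls in (BlenderbotConfig, MarianConfig, PegasusConfig):
    params = inspect.signature(cfg_cls.__init__).parameters
    print(cfg_cls.__name__, "classifier_dropout" in params)  # expected: False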
@@ -71,8 +71,6 @@ class BlenderbotConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 128):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -156,7 +153,6 @@ class BlenderbotConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
@@ -71,8 +71,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -155,7 +152,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
@@ -87,8 +87,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        classifier_dropout (`float`, *optional*):
-            The dropout ratio for the classification head.

    Example:

@@ -124,7 +122,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         pad_token_id=0,
         position_embedding_type="absolute",
         use_cache=True,
-        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -144,7 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
-        self.classifier_dropout = classifier_dropout

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -69,8 +69,6 @@ class MarianConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -130,7 +128,6 @@ class MarianConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=58100,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=58100,
         eos_token_id=0,
@@ -155,7 +152,6 @@ class MarianConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
@@ -64,8 +64,6 @@ class PegasusConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -124,7 +122,6 @@ class PegasusConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=0,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         eos_token_id=1,
@@ -147,7 +144,6 @@ class PegasusConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
@@ -60,8 +60,6 @@ class Speech2Text2Config(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             https://arxiv.org/abs/1909.11556>`__ for more details.
@@ -109,7 +107,6 @@ class Speech2Text2Config(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=2,
-        classifier_dropout=0.0,
         scale_embedding=True,
         pad_token_id=1,
         bos_token_id=0,
@@ -129,7 +126,6 @@ class Speech2Text2Config(PretrainedConfig):
         self.activation_function = activation_function
         self.init_std = init_std
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = decoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
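Hedged usage note: with the assignments above removed, a freshly constructed config from this code version should no longer carry a `classifier_dropout` attribute at all, unless a caller or a checkpoint's config.json still passes one in through `**kwargs`. A small illustrative check, not part of the commit:

from transformers import PegasusConfig, Speech2Text2Config

# Illustrative only: configs built after this cleanup no longer set
# classifier_dropout in __init__, so the attribute is absent by default.
for cfg in (PegasusConfig(), Speech2Text2Config()):
    print(type(cfg).__name__, hasattr(cfg, "classifier_dropout"))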