Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-01 02:31:11 +06:00)
add missing is_decoder param (#20631)
commit 4f78bcb287
parent 7586a1a367
@@ -114,6 +114,8 @@ class BertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

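The two added lines only document the existing `is_decoder` argument; they do not change behaviour. As a hedged illustration (not part of the diff, sizes are arbitrary), this is how the flag is typically used to turn a BERT config into a decoder config:

```python
# Illustrative sketch only; layer sizes are arbitrary and not taken from the commit.
from transformers import BertConfig, BertLMHeadModel

config = BertConfig(
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=256,
    is_decoder=True,   # the flag documented by this commit: run BERT with a causal mask
    use_cache=True,    # only meaningful when is_decoder=True; returns past key/values
)
model = BertLMHeadModel(config)  # expects a decoder config (warns if is_decoder is False)
```
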
@@ -60,6 +60,8 @@ class BertGenerationConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

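For the decoder half of an encoder-decoder setup, `is_decoder` is usually combined with `add_cross_attention`. A minimal sketch (not part of the diff; values are illustrative assumptions):

```python
# Illustrative sketch; sizes are assumptions, not from the commit.
from transformers import BertGenerationConfig, BertGenerationDecoder

decoder_config = BertGenerationConfig(
    hidden_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    is_decoder=True,           # the flag documented by this commit
    add_cross_attention=True,  # lets the decoder attend to encoder hidden states
)
decoder = BertGenerationDecoder(decoder_config)
```
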
@@ -70,6 +70,8 @@ class BigBirdConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -81,6 +81,8 @@ class CamembertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -73,6 +73,8 @@ class Data2VecTextConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -87,6 +87,8 @@ class ErnieConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -79,6 +79,8 @@ class EsmConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -70,6 +70,8 @@ class MegatronBertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

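In some workflows `is_decoder` is set implicitly rather than by hand; for example, `EncoderDecoderModel.from_encoder_decoder_pretrained` flips it on the decoder config. A small check, assuming the public `bert-base-uncased` checkpoint is reachable:

```python
# Sketch of how the flag is set implicitly when wiring two BERTs into an encoder-decoder.
from transformers import EncoderDecoderModel

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased"
)
print(model.decoder.config.is_decoder)           # True
print(model.decoder.config.add_cross_attention)  # True
```
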
@@ -48,6 +48,8 @@ class NezhaConfig(PretrainedConfig):
             The epsilon used by the layer normalization layers.
         classifier_dropout (`float`, optional, defaults to 0.1):
             The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
 
     Example:
 

@@ -65,6 +65,8 @@ class QDQBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -76,6 +76,8 @@ class RemBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -79,6 +79,8 @@ class RobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

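The `use_cache` lines kept as context above describe caching that only applies to decoder configs. A rough sketch of what that looks like at the model level (randomly initialized model, illustrative token ids and sizes):

```python
# Illustrative sketch; token ids and layer sizes are arbitrary.
import torch
from transformers import RobertaConfig, RobertaForCausalLM

config = RobertaConfig(
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=128,
    is_decoder=True,
    use_cache=True,
)
model = RobertaForCausalLM(config)

input_ids = torch.tensor([[0, 31414, 232]])
out = model(input_ids)
past = out.past_key_values          # returned because is_decoder and use_cache are True

next_token = torch.tensor([[2]])
out = model(next_token, past_key_values=past)  # incremental step reuses the cache
```
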
@@ -64,6 +64,8 @@ class RoCBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -84,6 +84,8 @@ class RoFormerConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

@@ -88,6 +88,8 @@ class XLMRobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

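The unchanged context lines above refer to the `position_embedding_type` values rather than to anything this commit adds. As a side note, a hedged sketch of selecting the `"relative_key_query"` scheme (layer sizes are illustrative):

```python
# Illustrative sketch; not part of the commit.
from transformers import XLMRobertaConfig, XLMRobertaModel

config = XLMRobertaConfig(
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=128,
    position_embedding_type="relative_key_query",  # Method 4 of Huang et al. (2020)
)
model = XLMRobertaModel(config)
```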