add missing is_decoder param (#20631)
commit 4f78bcb287
parent 7586a1a367
@@ -114,6 +114,8 @@ class BertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
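The docstring text added above can be exercised directly. Below is a minimal sketch, not part of this commit, of why `use_cache` only matters once `is_decoder=True`; `BertLMHeadModel` is the library's decoder-style head, and the token ids are arbitrary toy values:

```python
import torch
from transformers import BertConfig, BertLMHeadModel

# Configure BERT as a decoder; this is what makes `use_cache` meaningful.
config = BertConfig(is_decoder=True, use_cache=True)
model = BertLMHeadModel(config)  # randomly initialised weights are fine for a sketch

input_ids = torch.tensor([[101, 7592, 2088, 102]])  # toy token ids
outputs = model(input_ids)
# One cached (key, value) entry per layer is returned because the config is a
# decoder with use_cache=True.
print(len(outputs.past_key_values))
```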
@@ -60,6 +60,8 @@ class BertGenerationConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
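For `BertGenerationConfig`, the flag typically matters when the model is the decoder half of an encoder-decoder pair. A hedged sketch of that pairing (an illustrative composition, not part of this commit; `add_cross_attention` is the companion flag that enables the cross-attention layers):

```python
from transformers import (
    BertGenerationConfig,
    BertGenerationDecoder,
    BertGenerationEncoder,
    EncoderDecoderModel,
)

encoder = BertGenerationEncoder(BertGenerationConfig())  # is_decoder defaults to False
decoder = BertGenerationDecoder(
    BertGenerationConfig(is_decoder=True, add_cross_attention=True)
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
```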
@@ -70,6 +70,8 @@ class BigBirdConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -81,6 +81,8 @@ class CamembertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -73,6 +73,8 @@ class Data2VecTextConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -87,6 +87,8 @@ class ErnieConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -79,6 +79,8 @@ class EsmConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -70,6 +70,8 @@ class MegatronBertConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -48,6 +48,8 @@ class NezhaConfig(PretrainedConfig):
             The epsilon used by the layer normalization layers.
         classifier_dropout (`float`, optional, defaults to 0.1):
             The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.

     Example:

@@ -65,6 +65,8 @@ class QDQBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -76,6 +76,8 @@ class RemBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -79,6 +79,8 @@ class RobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
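The context lines above cite the `position_embedding_type` options (`"absolute"`, `"relative_key"`, `"relative_key_query"`). A short sketch of selecting one of the relative variants on a config touched by this diff; the model instantiation is illustrative only:

```python
from transformers import RobertaConfig, RobertaModel

# "relative_key" follows Shaw et al. (2018); "relative_key_query" follows
# Method 4 of Huang et al. (2020), per the docstring context above.
config = RobertaConfig(position_embedding_type="relative_key_query")
model = RobertaModel(config)
print(model.config.position_embedding_type)  # "relative_key_query"
```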
@@ -64,6 +64,8 @@ class RoCBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -84,6 +84,8 @@ class RoFormerConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
@@ -88,6 +88,8 @@ class XLMRobertaConfig(PretrainedConfig):
             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
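Finally, the recurring note that `use_cache` is only relevant when `config.is_decoder=True` can be checked on any of these configs: with the default encoder setting, no key/value cache is built. A hedged sketch with toy ids and random weights (behaviour as I read the current modeling code, not something asserted by this commit):

```python
import torch
from transformers import XLMRobertaConfig, XLMRobertaModel

encoder = XLMRobertaModel(XLMRobertaConfig())  # is_decoder=False by default
out = encoder(torch.tensor([[0, 250, 2]]), use_cache=True)
print(out.past_key_values)  # None: the cache is only produced in decoder mode
```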