Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-03 03:31:05 +06:00)

Updated Configurations

This commit is contained in:
parent 2b566c182e
commit 632682726f
@ -1,7 +1,7 @@
|
||||
ALBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``AlbrtConfig``
|
||||
``AlbertConfig``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertConfig
|
||||
|
@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class AlbertConfig(PretrainedConfig):
|
||||
"""Configuration for `AlbertModel`.
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
|
||||
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the ALBERT xxlarge architecture.
|
||||
|
||||
The default settings match the configuration of model `albert_xxlarge`.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30000):
|
||||
Vocabulary size of the ALBERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
|
||||
embedding_size (:obj:`int`, optional, defaults to 128):
|
||||
Size of vocabulary embeddings.
|
||||
hidden_size (:obj:`int`, optional, defaults to 4096):
|
||||
Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_hidden_groups (:obj:`int`, optional, defaults to 1):
|
||||
Number of groups for the hidden layers; parameters in the same group are shared.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 64):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 16384):
|
||||
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
inner_group_num (:obj:`int`, optional, defaults to 1):
|
||||
The number of inner repetitions of attention and FFN.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something
|
||||
large (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
|
||||
Example::
|
||||
|
||||
# Initializing an ALBERT-xxlarge style configuration
|
||||
albert_xxlarge_configuration = AlbertConfig()
|
||||
|
||||
# Initializing an ALBERT-base style configuration
|
||||
albert_base_configuration = AlbertConfig(
|
||||
hidden_size=768,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
)
|
||||
|
||||
# Initializing a model from the ALBERT-base style configuration
|
||||
model = AlbertModel(albert_base_configuration)
|
||||
|
||||
# Accessing the model configuration
|
||||
configuration = model.config
|
||||
|
||||
Attributes:
|
||||
pretrained_config_archive_map (Dict[str, str]):
|
||||
A dictionary containing all the available pre-trained checkpoints.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig):
|
||||
layer_norm_eps=1e-12,
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs AlbertConfig.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
|
||||
embedding_size: size of voc embeddings.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_hidden_groups: Number of group for the hidden layers, parameters in
|
||||
the same group are shared.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
inner_group_num: int, number of inner repetition of attention and ffn.
|
||||
down_scale_factor: float, the scale to apply
|
||||
hidden_act: The non-linear activation function (function or string) in the
|
||||
encoder and pooler.
|
||||
hidden_dropout_prob: The dropout probability for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||
probabilities.
|
||||
max_position_embeddings: The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||
`AlbertModel`.
|
||||
initializer_range: The stdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(AlbertConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
|
@ -57,29 +57,13 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
|
||||
|
||||
class AutoConfig(object):
|
||||
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
||||
r"""
|
||||
:class:`~transformers.AutoConfig` is a generic configuration class
|
||||
that will be instantiated as one of the configuration classes of the library
|
||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
|
||||
|
||||
The `from_pretrained()` method take care of returning the correct model class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `albert`: AlbertConfig (ALBERT model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string argument.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@ -94,6 +78,8 @@ class AutoConfig(object):
|
||||
return DistilBertConfig(*args, **kwargs)
|
||||
elif "roberta" in model_type:
|
||||
return RobertaConfig(*args, **kwargs)
|
||||
elif "albert" in model_type:
|
||||
return AlbertConfig(*args, **kwargs)
|
||||
elif "bert" in model_type:
|
||||
return BertConfig(*args, **kwargs)
|
||||
elif "openai-gpt" in model_type:
|
||||
@ -108,8 +94,6 @@ class AutoConfig(object):
|
||||
return XLMConfig(*args, **kwargs)
|
||||
elif "ctrl" in model_type:
|
||||
return CTRLConfig(*args, **kwargs)
|
||||
elif "albert" in model_type:
|
||||
return AlbertConfig(*args, **kwargs)
|
||||
elif "camembert" in model_type:
|
||||
return CamembertConfig(*args, **kwargs)
|
||||
raise ValueError(
|
||||
@ -120,59 +104,60 @@ class AutoConfig(object):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||
r""" Instantiate a one of the configuration classes of the library
|
||||
r""" Instantiates one of the configuration classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The configuration class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `t5`: T5Config (T5 model)
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `albert`: AlbertConfig (ALBERT model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- contains `t5`: :class:`~transformers.T5Config` (T5 model)
|
||||
- contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
|
||||
- contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
|
||||
- contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
|
||||
- contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
|
||||
- contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
|
||||
- contains `bert`: :class:`~transformers.BertConfig` (Bert model)
|
||||
- contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
|
||||
- contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
|
||||
- contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
|
||||
- contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
|
||||
- contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Args:
|
||||
pretrained_model_name_or_path (:obj:`string`):
|
||||
Is either: \
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
cache_dir (:obj:`string`, optional, defaults to `None`):
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
|
||||
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
|
||||
force_download (:obj:`boolean`, optional, defaults to `False`):
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||
|
||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||
resume_download (:obj:`boolean`, optional, defaults to `False`):
|
||||
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
return_unused_kwargs: (`optional`) bool:
|
||||
proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
|
||||
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
|
||||
|
||||
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
|
||||
- If False, then this function returns just the final configuration object.
|
||||
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
|
||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||
|
||||
|
||||
Examples::
|
||||
|
||||
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
|
||||
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
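A further sketch of the ``return_unused_kwargs`` behaviour documented above; ``output_attentions`` is assumed to be a genuine configuration attribute while ``foo`` is not::

    config, unused_kwargs = AutoConfig.from_pretrained(
        'bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
    assert config.output_attentions == True
    assert unused_kwargs == {'foo': False}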
|
||||
|
@ -50,32 +50,44 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
class BertConfig(PretrainedConfig):
|
||||
r"""
|
||||
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
|
||||
`BertModel`.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
|
||||
It is used to instantiate a BERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the BERT bert-base-uncased architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Arguments:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
hidden_act: The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||
probabilities.
|
||||
max_position_embeddings: The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||
`BertModel`.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_eps: The epsilon used by LayerNorm.
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||
Vocabulary size of the BERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
|
||||
hidden_size (:obj:`int`, optional, defaults to 768):
|
||||
Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
"""
|
||||
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
@ -96,6 +108,7 @@ class BertConfig(PretrainedConfig):
|
||||
**kwargs
|
||||
):
|
||||
super(BertConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
|
@ -29,4 +29,17 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class CamembertConfig(RobertaConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`.
|
||||
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the BERT bert-base-uncased architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
|
||||
It reuses the same defaults. Please check the parent class for more information.
|
||||
"""
|
||||
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
@ -26,25 +26,43 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
|
||||
|
||||
|
||||
class CTRLConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `CTRLModel`.
|
||||
"""
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
|
||||
It is used to instantiate a CTRL model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the CTRL architecture from SalesForce.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
dff: Size of the inner dimension of the FFN.
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 246534):
|
||||
Vocabulary size of the CTRL model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
|
||||
n_positions (:obj:`int`, optional, defaults to 256):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, optional, defaults to 256):
|
||||
Size of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, optional, defaults to 1280):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
dff (:obj:`int`, optional, defaults to 8192):
|
||||
Size of the inner dimension of the FFN.
|
||||
n_layer (:obj:`int`, optional, defaults to 48):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
|
||||
The epsilon to use in the layer normalization layers
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
@ -70,26 +88,6 @@ class CTRLConfig(PretrainedConfig):
|
||||
summary_first_dropout=0.1,
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs CTRLConfig.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
dff: Size of the inner dimension of the FFN.
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(CTRLConfig, self).__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
|
@ -31,6 +31,50 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class DistilBertConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
|
||||
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the DistilBERT distilbert-base-uncased architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||
Vocabulary size of the DistilBERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Whether to use sinusoidal positional embeddings.
|
||||
n_layers (:obj:`int`, optional, defaults to 6):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_heads (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
dim (:obj:`int`, optional, defaults to 768):
|
||||
Size of the encoder layers and the pooler layer.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
qa_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probabilities used in the question answering model
|
||||
:class:`~transformers.DistilBertForQuestionAnswering`.
|
||||
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
|
||||
The dropout probabilities used in the sequence classification model
|
||||
:class:`~transformers.DistilBertForSequenceClassification`.
|
||||
"""
|
||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(
|
||||
@ -46,7 +90,6 @@ class DistilBertConfig(PretrainedConfig):
|
||||
attention_dropout=0.1,
|
||||
activation="gelu",
|
||||
initializer_range=0.02,
|
||||
tie_weights_=True,
|
||||
qa_dropout=0.1,
|
||||
seq_classif_dropout=0.2,
|
||||
**kwargs
|
||||
@ -63,7 +106,6 @@ class DistilBertConfig(PretrainedConfig):
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation = activation
|
||||
self.initializer_range = initializer_range
|
||||
self.tie_weights_ = tie_weights_
|
||||
self.qa_dropout = qa_dropout
|
||||
self.seq_classif_dropout = seq_classif_dropout
|
||||
|
||||
|
@ -33,24 +33,42 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class GPT2Config(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||
"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
|
||||
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the GPT-2 small architecture.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 50257):
|
||||
Vocabulary size of the GPT-2 model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
|
||||
n_positions (:obj:`int`, optional, defaults to 1024):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, optional, defaults to 1024):
|
||||
Size of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, optional, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||
The epsilon to use in the layer normalization layers
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
@ -75,26 +93,8 @@ class GPT2Config(PretrainedConfig):
|
||||
summary_first_dropout=0.1,
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs GPT2Config.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(GPT2Config, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
|
@ -26,9 +26,13 @@ class MMBTConfig(object):
|
||||
"""Configuration class to store the configuration of a `MMBT Model`.
|
||||
|
||||
Args:
|
||||
config: config of the underlying Transformer models. It's values are copied over to use a single config.
|
||||
num_labels: Size of final Linear layer for classification.
|
||||
modal_hidden_size: Embedding dimension of the non-text modality encoder.
|
||||
config (:obj:`~transformers.PreTrainedConfig`):
|
||||
Config of the underlying Transformer models. Its values are
|
||||
copied over to use a single config.
|
||||
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
|
||||
Size of final Linear layer for classification.
|
||||
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
|
||||
Embedding dimension of the non-text modality encoder.
|
||||
"""
|
||||
|
||||
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
|
||||
|
@ -30,27 +30,45 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
class OpenAIGPTConfig(PretrainedConfig):
|
||||
"""
|
||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
|
||||
It is used to instantiate a GPT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the GPT architecture from OpenAI.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
afn: The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 40478):
|
||||
Vocabulary size of the GPT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
|
||||
n_positions (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, optional, defaults to 512):
|
||||
Size of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, optional, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||
The epsilon to use in the layer normalization layers
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Whether special tokens should be predicted when the model has a language modeling head.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
@ -77,9 +95,8 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
summary_first_dropout=0.1,
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs OpenAIGPTConfig.
|
||||
"""
|
||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
|
@ -34,4 +34,17 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class RobertaConfig(BertConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
|
||||
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the BERT bert-base-uncased architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
|
||||
It reuses the same defaults. Please check the parent class for more information.
|
||||
"""
|
||||
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
@ -29,39 +29,74 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class TransfoXLConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
||||
"""
|
||||
This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
|
||||
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the Transformer XL architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
||||
cutoffs: cutoffs for the adaptive softmax
|
||||
d_model: Dimensionality of the model's hidden states.
|
||||
d_embed: Dimensionality of the embeddings
|
||||
d_head: Dimensionality of the model's heads.
|
||||
div_val: divident value for adapative input and softmax
|
||||
pre_lnorm: apply LayerNorm to the input instead of the output
|
||||
d_inner: Inner dimension in FF
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
tgt_len: number of tokens to predict
|
||||
ext_len: length of the extended context
|
||||
mem_len: length of the retained previous heads
|
||||
same_length: use the same attn length for all tokens
|
||||
proj_share_all_but_first: True to share all but first projs, False not to share.
|
||||
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
||||
clamp_len: use the same pos embeddings after clamp_len
|
||||
sample_softmax: number of samples in sampled softmax
|
||||
adaptive: use adaptive softmax
|
||||
tie_weight: tie the word embedding and softmax weights
|
||||
dropout: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
dropatt: The dropout ratio for the attention probabilities.
|
||||
untie_r: untie relative position biases
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
init: parameter initializer to use
|
||||
init_range: parameters initialized by U(-init_range, init_range).
|
||||
proj_init_std: parameters initialized by N(0, init_std)
|
||||
init_std: parameters initialized by N(0, init_std)
|
||||
vocab_size (:obj:`int`, optional, defaults to 267735):
|
||||
Vocabulary size of the Transformer XL model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
|
||||
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
|
||||
Cutoffs for the adaptive softmax
|
||||
d_model (:obj:`int`, optional, defaults to 1024):
|
||||
Dimensionality of the model's hidden states.
|
||||
d_embed (:obj:`int`, optional, defaults to 1024):
|
||||
Dimensionality of the embeddings
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
d_head (:obj:`int`, optional, defaults to 64):
|
||||
Dimensionality of the model's heads.
|
||||
d_inner (:obj:`int`, optional, defaults to 4096):
|
||||
Inner dimension in FF
|
||||
div_val (:obj:`int`, optional, defaults to 4):
|
||||
Dividend value for adaptive input and softmax
|
||||
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Apply LayerNorm to the input instead of the output
|
||||
n_layer (:obj:`int`, optional, defaults to 18):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
tgt_len (:obj:`int`, optional, defaults to 128):
|
||||
Number of tokens to predict
|
||||
ext_len (:obj:`int`, optional, defaults to 0):
|
||||
Length of the extended context
|
||||
mem_len (:obj:`int`, optional, defaults to 1600):
|
||||
Length of the retained previous heads
|
||||
clamp_len (:obj:`int`, optional, defaults to 1000):
|
||||
use the same pos embeddings after clamp_len
|
||||
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Use the same attn length for all tokens
|
||||
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
True to share all but first projs, False not to share.
|
||||
attn_type (:obj:`int`, optional, defaults to 0):
|
||||
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
||||
sample_softmax (:obj:`int`, optional, defaults to -1):
|
||||
number of samples in sampled softmax
|
||||
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
use adaptive softmax
|
||||
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
tie the word embedding and softmax weights
|
||||
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
dropatt (:obj:`float`, optional, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Untie relative position biases
|
||||
init (:obj:`string`, optional, defaults to `normal`):
|
||||
Parameter initializer to use
|
||||
init_range (:obj:`float`, optional, defaults to 0.01):
|
||||
Parameters initialized by U(-init_range, init_range).
|
||||
proj_init_std (:obj:`float`, optional, defaults to 0.01):
|
||||
Parameters initialized by N(0, init_std)
|
||||
init_std (:obj:`float`, optional, defaults to 0.02):
|
||||
Parameters initialized by N(0, init_std)
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
|
||||
The epsilon to use in the layer normalization layers
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
@ -98,9 +133,8 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
layer_norm_epsilon=1e-5,
|
||||
**kwargs
|
||||
):
|
||||
"""Constructs TransfoXLConfig.
|
||||
"""
|
||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.cutoffs = []
|
||||
self.cutoffs.extend(cutoffs)
|
||||
|
@ -37,44 +37,81 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class XLMConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `XLMModel`.
|
||||
"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
|
||||
It is used to instantiate an XLM model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
|
||||
d_model: Size of the encoder layers and the pooler layer.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
d_inner: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
ff_activation: The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||
untie_r: untie relative position biases
|
||||
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
dropout: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
max_position_embeddings: The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_eps: The epsilon used by LayerNorm.
|
||||
|
||||
dropout: float, dropout rate.
|
||||
init: str, the initialization scheme, either "normal" or "uniform".
|
||||
init_range: float, initialize the parameters with a uniform distribution
|
||||
in [-init_range, init_range]. Only effective when init="uniform".
|
||||
init_std: float, initialize the parameters with a normal distribution
|
||||
with mean 0 and stddev init_std. Only effective when init="normal".
|
||||
mem_len: int, the number of tokens to cache.
|
||||
reuse_len: int, the number of tokens in the currect batch to be cached
|
||||
and reused in the future.
|
||||
bi_data: bool, whether to use bidirectional input pipeline.
|
||||
Usually set to True during pretraining and False during finetuning.
|
||||
clamp_len: int, clamp all relative distances larger than clamp_len.
|
||||
-1 means no clamping.
|
||||
same_length: bool, whether to use the same attention length for each token.
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30145):
|
||||
Vocabulary size of the XLM model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
|
||||
emb_dim (:obj:`int`, optional, defaults to 2048):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
n_layer (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for the attention mechanism
|
||||
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
|
||||
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
|
||||
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Set this to `True` for the model to behave in a causal manner.
|
||||
Causal models use a triangular attention mask in order to only attend to the left-side context instead
|
||||
of a bidirectional context.
|
||||
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
TODO
|
||||
n_langs (:obj:`int`, optional, defaults to 1):
|
||||
The number of languages the model handles. Set to 1 for monolingual models.
|
||||
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Whether to use language embeddings. Some models use additional language embeddings, see
|
||||
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
|
||||
for information on how to use them.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing the embedding matrices.
|
||||
init_std (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices except the embedding matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
bos_index (:obj:`int`, optional, defaults to 0):
|
||||
The index of the beginning of sentence token in the vocabulary.
|
||||
eos_index (:obj:`int`, optional, defaults to 1):
|
||||
The index of the end of sentence token in the vocabulary.
|
||||
pad_index (:obj:`int`, optional, defaults to 2):
|
||||
The index of the padding token in the vocabulary.
|
||||
unk_index (:obj:`int`, optional, defaults to 3):
|
||||
The index of the unknown token in the vocabulary.
|
||||
mask_index (:obj:`int`, optional, defaults to 5):
|
||||
The index of the masking token in the vocabulary.
|
||||
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
|
||||
start_n_top (:obj:`int`, optional, defaults to 5):
|
||||
TODO
|
||||
end_n_top (:obj:`int`, optional, defaults to 5):
|
||||
TODO
|
||||
mask_token_id (:obj:`int`, optional, defaults to 0):
|
||||
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
|
||||
lang_id (:obj:`int`, optional, defaults to 1):
|
||||
The ID of the language used by the model. This parameter is used when generating
|
||||
text in a given language.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
@ -30,42 +30,60 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class XLNetConfig(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a ``XLNetModel``.
|
||||
"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
|
||||
It is used to instantiate an XLNet model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
||||
d_model: Size of the encoder layers and the pooler layer.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
d_inner: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
ff_activation: The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||
untie_r: untie relative position biases
|
||||
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
dropout: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_eps: The epsilon used by LayerNorm.
|
||||
|
||||
dropout: float, dropout rate.
|
||||
init: str, the initialization scheme, either "normal" or "uniform".
|
||||
init_range: float, initialize the parameters with a uniform distribution
|
||||
in [-init_range, init_range]. Only effective when init="uniform".
|
||||
init_std: float, initialize the parameters with a normal distribution
|
||||
with mean 0 and stddev init_std. Only effective when init="normal".
|
||||
mem_len: int, the number of tokens to cache.
|
||||
reuse_len: int, the number of tokens in the currect batch to be cached
|
||||
and reused in the future.
|
||||
bi_data: bool, whether to use bidirectional input pipeline.
|
||||
Usually set to True during pretraining and False during finetuning.
|
||||
clamp_len: int, clamp all relative distances larger than clamp_len.
|
||||
-1 means no clamping.
|
||||
same_length: bool, whether to use the same attention length for each token.
|
||||
finetuning_task: name of the glue task on which the model was fine-tuned if any
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 32000):
|
||||
Vocabulary size of the XLNet model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
|
||||
d_model (:obj:`int`, optional, defaults to 1024):
|
||||
Size of the encoder layers and the pooler layer.
|
||||
n_layer (:obj:`int`, optional, defaults to 24):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
d_inner (:obj:`int`, optional, defaults to 4096):
|
||||
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
ff_activation (:obj:`string`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Untie relative position biases
|
||||
attn_type (:obj:`string`, optional, defaults to "bi"):
|
||||
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||
The number of tokens to cache. The key/value pairs that have already been pre-computed
|
||||
in a previous forward pass won't be re-computed. See the
|
||||
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
|
||||
for more information.
|
||||
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||
The number of tokens in the current batch to be cached and reused in the future.
|
||||
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Whether to use bidirectional input pipeline. Usually set to `True` during
|
||||
pretraining and `False` during finetuning.
|
||||
clamp_len (:obj:`int`, optional, defaults to -1):
|
||||
Clamp all relative distances larger than clamp_len.
|
||||
Setting this attribute to -1 means no clamping.
|
||||
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Whether to use the same attention length for each token.
|
||||
start_n_top (:obj:`int`, optional, defaults to 5):
|
||||
TODO
|
||||
end_n_top (:obj:`int`, optional, defaults to 5):
|
||||
TODO
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|