Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
distilbert docstring
This commit is contained in:
parent 9678c49419
commit b94f73bab7
@@ -746,8 +746,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
 class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
 r"""
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``Numpy array`` or ``tf.Tensor`` of shape ``(1,)``:
-Next sequence prediction (classification) loss.
 **seq_relationship_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, 2)``
 Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
 **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -125,12 +125,12 @@ class TFEmbeddings(tf.keras.layers.Layer):
 """
 Parameters
 ----------
-input_ids: torch.tensor(bs, max_seq_length)
+input_ids: tf.Tensor(bs, max_seq_length)
 The token ids to embed.

 Outputs
 -------
-embeddings: torch.tensor(bs, max_seq_length, dim)
+embeddings: tf.Tensor(bs, max_seq_length, dim)
 The embedded tokens (plus position embeddings, no token_type embeddings)
 """
 if not isinstance(inputs, (tuple, list)):
@@ -192,16 +192,16 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 """
 Parameters
 ----------
-query: torch.tensor(bs, seq_length, dim)
-key: torch.tensor(bs, seq_length, dim)
-value: torch.tensor(bs, seq_length, dim)
-mask: torch.tensor(bs, seq_length)
+query: tf.Tensor(bs, seq_length, dim)
+key: tf.Tensor(bs, seq_length, dim)
+value: tf.Tensor(bs, seq_length, dim)
+mask: tf.Tensor(bs, seq_length)

 Outputs
 -------
-weights: torch.tensor(bs, n_heads, seq_length, seq_length)
+weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
 Attention weights
-context: torch.tensor(bs, seq_length, dim)
+context: tf.Tensor(bs, seq_length, dim)
 Contextualized layer. Optional: only if `output_attentions=True`
 """
 query, key, value, mask, head_mask = inputs
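For readers tracing the shapes documented in the hunk above, the following is a minimal, self-contained sketch of head splitting plus scaled dot-product attention with made-up sizes; it is illustrative toy code under those shape conventions, not the actual TFMultiHeadSelfAttention implementation:

    import tensorflow as tf

    bs, seq_length, dim, n_heads = 2, 8, 64, 4
    head_dim = dim // n_heads

    query = tf.random.normal((bs, seq_length, dim))
    key = tf.random.normal((bs, seq_length, dim))
    value = tf.random.normal((bs, seq_length, dim))
    mask = tf.ones((bs, seq_length))  # 1 = real token, 0 = padding

    def split_heads(x):
        # (bs, seq_length, dim) -> (bs, n_heads, seq_length, head_dim)
        x = tf.reshape(x, (bs, seq_length, n_heads, head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    q, k, v = split_heads(query), split_heads(key), split_heads(value)
    scores = tf.matmul(q, k, transpose_b=True) / (head_dim ** 0.5)
    scores += (1.0 - mask[:, None, None, :]) * -1e9       # hide padding positions
    weights = tf.nn.softmax(scores, axis=-1)              # (bs, n_heads, seq_length, seq_length)
    context = tf.matmul(weights, v)                       # (bs, n_heads, seq_length, head_dim)
    context = tf.transpose(context, perm=[0, 2, 1, 3])
    context = tf.reshape(context, (bs, seq_length, dim))  # (bs, seq_length, dim)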
@@ -290,14 +290,14 @@ class TFTransformerBlock(tf.keras.layers.Layer):
 """
 Parameters
 ----------
-x: torch.tensor(bs, seq_length, dim)
-attn_mask: torch.tensor(bs, seq_length)
+x: tf.Tensor(bs, seq_length, dim)
+attn_mask: tf.Tensor(bs, seq_length)

 Outputs
 -------
-sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
+sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
 The attention weights
-ffn_output: torch.tensor(bs, seq_length, dim)
+ffn_output: tf.Tensor(bs, seq_length, dim)
 The output of the transformer block contextualization.
 """
 x, attn_mask, head_mask = inputs
@@ -335,19 +335,19 @@ class TFTransformer(tf.keras.layers.Layer):
 """
 Parameters
 ----------
-x: torch.tensor(bs, seq_length, dim)
+x: tf.Tensor(bs, seq_length, dim)
 Input sequence embedded.
-attn_mask: torch.tensor(bs, seq_length)
+attn_mask: tf.Tensor(bs, seq_length)
 Attention mask on the sequence.

 Outputs
 -------
-hidden_state: torch.tensor(bs, seq_length, dim)
+hidden_state: tf.Tensor(bs, seq_length, dim)
 Sequence of hidden states in the last (top) layer
-all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
+all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
 Tuple of length n_layers with the hidden states from each layer.
 Optional: only if output_hidden_states=True
-all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
+all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
 Tuple of length n_layers with the attention weights from each layer
 Optional: only if output_attentions=True
 """
@@ -384,27 +384,6 @@ class TFTransformer(tf.keras.layers.Layer):


 class TFDistilBertMainLayer(tf.keras.layers.Layer):
-r"""
-Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-Sequence of hidden-states at the output of the last layer of the model.
-**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-of shape ``(batch_size, sequence_length, hidden_size)``:
-Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-**attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-Examples::
-
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
-
-"""
 def __init__(self, config, **kwargs):
 super(TFDistilBertMainLayer, self).__init__(**kwargs)
 self.num_hidden_layers = config.num_hidden_layers
@@ -477,9 +456,31 @@ DISTILBERT_START_DOCSTRING = r"""
 For more information on DistilBERT, please refer to our
 `detailed blog post`_

+This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+refer to the TF 2.0 documentation for all matters related to general usage and behavior.
+
 .. _`detailed blog post`:
 https://medium.com/huggingface/distilbert-8cf3380435b5

+.. _`tf.keras.Model`:
+https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+Note on the model inputs:
+TF 2.0 models accept two formats as inputs:
+
+- having all inputs as keyword arguments (like PyTorch models), or
+- having all inputs as a list, tuple or dict in the first positional argument.
+
+This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+- a single Tensor with input_ids only and nothing else: `model(input_ids)`
+- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+- a dictionary with one or several input Tensors associated to the input names given in the docstring:
+`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
 Parameters:
 config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
 Initializing with a config file does not load the weights associated with the model, only the configuration.
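To make the three input formats described in the hunk above concrete, here is a short sketch; it assumes the pretrained weights are downloadable and uses attention_mask rather than token_type_ids, since DistilBERT has no token type embeddings:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer, TFDistilBertModel

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
    attention_mask = tf.ones_like(input_ids)  # no padding in this single sequence

    outputs = model(input_ids)                           # 1. a single Tensor with input_ids only
    outputs = model([input_ids, attention_mask])         # 2. a list, in the order given in the docstring
    outputs = model({'input_ids': input_ids,
                     'attention_mask': attention_mask})  # 3. a dict keyed by the input names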
@@ -488,16 +489,16 @@ DISTILBERT_START_DOCSTRING = r"""
 DISTILBERT_INPUTS_DOCSTRING = r"""
 Inputs:
-**input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**input_ids** ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
 Indices of input sequence tokens in the vocabulary.
 The input sequences should start with `[CLS]` and end with `[SEP]` tokens.

 For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
-**attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
 Mask to avoid performing attention on padding token indices.
 Mask values selected in ``[0, 1]``:
 ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
 Mask to nullify selected heads of the self-attention modules.
 Mask values selected in ``[0, 1]``:
 ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
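As a concrete illustration of the attention_mask convention above (1 for real tokens, 0 for padding), a small hedged sketch for a padded batch of two sequences; the padding id of 0 is an assumption and should really be taken from the tokenizer:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    ids_a = tokenizer.encode("Hello, my dog is cute")
    ids_b = tokenizer.encode("Hi")
    max_len = max(len(ids_a), len(ids_b))
    pad_id = 0  # assumption: 0 is the [PAD] id in this vocabulary

    input_ids = tf.constant([ids_a + [pad_id] * (max_len - len(ids_a)),
                             ids_b + [pad_id] * (max_len - len(ids_b))])
    # 1 for tokens that are NOT MASKED, 0 for MASKED (padding) tokens
    attention_mask = tf.cast(tf.not_equal(input_ids, pad_id), tf.int32)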
@@ -508,21 +509,24 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
 r"""
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
 Sequence of hidden-states at the output of the last layer of the model.
 **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
 of shape ``(batch_size, sequence_length, hidden_size)``:
 Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
 Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

 Examples::

+import tensorflow as tf
+from pytorch_transformers import DistilBertTokenizer, TFDistilBertModel
+
 tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
+input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
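A short sketch of how the optional outputs listed above might be inspected. Passing the flags through from_pretrained is an assumption (mirroring the PyTorch classes); the safer route would be an explicit DistilBertConfig with output_hidden_states / output_attentions set to True:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer, TFDistilBertModel

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    # assumption: unknown kwargs are forwarded to the config
    model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',
                                              output_hidden_states=True,
                                              output_attentions=True)
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]
    last_hidden_state, hidden_states, attentions = model(input_ids)
    print(last_hidden_state.shape)  # (1, sequence_length, hidden_size)
    print(len(hidden_states))       # one per layer + the embedding output
    print(attentions[0].shape)      # (1, num_heads, sequence_length, sequence_length)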
@@ -562,32 +566,27 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
 DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
 r"""
-**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-Labels for computing the masked language modeling loss.
-Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
-in ``[0, ..., config.vocab_size]``

 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-Masked language modeling loss.
-**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
 Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
 **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
 of shape ``(batch_size, sequence_length, hidden_size)``:
 Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
 Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

 Examples::

+import tensorflow as tf
+from pytorch_transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
+
 tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
-outputs = model(input_ids, masked_lm_labels=input_ids)
-loss, prediction_scores = outputs[:2]
+prediction_scores = outputs[0]

 """
 def __init__(self, config, *inputs, **kwargs):
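Since the TF class above returns only prediction_scores (no masked_lm_labels input and therefore no loss), here is a hedged sketch of how predictions and an external loss might be obtained; the SparseCategoricalCrossentropy loss is an illustration, not part of the model:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
    prediction_scores = model(input_ids)[0]  # (1, sequence_length, config.vocab_size)

    predicted_ids = tf.argmax(prediction_scores, axis=-1)  # most likely token id per position

    # an external masked-LM-style loss (here simply against the inputs themselves)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss = loss_fn(input_ids, prediction_scores)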
@@ -620,33 +619,27 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
 DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
 r"""
-**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Labels for computing the sequence classification/regression loss.
-Indices should be in ``[0, ..., config.num_labels - 1]``.
-If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-Classification (or regression if config.num_labels==1) loss.
-**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
 Classification (or regression if config.num_labels==1) scores (before SoftMax).
 **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
 of shape ``(batch_size, sequence_length, hidden_size)``:
 Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
 Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

 Examples::

+import tensorflow as tf
+from pytorch_transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+
 tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
-labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, logits = outputs[:2]
+model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
+outputs = model(input_ids)
+logits = outputs[0]

 """
 def __init__(self, config, *inputs, **kwargs):
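Building on the example above, a minimal sketch of turning the returned logits into probabilities and a predicted class; with an untrained classification head the prediction is of course meaningless:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
    logits = model(input_ids)[0]            # (1, config.num_labels), before SoftMax

    probs = tf.nn.softmax(logits, axis=-1)  # class probabilities
    predicted_class = int(tf.argmax(logits, axis=-1)[0])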
@@ -676,39 +669,31 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
 DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
 r"""
-**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Labels for position (index) of the start of the labelled span for computing the token classification loss.
-Positions are clamped to the length of the sequence (`sequence_length`).
-Position outside of the sequence are not taken into account for computing the loss.
-**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-Labels for position (index) of the end of the labelled span for computing the token classification loss.
-Positions are clamped to the length of the sequence (`sequence_length`).
-Position outside of the sequence are not taken into account for computing the loss.
-
 Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+**start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
 Span-start scores (before SoftMax).
-**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+**end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
 Span-end scores (before SoftMax).
 **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
 of shape ``(batch_size, sequence_length, hidden_size)``:
 Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
 Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

 Examples::

+import tensorflow as tf
+from pytorch_transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
+
 tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
-start_positions = torch.tensor([1])
-end_positions = torch.tensor([3])
+model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
+start_positions = tf.constant([1])
+end_positions = tf.constant([3])
 outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-loss, start_scores, end_scores = outputs[:2]
+start_scores, end_scores = outputs[:2]

 """
 def __init__(self, config, *inputs, **kwargs):
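Finally, a hedged sketch of decoding a span from the start_scores / end_scores described above; an untrained span head gives arbitrary boundaries, so real usage would first fine-tune on a QA dataset:

    import tensorflow as tf
    from pytorch_transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
    token_ids = tokenizer.encode("Hello, my dog is cute")
    input_ids = tf.constant(token_ids)[None, :]  # Batch size 1
    start_scores, end_scores = model(input_ids)[:2]

    start = int(tf.argmax(start_scores, axis=-1)[0])  # most likely span start
    end = int(tf.argmax(end_scores, axis=-1)[0])      # most likely span end
    answer = tokenizer.decode(token_ids[start:end + 1]) if end >= start else ""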