distilbert-base-cased weights + Readmes + omissions

VictorSanh 2020-02-07 19:19:35 +00:00 committed by Victor SANH
parent 73368963b2
commit ee5a6856ca
11 changed files with 77 additions and 39 deletions

View File

@@ -195,7 +195,7 @@ MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
 (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
 (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
 (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
-(DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
+(DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
 (RobertaModel, RobertaTokenizer, 'roberta-base'),
 (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
 ]
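For context, a minimal sketch (not part of the commit, assuming `torch` and `transformers` are installed) of how a tuple from the quickstart's `MODELS` list is typically consumed; the DistilBERT entry above now resolves to the new cased weights:

```python
# Minimal sketch: each MODELS tuple pairs a model class, its tokenizer class,
# and a pretrained checkpoint shortcut name.
import torch
from transformers import DistilBertModel, DistilBertTokenizer

model_class, tokenizer_class, checkpoint = DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'

tokenizer = tokenizer_class.from_pretrained(checkpoint)
model = model_class.from_pretrained(checkpoint)

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # (batch_size, sequence_length, hidden_size)
```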

View File

@@ -179,6 +179,14 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 | | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
 | | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
 | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| | ``distilbert-base-cased`` | | 6-layer, 768-hidden, 12-heads, 65M parameters |
+| | | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint |
+| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| | ``distilbert-base-cased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 65M parameters |
+| | | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer. |
+| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | | ``distilgpt2`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
 | | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
 | | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
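As a hedged illustration (not part of the diff), the two shortcut names added to the table can be loaded through the Auto classes like any other entry:

```python
# Hedged sketch: loading the newly listed cased checkpoints by shortcut name.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModel.from_pretrained("distilbert-base-cased")  # resolves to DistilBertModel

squad_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
squad_model = AutoModel.from_pretrained("distilbert-base-cased-distilled-squad")  # base encoder; use AutoModelForQuestionAnswering for the QA head
```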

View File

@@ -31,8 +31,10 @@ Here are the results on the dev sets of GLUE:
 | Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI |
 | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: |
-| BERT-base-uncased | **77.6** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 |
-| DistilBERT-base-uncased | **76.8** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 |
+| BERT-base-uncased | **74.9** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 |
+| DistilBERT-base-uncased | **74.3** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 |
+| BERT-base-cased | **78.2** | 58.2 | 83.9 | 87.8 | 91.0 | 89.2 | 66.1 | 91.7 | 89.2 | 46.5 |
+| DistilBERT-base-cased | **75.9** | 47.2 | 81.5 | 85.6 | 88.2 | 87.8 | 60.6 | 90.4 | 85.5 | 56.3 |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
 | DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
@@ -63,7 +65,9 @@ This part of the library has only be tested with Python3.6+. There are few speci
 Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
-- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 82.3 F1 score).
+- `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters.
+- `distilbert-base-cased-distilled-squad`: A finetuned version of `distilbert-base-cased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches a 88.7 F1 score).
 - `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
 - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
 - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
@@ -72,8 +76,8 @@ Transformers includes five pre-trained Distil* models, currently only provided f
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
 ```python
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertModel.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
 outputs = model(input_ids)
@@ -81,6 +85,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of
 ```
 Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
+- DistilBERT uncased: `model = DistilBertModel.from_pretrained('distilbert-base-uncased')`
 - DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
 - DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
 - DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
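As a complement to the README snippet above, a hedged sketch (not from the commit) of extractive question answering with the newly added `distilbert-base-cased-distilled-squad` checkpoint; the question/context strings are illustrative only:

```python
# Hedged sketch: running extractive QA with the new cased SQuAD checkpoint.
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')

question = "Which checkpoint supervised the distillation?"  # illustrative input
context = "distilbert-base-cased was distilled with the supervision of bert-base-cased."

input_ids = tokenizer.encode(question, context, add_special_tokens=True)
with torch.no_grad():
    start_scores, end_scores = model(torch.tensor([input_ids]))[:2]

start = int(torch.argmax(start_scores))
end = int(torch.argmax(end_scores)) + 1
answer = tokenizer.decode(input_ids[start:end])  # span with the highest start/end scores
```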

View File

@@ -0,0 +1,15 @@
+{
+"activation": "gelu",
+"attention_dropout": 0.1,
+"dim": 768,
+"dropout": 0.1,
+"hidden_dim": 3072,
+"initializer_range": 0.02,
+"max_position_embeddings": 512,
+"n_heads": 12,
+"n_layers": 6,
+"sinusoidal_pos_embds": true,
+"tie_weights_": true,
+"vocab_size": 28996
+}
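A hedged sketch (not part of the commit) of how these JSON fields map onto `DistilBertConfig` keyword arguments; in practice `DistilBertModel.from_pretrained('distilbert-base-cased')` fetches this configuration automatically, so building it by hand is only useful for experimentation:

```python
# Hedged sketch: a config roughly equivalent to the JSON above.
from transformers import DistilBertConfig, DistilBertModel

config = DistilBertConfig(
    vocab_size=28996,            # size of the bert-base-cased WordPiece vocabulary
    max_position_embeddings=512,
    sinusoidal_pos_embds=True,   # fixed sinusoidal position embeddings
    n_layers=6,
    n_heads=12,
    dim=768,
    hidden_dim=3072,
    dropout=0.1,
    attention_dropout=0.1,
    activation="gelu",
    initializer_range=0.02,
)
model = DistilBertModel(config)  # randomly initialized; use from_pretrained() for the released weights
```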

View File

@@ -25,6 +25,8 @@ logger = logging.getLogger(__name__)
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
 "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
+"distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json",
+"distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json",
 "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
 "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
 "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json",

View File

@@ -277,7 +277,7 @@ MODEL_CLASSES = {
 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
 DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
 ),
-"distilbert-base-uncased-distilled-squad": (
+"distilbert-base-distilled-squad": (
 DistilBertConfig,
 TFDistilBertForQuestionAnswering,
 DistilBertForQuestionAnswering,

View File

@@ -38,6 +38,8 @@ logger = logging.getLogger(__name__)
 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
 "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
+"distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-pytorch_model.bin",
+"distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-pytorch_model.bin",
 "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
 "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
 "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin",
@@ -440,8 +442,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
 from transformers import DistilBertTokenizer, DistilBertModel
 import torch
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertModel.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
 outputs = model(input_ids)
@@ -544,8 +546,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
 from transformers import DistilBertTokenizer, DistilBertForMaskedLM
 import torch
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
 outputs = model(input_ids, masked_lm_labels=input_ids)
 loss, prediction_scores = outputs[:2]
@@ -619,8 +621,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 import torch
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
 outputs = model(input_ids, labels=labels)
@@ -711,8 +713,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
 from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
 import torch
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
 start_positions = torch.tensor([1])
 end_positions = torch.tensor([3])
@@ -798,8 +800,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
 from transformers import DistilBertTokenizer, DistilBertForTokenClassification
 import torch
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
 outputs = model(input_ids, labels=labels)

View File

@@ -33,6 +33,8 @@ logger = logging.getLogger(__name__)
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
 "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
+"distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-tf_model.h5",
+"distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-tf_model.h5",
 "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
 "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5",
 }
@@ -78,8 +80,6 @@ class TFEmbeddings(tf.keras.layers.Layer):
 embeddings_initializer=get_initializer(config.initializer_range),
 name="position_embeddings",
 )
-if config.sinusoidal_pos_embds:
-raise NotImplementedError
 self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
 self.dropout = tf.keras.layers.Dropout(config.dropout)
@@ -563,8 +563,8 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
 import tensorflow as tf
 from transformers import DistilBertTokenizer, TFDistilBertModel
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
@@ -637,8 +637,8 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
 import tensorflow as tf
 from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 prediction_scores = outputs[0]
@@ -701,8 +701,8 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
 import tensorflow as tf
 from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 logits = outputs[0]
@@ -759,8 +759,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
 import tensorflow as tf
 from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 scores = outputs[0]
@@ -818,8 +818,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
 import tensorflow as tf
 from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
 outputs = model(input_ids)
 start_scores, end_scores = outputs[:2]

View File

@@ -941,9 +941,9 @@ SUPPORTED_TASKS = {
 "tf": TFAutoModel if is_tf_available() else None,
 "pt": AutoModel if is_torch_available() else None,
 "default": {
-"model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"},
+"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"},
 "config": None,
-"tokenizer": "distilbert-base-uncased",
+"tokenizer": "distilbert-base-cased",
 },
 },
 "sentiment-analysis": {
@@ -978,11 +978,11 @@ SUPPORTED_TASKS = {
 "pt": AutoModelForQuestionAnswering if is_torch_available() else None,
 "default": {
 "model": {
-"pt": "distilbert-base-uncased-distilled-squad",
-"tf": "distilbert-base-uncased-distilled-squad",
+"pt": "distilbert-base-cased-distilled-squad",
+"tf": "distilbert-base-cased-distilled-squad",
 },
 "config": None,
-"tokenizer": "distilbert-base-uncased",
+"tokenizer": "distilbert-base-cased",
 },
 },
 "fill-mask": {
@@ -1015,7 +1015,7 @@ def pipeline(
 Examples:
 pipeline('sentiment-analysis')
-pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
+pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
 pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
 pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
 pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
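To illustrate the new defaults above, a hedged usage sketch (not part of the diff): a question-answering pipeline created without explicit `model`/`tokenizer` arguments now resolves to the cased SQuAD checkpoint.

```python
# Hedged sketch: the default QA pipeline now loads distilbert-base-cased-distilled-squad.
from transformers import pipeline

nlp = pipeline('question-answering')  # no model/tokenizer given -> uses the defaults above
result = nlp(
    question="Which checkpoint does the QA pipeline load by default?",  # illustrative input
    context="The question-answering pipeline defaults to distilbert-base-cased-distilled-squad.",
)
print(result['answer'], result['score'])
```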

View File

@@ -28,6 +28,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
 "vocab_file": {
 "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
 "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+"distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+"distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
 "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
 "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
 }
@@ -36,6 +38,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 "distilbert-base-uncased": 512,
 "distilbert-base-uncased-distilled-squad": 512,
+"distilbert-base-cased": 512,
+"distilbert-base-cased-distilled-squad": 512,
 "distilbert-base-german-cased": 512,
 "distilbert-base-multilingual-cased": 512,
 }
@@ -44,6 +48,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 PRETRAINED_INIT_CONFIGURATION = {
 "distilbert-base-uncased": {"do_lower_case": True},
 "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
+"distilbert-base-cased": {"do_lower_case": False},
+"distilbert-base-cased-distilled-squad": {"do_lower_case": False},
 "distilbert-base-german-cased": {"do_lower_case": False},
 "distilbert-base-multilingual-cased": {"do_lower_case": False},
 }
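A small hedged sketch (not part of the diff) of why `do_lower_case` is `False` for the cased entries registered above: the cased tokenizer preserves capitalization, while the uncased one lowercases its input.

```python
# Hedged sketch: casing behaviour of the two tokenizers (outputs are indicative).
from transformers import DistilBertTokenizer

cased = DistilBertTokenizer.from_pretrained('distilbert-base-cased')      # do_lower_case=False
uncased = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')  # do_lower_case=True

print(cased.tokenize("Hello World"))    # e.g. ['Hello', 'World']
print(uncased.tokenize("Hello World"))  # e.g. ['hello', 'world']
```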

View File

@@ -10,13 +10,13 @@ from .utils import require_tf, require_torch
 QA_FINETUNED_MODELS = {
 ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
 ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
-("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
+("bert-base-uncased", "distilbert-base-cased-distilled-squad", None),
 }
 TF_QA_FINETUNED_MODELS = {
 ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None),
 ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None),
-("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None),
+("bert-base-uncased", "distilbert-base-cased-distilled-squad", None),
 }
 TF_NER_FINETUNED_MODELS = {
@@ -38,13 +38,13 @@ NER_FINETUNED_MODELS = {
 FEATURE_EXTRACT_FINETUNED_MODELS = {
 ("bert-base-cased", "bert-base-cased", None),
 # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-("distilbert-base-uncased", "distilbert-base-uncased", None),
+("distilbert-base-cased", "distilbert-base-cased", None),
 }
 TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
 ("bert-base-cased", "bert-base-cased", None),
 # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
-("distilbert-base-uncased", "distilbert-base-uncased", None),
+("distilbert-base-cased", "distilbert-base-cased", None),
 }
 TF_TEXT_CLASSIF_FINETUNED_MODELS = {