diff --git a/README.md b/README.md index d887d08f5b6..a2aa60cafe0 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'), (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'), - (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'), + (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'), (RobertaModel, RobertaTokenizer, 'roberta-base'), (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'), ] diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index e124e414c91..d708054f41f 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -179,6 +179,14 @@ For a list that includes community-uploaded models, refer to `https://huggingfac | | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. | | | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``distilbert-base-cased`` | | 6-layer, 768-hidden, 12-heads, 65M parameters | +| | | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``distilbert-base-cased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 65M parameters | +| | | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer. | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``distilgpt2`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | | | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. 
| | | | (see `details `__) | diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 9976d04a0d3..42732389a54 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -31,8 +31,10 @@ Here are the results on the dev sets of GLUE: | Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI | | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: | -| BERT-base-uncased | **77.6** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 | -| DistilBERT-base-uncased | **76.8** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 | +| BERT-base-uncased | **74.9** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 | +| DistilBERT-base-uncased | **74.3** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 | +| BERT-base-cased | **78.2** | 58.2 | 83.9 | 87.8 | 91.0 | 89.2 | 66.1 | 91.7 | 89.2 | 46.5 | +| DistilBERT-base-cased | **75.9** | 47.2 | 81.5 | 85.6 | 88.2 | 87.8 | 60.6 | 90.4 | 85.5 | 56.3 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | RoBERTa-base (reported) | **83.2**/**86.4**2 | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.73 | | DistilRoBERTa1 | **79.0**/**82.3**2 | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 | @@ -63,7 +65,9 @@ This part of the library has only be tested with Python3.6+. There are few speci Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT): - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. -- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). +- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches an 82.3 F1 score). +- `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters. +- `distilbert-base-cased-distilled-squad`: A version of `distilbert-base-cased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches an 88.7 F1 score). - `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert.
For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score). - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2. - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base. @@ -72,8 +76,8 @@ Transformers includes five pre-trained Distil* models, currently only provided f Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models. ```python -tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') -model = DistilBertModel.from_pretrained('distilbert-base-uncased') +tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') +model = DistilBertModel.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) outputs = model(input_ids) @@ -81,6 +85,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of ``` Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint: +- DistilBERT uncased: `model = DistilBertModel.from_pretrained('distilbert-base-uncased')` - DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')` - DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')` - DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')` diff --git a/examples/distillation/training_configs/distilbert-base-cased.json b/examples/distillation/training_configs/distilbert-base-cased.json new file mode 100644 index 00000000000..d4f524d704c --- /dev/null +++ b/examples/distillation/training_configs/distilbert-base-cased.json @@ -0,0 +1,15 @@ +{ + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "n_heads": 12, + "n_layers": 6, + "sinusoidal_pos_embds": true, + "tie_weights_": true, + "vocab_size": 28996 + } + \ No newline at end of file diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py index b3386e0ab81..d6fbdbff724 100644 --- a/src/transformers/configuration_distilbert.py +++ b/src/transformers/configuration_distilbert.py @@ -25,6 +25,8 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "distilbert-base-uncased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-config.json", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index a8032f2662e..2ddbaa006a4 100644 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -277,7 +277,7 @@ MODEL_CLASSES = { DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ), - "distilbert-base-uncased-distilled-squad": ( + "distilbert-base-distilled-squad": ( DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index bbccdcddd7d..6634aacaff6 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -38,6 +38,8 @@ logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-pytorch_model.bin", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-pytorch_model.bin", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin", "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin", @@ -440,8 +442,8 @@ class DistilBertModel(DistilBertPreTrainedModel): from transformers import DistilBertTokenizer, DistilBertModel import torch - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = DistilBertModel.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = DistilBertModel.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) @@ -544,8 +546,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): from transformers import DistilBertTokenizer, 
DistilBertForMaskedLM import torch - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] @@ -619,8 +621,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): from transformers import DistilBertTokenizer, DistilBertForSequenceClassification import torch - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) @@ -711,8 +713,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering import torch - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) @@ -798,8 +800,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): from transformers import DistilBertTokenizer, DistilBertForTokenClassification import torch - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 1dc8301730e..6f6eaa3be0f 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -33,6 +33,8 @@ logger = logging.getLogger(__name__) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-tf_model.h5", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-distilled-squad-tf_model.h5", 
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5", "distilbert-base-uncased-finetuned-sst-2-english": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5", } @@ -78,8 +80,6 @@ class TFEmbeddings(tf.keras.layers.Layer): embeddings_initializer=get_initializer(config.initializer_range), name="position_embeddings", ) - if config.sinusoidal_pos_embds: - raise NotImplementedError self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.dropout) @@ -563,8 +563,8 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): import tensorflow as tf from transformers import DistilBertTokenizer, TFDistilBertModel - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = TFDistilBertModel.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = TFDistilBertModel.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -637,8 +637,8 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): import tensorflow as tf from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] @@ -701,8 +701,8 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): import tensorflow as tf from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -759,8 +759,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): import tensorflow as tf from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] @@ -818,8 +818,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): import tensorflow as tf from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering - tokenizer = 
DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') + model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 6f19a8155d4..c3d98545e16 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -941,9 +941,9 @@ SUPPORTED_TASKS = { "tf": TFAutoModel if is_tf_available() else None, "pt": AutoModel if is_torch_available() else None, "default": { - "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"}, + "model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}, "config": None, - "tokenizer": "distilbert-base-uncased", + "tokenizer": "distilbert-base-cased", }, }, "sentiment-analysis": { @@ -978,11 +978,11 @@ SUPPORTED_TASKS = { "pt": AutoModelForQuestionAnswering if is_torch_available() else None, "default": { "model": { - "pt": "distilbert-base-uncased-distilled-squad", - "tf": "distilbert-base-uncased-distilled-squad", + "pt": "distilbert-base-cased-distilled-squad", + "tf": "distilbert-base-cased-distilled-squad", }, "config": None, - "tokenizer": "distilbert-base-uncased", + "tokenizer": "distilbert-base-cased", }, }, "fill-mask": { @@ -1015,7 +1015,7 @@ def pipeline( Examples: pipeline('sentiment-analysis') - pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased') + pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased') pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...) 
pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased') pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased') diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py index 82dbfdb414f..c21a7af0577 100644 --- a/src/transformers/tokenization_distilbert.py +++ b/src/transformers/tokenization_distilbert.py @@ -28,6 +28,8 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", } @@ -36,6 +38,8 @@ PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "distilbert-base-uncased": 512, "distilbert-base-uncased-distilled-squad": 512, + "distilbert-base-cased": 512, + "distilbert-base-cased-distilled-squad": 512, "distilbert-base-german-cased": 512, "distilbert-base-multilingual-cased": 512, } @@ -44,6 +48,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { PRETRAINED_INIT_CONFIGURATION = { "distilbert-base-uncased": {"do_lower_case": True}, "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, + "distilbert-base-cased": {"do_lower_case": False}, + "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, "distilbert-base-german-cased": {"do_lower_case": False}, "distilbert-base-multilingual-cased": {"do_lower_case": False}, } diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 3a4535d1538..1b29842d07b 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -10,13 +10,13 @@ from .utils import require_tf, require_torch QA_FINETUNED_MODELS = { ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), - ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), + ("bert-base-uncased", "distilbert-base-cased-distilled-squad", None), } TF_QA_FINETUNED_MODELS = { ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), - ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), + ("bert-base-uncased", "distilbert-base-cased-distilled-squad", None), } TF_NER_FINETUNED_MODELS = { @@ -38,13 +38,13 @@ NER_FINETUNED_MODELS = { FEATURE_EXTRACT_FINETUNED_MODELS = { ("bert-base-cased", "bert-base-cased", None), # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ("distilbert-base-uncased", "distilbert-base-uncased", None), + ("distilbert-base-cased", "distilbert-base-cased", None), } TF_FEATURE_EXTRACT_FINETUNED_MODELS = { ("bert-base-cased", "bert-base-cased", None), # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ("distilbert-base-uncased", 
"distilbert-base-uncased", None), + ("distilbert-base-cased", "distilbert-base-cased", None), } TF_TEXT_CLASSIF_FINETUNED_MODELS = {