diff --git a/.gitignore b/.gitignore index bbc738b931d..d285d0ded93 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,5 @@ runs examples/runs # data -data \ No newline at end of file +data +serialization_dir \ No newline at end of file diff --git a/README.md b/README.md index c473111f2b1..09523401ef6 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ These implementations have been tested on several datasets (see the example scri | Section | Description | |-|-| | [Installation](#installation) | How to install the package | +| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities | | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers | @@ -68,6 +69,14 @@ It contains an example of a conversion script from a Pytorch trained Transformer At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! +## Online demo + +**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities. +You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`. + +> “🦄 Write with transformer is to writing what calculators are to calculus.” + +![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png) ## Quick tour diff --git a/docs/README.md b/docs/README.md index 1b3c1feade6..6804a22e696 100644 --- a/docs/README.md +++ b/docs/README.md @@ -34,6 +34,13 @@ pip install recommonmark ## Building the documentation +Make sure that there is a symlink from the examples `README.md` file (in /examples) to `source/examples.md` inside the source folder. Run the following +command to generate it: + +```bash +ln -s ../../examples/README.md source/examples.md +``` + Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: ```bash diff --git a/docs/requirements.txt b/docs/requirements.txt index 112beb3f72b..0c2a31c09ac 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -26,3 +26,4 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.2 sphinxcontrib-serializinghtml==1.1.3 urllib3==1.25.3 +sphinx-markdown-tables==0.0.9 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index cdca1d82d0c..c847dee8066 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,7 +43,8 @@ extensions = [ 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'recommonmark', - 'sphinx.ext.viewcode' + 'sphinx.ext.viewcode', + 'sphinx_markdown_tables' ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/examples.rst b/docs/source/examples.rst deleted file mode 100644 index d9784514382..00000000000 --- a/docs/source/examples.rst +++ /dev/null @@ -1,686 +0,0 @@ -examples.rst - -Examples -================================================ - ..
list-table:: - :header-rows: 1 - - * - Sub-section - - Description - * - `Training large models: introduction, tools and examples <#introduction>`_ - - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models - * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_ - - Running the examples in `examples `_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py`` - * - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_ - - Running the examples in `examples `_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py`` - * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_ - - How to fine tune ``BERT large`` - - -.. _introduction: - -Training large models: introduction, tools and examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32). - -To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py `_ and `run_bert_squad.py `_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch `_ that I published earlier this year. - -Here is how to use these techniques in our scripts: - - -* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying a integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradient will be accumulated over ``gradient_accumulation_steps`` steps. -* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs. -* **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below). -* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here `__ and a full documentation is `here `__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static. - -To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here `__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository `_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository `_. 
- -Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post `_\ ) for more details): - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node=4 \ - --nnodes=2 \ - --node_rank=$THIS_MACHINE_INDEX \ - --master_addr="192.168.1.1" \ - --master_port=1234 run_bert_classifier.py \ - (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script) - -Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``. - -.. _fine-tuning-bert-examples: - -Fine-tuning with BERT: running the examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We showcase several fine-tuning examples based on (and extended from) `the original implementation `_\ : - - -* a *sequence-level classifier* on nine different GLUE tasks, -* a *token-level classifier* on the question answering dataset SQuAD, and -* a *sequence-level multiple-choice classifier* on the SWAG classification corpus. -* a *BERT language model* on another target corpus - -GLUE results on dev set -~~~~~~~~~~~~~~~~~~~~~~~ - -We get the following results on the dev set of GLUE benchmark with an uncased BERT base -model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of -these tasks have a small dataset and training can lead to high variance in the results between different runs. -We report the median on 5 runs (with different seeds) for each of the metrics. - -.. list-table:: - :header-rows: 1 - - * - Task - - Metric - - Result - * - CoLA - - Matthew's corr. - - 55.75 - * - SST-2 - - accuracy - - 92.09 - * - MRPC - - F1/accuracy - - 90.48/86.27 - * - STS-B - - Pearson/Spearman corr. - - 89.03/88.64 - * - QQP - - accuracy/F1 - - 90.92/87.72 - * - MNLI - - matched acc./mismatched acc. - - 83.74/84.06 - * - QNLI - - accuracy - - 91.07 - * - RTE - - accuracy - - 68.59 - * - WNLI - - accuracy - - 43.66 - - -Some of these results are significantly different from the ones reported on the test set -of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 `_ on the webite. - -Before running anyone of these GLUE tasks you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - export TASK_NAME=MRPC - - python run_bert_classifier.py \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/$TASK_NAME/ - -where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. - -The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'. - -The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. 
With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor. - -MRPC -~~~~ - -This example code fine-tunes BERT on the Microsoft Research Paraphrase -Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed. - -Before running this example you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_bert_classifier.py \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Our test ran on a few seeds with `the original implementation hyper-parameters `__ gave evaluation results between 84% and 88%. - -**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!** -First install apex as indicated `here `__. -Then run - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_bert_classifier.py \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ \ - --fp16 - -**Distributed training** -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC: - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name MRPC \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/MRPC/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - acc = 0.8823529411764706 - acc_and_f1 = 0.901702786377709 - eval_loss = 0.3418912578906332 - f1 = 0.9210526315789473 - global_step = 174 - loss = 0.07231863956341798 - -Here is an example on MNLI: - -.. code-block:: bash - - python -m torch.distributed.launch \ - --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name mnli \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /datadrive/bert_data/glue_data//MNLI/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir ../models/wwm-uncased-finetuned-mnli/ \ - --overwrite_output_dir - -.. code-block:: bash - - ***** Eval results ***** - acc = 0.8679706601466992 - eval_loss = 0.4911287787382479 - global_step = 18408 - loss = 0.04755385363816904 - - ***** Eval results ***** - acc = 0.8747965825874695 - eval_loss = 0.45516540421714036 - global_step = 18408 - loss = 0.04755385363816904 - -This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model - -SQuAD -~~~~~ - -This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. 
- -The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory. - - -* `train-v1.1.json `_ -* `dev-v1.1.json `_ -* `evaluate-v1.1.py `_ - -.. code-block:: shell - - export SQUAD_DIR=/path/to/SQUAD - - python run_bert_squad.py \ - --bert_model bert-base-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --train_batch_size 12 \ - --learning_rate 3e-5 \ - --num_train_epochs 2.0 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ - -Training with the previous hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json - {"f1": 88.52381567990474, "exact_match": 81.22043519394512} - -**distributed training** - -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 \ - run_bert_squad.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir ../models/wwm_uncased_finetuned_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 12 - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json - {"exact_match": 86.91579943235573, "f1": 93.1532499015869} - -This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``. - -And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ : - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \ - --bert_model bert-large-cased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir ../models/wwm_cased_finetuned_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 12 - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json - {"exact_match": 84.18164616840113, "f1": 91.58645594850135} - -SWAG -~~~~ - -The data for SWAG can be downloaded by cloning the following `repository `_ - -.. code-block:: shell - - export SWAG_DIR=/path/to/SWAG - - python run_bert_swag.py \ - --bert_model bert-base-uncased \ - --do_train \ - --do_lower_case \ - --do_eval \ - --data_dir $SWAG_DIR/data \ - --train_batch_size 16 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --max_seq_length 80 \ - --output_dir /tmp/swag_output/ \ - --gradient_accumulation_steps 4 - -Training with the previous hyper-parameters on a single GPU gave us the following results: - -.. 
code-block:: - - eval_accuracy = 0.8062081375587323 - eval_loss = 0.5966546792367169 - global_step = 13788 - loss = 0.06423990014260186 - -LM Fine-tuning -~~~~~~~~~~~~~~ - -The data should be a text file in the same format as `sample_text.txt <./pytorch_transformers/tests/fixtures/sample_text.txt/sample_text.txt>`_ (one sentence per line, docs separated by empty line). -You can download an `exemplary training corpus `_ generated from wikipedia articles and split into ~500k sentences with spaCy. -Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ : - -Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README `_ of the `examples/lm_finetuning/ `_ folder. - -.. _fine-tuning: - -OpenAI GPT, Transformer-XL and GPT-2: running the examples -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We provide three examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations: - - -* fine-tuning OpenAI GPT on the ROCStories dataset -* evaluating Transformer-XL on Wikitext 103 -* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model -* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task - -Fine-tuning OpenAI GPT on the RocStories dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code fine-tunes OpenAI GPT on the RocStories dataset. - -Before running this example you should download the -`RocStories dataset `_ and unpack it to some directory ``$ROC_STORIES_DIR``. - -.. code-block:: shell - - export ROC_STORIES_DIR=/path/to/RocStories - - python run_openai_gpt.py \ - --model_name openai-gpt \ - --do_train \ - --do_eval \ - --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \ - --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \ - --output_dir ../log \ - --train_batch_size 16 \ - -This command runs in about 10 min on a single K-80 an gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%). - -Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code evaluate the pre-trained Transformer-XL on the WikiText 103 dataset. -This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed. - -.. code-block:: shell - - python run_transfo_xl.py --work_dir ../log - -This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code). - -Unconditional and conditional generation from OpenAI's GPT-2 model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This example code is identical to the original unconditional and conditional generation codes. - -Conditional generation: - -.. code-block:: shell - - python run_gpt2.py - -Unconditional generation: - -.. 
code-block:: shell - - python run_gpt2.py --unconditional - -The same option as in the original scripts are provided, please refer to the code of the example and the original repository of OpenAI. - - -Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before running the following examples you should download the `WikiText-2 dataset `__ and unpack it to some directory `$WIKITEXT_2_DATASET` -The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization). - -This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity). - -.. code-block:: bash - - - export WIKITEXT_2_DATASET=/path/to/wikitext_dataset - - python run_lm_finetuning.py - --output_dir=output - --model_type=gpt2 - --model_name_or_path=gpt2 - --do_train - --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw - --do_eval - --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw - -This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. -It reaches a score of about 20 perplexity once fine-tuned on the dataset. - -This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity). -The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling. - -.. code-block:: bash - - - export WIKITEXT_2_DATASET=/path/to/wikitext_dataset - - python run_lm_finetuning.py - --output_dir=output - --model_type=roberta - --model_name_or_path=roberta-base - --do_train - --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw - --do_eval - --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw - --mlm - -.. _fine-tuning-BERT-large: - -Fine-tuning BERT-large on GPUs ------------------------------- - -The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation. - -For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher): - -.. code-block:: bash - - {"exact_match": 84.56953642384106, "f1": 91.04028647786927} - -To get these results we used a combination of: - - -* multi-GPU training (automatically activated on a multi-GPU server), -* 2 steps of gradient accumulation and -* perform the optimization step on CPU to store Adam's averages in RAM. - -Here is the full list of hyper-parameters for this run: - -.. code-block:: bash - - export SQUAD_DIR=/path/to/SQUAD - - python ./run_bert_squad.py \ - --bert_model bert-large-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 2 - -If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16). - -Here is an example of hyper-parameters for a FP16 run we tried: - -.. 
code-block:: bash - - export SQUAD_DIR=/path/to/SQUAD - - python ./run_bert_squad.py \ - --bert_model bert-large-uncased \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --fp16 \ - --loss_scale 128 - -The results were similar to the above FP32 results (actually slightly higher): - -.. code-block:: bash - - {"exact_match": 84.65468306527909, "f1": 91.238669287002} - -Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ : - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node=8 \ - run_bert_squad.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --do_train \ - --do_predict \ - --do_lower_case \ - --train_file $SQUAD_DIR/train-v1.1.json \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/debug_squad/ \ - --train_batch_size 24 \ - --gradient_accumulation_steps 2 - -Fine-tuning XLNet ------------------ - -STS-B -~~~~~ - -This example code fine-tunes XLNet on the STS-B corpus. - -Before running this example you should download the -`GLUE data `_ by running -`this script `_ -and unpack it to some directory ``$GLUE_DIR``. - -.. code-block:: shell - - export GLUE_DIR=/path/to/glue - - python run_xlnet_classifier.py \ - --task_name STS-B \ - --do_train \ - --do_eval \ - --data_dir $GLUE_DIR/STS-B/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --gradient_accumulation_steps 1 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Our test ran on a few seeds with `the original implementation hyper-parameters `__ gave evaluation results between 84% and 88%. - -**Distributed training** -Here is an example using distributed training on 8 V100 GPUs to reach XXXX: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node 8 \ - run_xlnet_classifier.py \ - --task_name STS-B \ - --do_train \ - --do_eval \ - --data_dir $GLUE_DIR/STS-B/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --gradient_accumulation_steps 1 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/mrpc_output/ - -Training with these hyper-parameters gave us the following results: - -.. code-block:: bash - - acc = 0.8823529411764706 - acc_and_f1 = 0.901702786377709 - eval_loss = 0.3418912578906332 - f1 = 0.9210526315789473 - global_step = 174 - loss = 0.07231863956341798 - -Here is an example on MNLI: - -.. code-block:: bash - - python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \ - --bert_model bert-large-uncased-whole-word-masking \ - --task_name mnli \ - --do_train \ - --do_eval \ - --data_dir /datadrive/bert_data/glue_data//MNLI/ \ - --max_seq_length 128 \ - --train_batch_size 8 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir ../models/wwm-uncased-finetuned-mnli/ \ - --overwrite_output_dir - -.. code-block:: bash - - ***** Eval results ***** - acc = 0.8679706601466992 - eval_loss = 0.4911287787382479 - global_step = 18408 - loss = 0.04755385363816904 - - ***** Eval results ***** - acc = 0.8747965825874695 - eval_loss = 0.45516540421714036 - global_step = 18408 - loss = 0.04755385363816904 - -This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model. 
diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000000..c47dc41433a --- /dev/null +++ b/examples/README.md @@ -0,0 +1,357 @@ +# Examples + +In this section a few examples are put together. All of these examples work for several models, making use of the very +similar API between the different models. + +| Section | Description | +|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. | +| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. | +| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. | +| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. | + +## Language model fine-tuning + +Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py). + +Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT +to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa +are fine-tuned using a masked language modeling (MLM) loss. + +Before running the following example, you should get a file that contains text on which the language model will be +fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/). + +We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains +text that will be used for evaluation. + +### GPT-2/GPT and causal language modeling + +The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before +the tokenization). The loss here is that of causal language modeling. + +```bash +export TRAIN_FILE=/path/to/dataset/wiki.train.raw +export TEST_FILE=/path/to/dataset/wiki.test.raw + +python run_lm_finetuning.py \ + --output_dir=output \ + --model_type=gpt2 \ + --model_name_or_path=gpt2 \ + --do_train \ + --train_data_file=$TRAIN_FILE \ + --do_eval \ + --eval_data_file=$TEST_FILE +``` + +This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches +a score of ~20 perplexity once fine-tuned on the dataset. + +### RoBERTa/BERT and masked language modeling + +The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different +as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their +pre-training: masked language modeling. + +In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge +slightly slower (over-fitting takes more epochs). + +We use the `--mlm` flag so that the script may change its loss function. 
+ +```bash +export TRAIN_FILE=/path/to/dataset/wiki.train.raw +export TEST_FILE=/path/to/dataset/wiki.test.raw + +python run_lm_finetuning.py \ + --output_dir=output \ + --model_type=roberta \ + --model_name_or_path=roberta-base \ + --do_train \ + --train_data_file=$TRAIN_FILE \ + --do_eval \ + --eval_data_file=$TEST_FILE \ + --mlm +``` + +## Language generation + +Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py). + +Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. +A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you +can try out the different models available in the library. + +Example usage: + +```bash +python run_generation.py \ + --model_type=gpt2 \ + --model_name_or_path=gpt2 +``` + +## GLUE + +Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py). + +Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding +Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. + +GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an +uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train +batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results +between different runs. We report the median on 5 runs (with different seeds) for each of the metrics. + +| Task | Metric | Result | +|-------|------------------------------|-------------| +| CoLA | Matthew's corr | 55.75 | +| SST-2 | Accuracy | 92.09 | +| MRPC | F1/Accuracy | 90.48/86.27 | +| STS-B | Pearson/Spearman corr. | 89.03/88.64 | +| QQP | Accuracy/F1 | 90.92/87.72 | +| MNLI | Matched acc./Mismatched acc. | 83.74/84.06 | +| QNLI | Accuracy | 91.07 | +| RTE | Accuracy | 68.59 | +| WNLI | Accuracy | 43.66 | + +Some of these results are significantly different from the ones reported on the test set +of the GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website. + +Before running any of these GLUE tasks you should download the +[GLUE data](https://gluebenchmark.com/tasks) by running +[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +and unpack it to some directory `$GLUE_DIR`. + +```bash +export GLUE_DIR=/path/to/glue +export TASK_NAME=MRPC + +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/$TASK_NAME \ + --max_seq_length 128 \ + --per_gpu_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/$TASK_NAME/ +``` + +where the task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. + +The dev set results will be written to the text file `eval_results.txt` in the specified output_dir. +In the case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate +output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
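Since each run stores its dev-set metrics as plain `key = value` lines in that `eval_results.txt` file (the same layout as the result listings further down), collecting them across tasks only takes a few lines of Python. The helper below is an illustrative sketch, not part of this patch; the output path is just an example.

```python
# Sketch: read the metrics that run_glue.py writes to <output_dir>/eval_results.txt,
# assuming the "key = value" per-line layout shown in the result listings below.
import os


def read_eval_results(output_dir):
    results = {}
    with open(os.path.join(output_dir, "eval_results.txt")) as f:
        for line in f:
            if " = " in line:
                key, value = line.strip().split(" = ", 1)
                results[key] = float(value)
    return results


print(read_eval_results("/tmp/MRPC/"))  # e.g. {'acc': ..., 'f1': ..., 'acc_and_f1': ...}
```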
+ +The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, +CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being +said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, +since the data processor for each task inherits from the base class DataProcessor. + +### MRPC + +#### Fine-tuning example + +The following example fine-tunes BERT on the Microsoft Research Paraphrase Corpus (MRPC) and runs in less +than 10 minutes on a single K-80, or in 27 seconds (!) on a single Tesla V100 16GB with apex installed. + +Before running any of these GLUE tasks you should download the +[GLUE data](https://gluebenchmark.com/tasks) by running +[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +and unpack it to some directory `$GLUE_DIR`. + +```bash +export GLUE_DIR=/path/to/glue + +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --max_seq_length 128 \ + --per_gpu_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +Our tests, run on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks), gave evaluation +results between 84% and 88%. + +#### Using Apex and mixed-precision + +With Apex and 16-bit precision, fine-tuning on MRPC takes only 27 seconds. First install +[apex](https://github.com/NVIDIA/apex), then run the following example: + +```bash +export GLUE_DIR=/path/to/glue + +python run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --max_seq_length 128 \ + --per_gpu_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ \ + --fp16 +``` + +#### Distributed training + +Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking model and it +reaches F1 > 92 on MRPC. + +```bash +export GLUE_DIR=/path/to/glue + +python -m torch.distributed.launch \ + --nproc_per_node 8 run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --max_seq_length 128 \ + --per_gpu_train_batch_size 8 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +Training with these hyper-parameters gave us the following results: + +```bash +acc = 0.8823529411764706 +acc_and_f1 = 0.901702786377709 +eval_loss = 0.3418912578906332 +f1 = 0.9210526315789473 +global_step = 174 +loss = 0.07231863956341798 +```
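Once any of the MRPC runs above has finished, the checkpoint written to `--output_dir` can be loaded back for inference with `from_pretrained`. The snippet below is only a minimal sketch, not part of this patch: it assumes the tokenizer was saved alongside the model in `/tmp/mrpc_output/`, and the sentence pair and label-mapping comment are illustrative.

```python
import torch
from pytorch_transformers import BertForSequenceClassification, BertTokenizer

output_dir = "/tmp/mrpc_output/"  # the --output_dir used in the examples above
tokenizer = BertTokenizer.from_pretrained(output_dir)
model = BertForSequenceClassification.from_pretrained(output_dir)
model.eval()

# Build a [CLS] A [SEP] B [SEP] input with segment ids, as BERT expects for sentence pairs.
tokens_a = ["[CLS]"] + tokenizer.tokenize("The cat sat on the mat.") + ["[SEP]"]
tokens_b = tokenizer.tokenize("A cat was sitting on a mat.") + ["[SEP]"]
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens_a + tokens_b)])
segment_ids = torch.tensor([[0] * len(tokens_a) + [1] * len(tokens_b)])

with torch.no_grad():
    logits = model(input_ids, token_type_ids=segment_ids)[0]  # outputs are tuples; logits come first

print(logits.argmax(dim=-1).item())  # for MRPC, 1 ~ paraphrase, 0 ~ not a paraphrase
```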
+ +### MNLI + +The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task. + +```bash +export GLUE_DIR=/path/to/glue + +python -m torch.distributed.launch \ + --nproc_per_node 8 run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --task_name mnli \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MNLI/ \ + --max_seq_length 128 \ + --per_gpu_train_batch_size 8 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir output_dir +``` + +The results are the following: + +```bash +***** Eval results ***** + acc = 0.8679706601466992 + eval_loss = 0.4911287787382479 + global_step = 18408 + loss = 0.04755385363816904 + +***** Eval results ***** + acc = 0.8747965825874695 + eval_loss = 0.45516540421714036 + global_step = 18408 + loss = 0.04755385363816904 +``` + +## SQuAD + +Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py). + +#### Fine-tuning on SQuAD + +This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) +on a single Tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a +$SQUAD_DIR directory. + +* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json) +* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) +* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py) + +```bash +export SQUAD_DIR=/path/to/SQUAD + +python run_squad.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --do_train \ + --do_eval \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --per_gpu_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ +``` + +Training with the previously defined hyper-parameters yields the following results: + +```bash +f1 = 88.52 +exact_match = 81.22 +``` + +#### Distributed training + + +Here is an example using distributed training on 8 V100 GPUs and the Bert whole-word-masking uncased model to reach an F1 > 93 on SQuAD: + +```bash +python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ + --model_type bert \ + --model_name_or_path bert-base-cased \ + --do_train \ + --do_eval \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir ../models/wwm_uncased_finetuned_squad/ \ + --per_gpu_train_batch_size 24 \ + --gradient_accumulation_steps 12 +``` + +Training with the previously defined hyper-parameters yields the following results: + +```bash +f1 = 93.15 +exact_match = 86.91 +``` + +This fine-tuned model is available as a checkpoint under the reference +`bert-large-uncased-whole-word-masking-finetuned-squad`. + diff --git a/examples/distillation/README.md b/examples/distillation/README.md index bb919385f17..73e0cc06559 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -9,6 +9,12 @@ DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and l For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5 ). + +## Setup + +This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`. + +**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). Note that there is a small internal bug in the version of PyTorch currently available on pip that causes a memory leak in our training/distillation. It has recently been fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
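The PyTorch v1.2.0 note above is also the reason the `distiller.py` hunks further down in this patch switch the prediction mask from `dtype=torch.uint8` to `dtype=torch.bool` and take its complement with `~pred_mask` instead of `1 - pred_mask`. A standalone illustration of the difference (not taken from the scripts):

```python
import torch

mlm_labels = torch.tensor([5, 6, 7, 8])
pred_mask = torch.tensor([True, False, True, False])  # torch.bool, as in the updated distiller.py

mlm_labels[~pred_mask] = -1   # keep labels only at the masked positions
print(mlm_labels)             # tensor([ 5, -1,  7, -1])

# The old uint8 idiom `mlm_labels[1 - pred_mask] = -1` no longer works here:
# PyTorch 1.2.0 rejects subtraction on bool tensors, hence the `~pred_mask` rewrite.
```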
+ + ## How to use DistilBERT PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 38769c4b0ec..ed710a2bee6 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -17,6 +17,7 @@ """ import os import math +import psutil from tensorboardX import SummaryWriter from tqdm import trange, tqdm import numpy as np @@ -192,7 +193,7 @@ class Distiller: x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) - pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device) + pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) @@ -216,7 +217,7 @@ class Distiller: _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[1-pred_mask] = -1 + mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility return token_ids, attn_mask, mlm_labels @@ -379,9 +380,9 @@ class Distiller: torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm) - self.scheduler.step() self.optimizer.step() self.optimizer.zero_grad() + self.scheduler.step() def iter(self): """ @@ -418,6 +419,8 @@ class Distiller: if self.alpha_mse > 0.: self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) + + self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter) def end_epoch(self): """ diff --git a/examples/distillation/requirements.txt b/examples/distillation/requirements.txt index efb369dc438..18146239eb4 100644 --- a/examples/distillation/requirements.txt +++ b/examples/distillation/requirements.txt @@ -1 +1,4 @@ gitpython==3.0.2 +tensorboard>=1.14.0 +tensorboardX==1.8 +psutil==5.6.3 diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index d37f7a443a3..a1995ae2242 100644 --- a/examples/run_lm_finetuning.py +++
b/examples/run_lm_finetuning.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa). +Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned using a masked language modeling (MLM) loss. """ diff --git a/hubconf.py b/hubconf.py index 35e7f1eea83..d9aaa6b53aa 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,7 +1,7 @@ from pytorch_transformers import ( AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) -from pytorch_transformers.modeling_utils import add_start_docstrings +from pytorch_transformers.file_utils import add_start_docstrings dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 3e8719bd8d7..24ff52e5d4d 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -1,4 +1,18 @@ __version__ = "1.2.0" +# Work around to update TensorFlow's absl.logging threshold which alters the +# default Python logging output behavior when present. +# see: https://github.com/abseil/abseil-py/issues/99 +# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 +try: + import absl.logging + absl.logging.set_verbosity('info') + absl.logging.set_stderrthreshold('info') + absl.logging._warn_preinit_stderr = False +except: + pass + +# Tokenizer +from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_openai import OpenAIGPTTokenizer @@ -9,46 +23,53 @@ from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer -from .tokenization_utils import (PreTrainedTokenizer) +# Configurations +from .configuration_utils import PretrainedConfig +from .configuration_auto import AutoConfig +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, +# Modeling +from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) +from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead) -from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, +from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, 
BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) -from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel, + load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model, + load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, - load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlnet import (XLNetConfig, - XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, + load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, - load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel, + load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlm import (XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, - XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLM_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, - ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel, + XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, - DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, - PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) +# Optimization from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) -from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path) +# Files and general utilities +from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, add_start_docstrings, add_end_docstrings, + WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) diff --git a/pytorch_transformers/configuration_auto.py 
b/pytorch_transformers/configuration_auto.py new file mode 100644 index 00000000000..9e35f85dc74 --- /dev/null +++ b/pytorch_transformers/configuration_auto.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .configuration_bert import BertConfig +from .configuration_openai import OpenAIGPTConfig +from .configuration_gpt2 import GPT2Config +from .configuration_transfo_xl import TransfoXLConfig +from .configuration_xlnet import XLNetConfig +from .configuration_xlm import XLMConfig +from .configuration_roberta import RobertaConfig +from .configuration_distilbert import DistilBertConfig + +logger = logging.getLogger(__name__) + + +class AutoConfig(object): + r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class + that will be instantiated as one of the configuration classes of the library + when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method take care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + This class cannot be instantiated using `__init__()` (throw an error). + """ + def __init__(self): + raise EnvironmentError("AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a one of the configuration classes of the library + from a pre-trained model configuration. 
+ + The configuration class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/pytorch_transformers/configuration_bert.py b/pytorch_transformers/configuration_bert.py new file mode 100644 index 00000000000..7fff3e5d058 --- /dev/null +++ b/pytorch_transformers/configuration_bert.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
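# Illustrative sketch, not part of this patch: the substring checks in
# `AutoConfig.from_pretrained()` above are order-sensitive, since names such as
# 'distilbert-base-uncased' and 'roberta-base' also contain the substring 'bert'.
# Testing 'distilbert' and 'roberta' before 'bert' keeps them from being routed
# to `BertConfig`.
from pytorch_transformers.configuration_auto import AutoConfig

config = AutoConfig.from_pretrained('distilbert-base-uncased')  # -> DistilBertConfig
config = AutoConfig.from_pretrained('roberta-base')             # -> RobertaConfig, not BertConfig
config = AutoConfig.from_pretrained('bert-base-cased')          # -> BertConfig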
+""" BERT model configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} + + +class BertConfig(PretrainedConfig): + r""" + :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. 
+ """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs): + super(BertConfig, self).__init__(**kwargs) + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") diff --git a/pytorch_transformers/configuration_distilbert.py b/pytorch_transformers/configuration_distilbert.py new file mode 100644 index 00000000000..b8929eedec7 --- /dev/null +++ b/pytorch_transformers/configuration_distilbert.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" DistilBERT model configuration """ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" +} + + +class DistilBertConfig(PretrainedConfig): + pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=True, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4*768, + dropout=0.1, + attention_dropout=0.1, + activation='gelu', + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs): + super(DistilBertConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.tie_weights_ = tie_weights_ + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + @property + def hidden_size(self): + return self.dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_gpt2.py b/pytorch_transformers/configuration_gpt2.py new file mode 100644 index 00000000000..c83d9e82cef --- /dev/null +++ b/pytorch_transformers/configuration_gpt2.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" OpenAI GPT-2 configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} + +class GPT2Config(PretrainedConfig): + """Configuration class to store the configuration of a `GPT2Model`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + super(GPT2Config, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_openai.py b/pytorch_transformers/configuration_openai.py new file mode 100644 index 00000000000..b27df568998 --- /dev/null +++ b/pytorch_transformers/configuration_openai.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" OpenAI GPT configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" +} + +class OpenAIGPTConfig(PretrainedConfig): + """ + Configuration class to store the configuration of a `OpenAIGPTModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs OpenAIGPTConfig. + """ + super(OpenAIGPTConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_roberta.py b/pytorch_transformers/configuration_roberta.py new file mode 100644 index 00000000000..b92d6a908ba --- /dev/null +++ b/pytorch_transformers/configuration_roberta.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" RoBERTa configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", +} + + +class RobertaConfig(BertConfig): + pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/pytorch_transformers/configuration_transfo_xl.py b/pytorch_transformers/configuration_transfo_xl.py new file mode 100644 index 00000000000..2e966ee55cf --- /dev/null +++ b/pytorch_transformers/configuration_transfo_xl.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Transformer XL configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +class TransfoXLConfig(PretrainedConfig): + """Configuration class to store the configuration of a `TransfoXLModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. + attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + **kwargs): + """Constructs TransfoXLConfig. + """ + super(TransfoXLConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return self.tgt_len + self.ext_len + self.mem_len + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + 
return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py new file mode 100644 index 00000000000..7efc735d413 --- /dev/null +++ b/pytorch_transformers/configuration_utils.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +from io import open + +from .file_utils import cached_path, CONFIG_NAME + +logger = logging.getLogger(__name__) + +class PretrainedConfig(object): + r""" Base class for all configuration classes. + Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + + Class attributes (overridden by derived classes): + - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. + + Parameters: + ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. + ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) + ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. + ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. + ``torchscript``: string, default `False`. Is the model used with Torchscript. + """ + pretrained_config_archive_map = {} + + def __init__(self, **kwargs): + self.finetuning_task = kwargs.pop('finetuning_task', None) + self.num_labels = kwargs.pop('num_labels', 2) + self.output_attentions = kwargs.pop('output_attentions', False) + self.output_hidden_states = kwargs.pop('output_hidden_states', False) + self.torchscript = kwargs.pop('torchscript', False) + self.pruned_heads = kwargs.pop('pruned_heads', {}) + + def save_pretrained(self, save_directory): + """ Save a configuration object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. 
+ """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + else: + config_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_config_archive_map.keys()), + config_file)) + raise e + if resolved_config_file == config_file: + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + + # Load config + config = cls.from_json_file(resolved_config_file) + + if hasattr(config, 'pruned_heads'): + config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info("Model config %s", config) + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_dict(cls, json_object): + """Constructs a `Config` from a Python dictionary of parameters.""" + config = cls(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with 
open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) diff --git a/pytorch_transformers/configuration_xlm.py b/pytorch_transformers/configuration_xlm.py new file mode 100644 index 00000000000..ab251c8939e --- /dev/null +++ b/pytorch_transformers/configuration_xlm.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XLM configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", +} + + +class XLMConfig(PretrainedConfig): + """Configuration class to store the configuration of a `XLMModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLM, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). 
+ initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + """ + pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048 ** -0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + + finetuning_task=None, + num_labels=2, + summary_type='first', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLMConfig. 
+ """ + super(XLMConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_words = vocab_size_or_config_json_file + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def vocab_size(self): + return self.n_words + + @vocab_size.setter + def vocab_size(self, value): + self.n_words = value + + @property + def hidden_size(self): + return self.emb_dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_xlnet.py b/pytorch_transformers/configuration_xlnet.py new file mode 100644 index 00000000000..204d44aa728 --- /dev/null +++ b/pytorch_transformers/configuration_xlnet.py @@ -0,0 +1,172 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" XLNet configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", +} + + +class XLNetConfig(PretrainedConfig): + """Configuration class to store the configuration of a ``XLNetModel``. + + Args: + vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLNet, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + finetuning_task: name of the glue task on which the model was fine-tuned if any + """ + pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + + initializer_range=0.02, + layer_norm_eps=1e-12, + + dropout=0.1, + mem_len=None, + reuse_len=None, + bi_data=False, + clamp_len=-1, + same_length=False, + + finetuning_task=None, + num_labels=2, + summary_type='last', + summary_use_proj=True, + summary_activation='tanh', + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLNetConfig. 
+ """ + super(XLNetConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.d_model = d_model + self.n_layer = n_layer + self.n_head = n_head + assert d_model % n_head == 0 + self.d_head = d_model // n_head + self.ff_activation = ff_activation + self.d_inner = d_inner + self.untie_r = untie_r + self.attn_type = attn_type + + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.dropout = dropout + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length + + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return -1 + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py index e9bfa0302ad..eb5b3009b4c 100755 --- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from io import open import torch -from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2) diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py index 3009f8a99e1..5eecdd9648c 100755 --- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from io import open import torch -from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py index 4857ea8d808..15fd6bf5acf 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -20,7 +20,7 @@ import argparse import torch import numpy as np import tensorflow as tf -from pytorch_transformers.modeling import BertModel +from pytorch_transformers import BertModel def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): diff --git 
a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py index 743013e4c49..9f74254daa8 100644 --- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py @@ -23,12 +23,12 @@ import torch from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer -from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder, +from pytorch_transformers import (BertConfig, BertEncoder, BertIntermediate, BertLayer, BertModel, BertOutput, BertSelfAttention, BertSelfOutput) -from pytorch_transformers.modeling_roberta import (RobertaEmbeddings, +from pytorch_transformers import (RobertaEmbeddings, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel) diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py index 220204f36ea..d382d3588e2 100755 --- a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from __future__ import print_function import argparse import torch -from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert +from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert import logging logging.basicConfig(level=logging.INFO) diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py index 7e79d58d7da..b310b73453c 100755 --- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py @@ -26,7 +26,7 @@ import torch import pytorch_transformers.tokenization_transfo_xl as data_utils from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME -from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel, +from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl) from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py index bf4b99b6eac..d6a3cd89e7e 100755 --- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py @@ -23,7 +23,7 @@ from io import open import torch import numpy -from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME +from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES import logging diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py index 038c7069604..a36fa514b59 100755 --- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py @@ -22,7 +22,7 @@ import os import argparse import torch -from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME, +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, XLNetLMHeadModel, XLNetForQuestionAnswering, XLNetForSequenceClassification, diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py index 
f6f2151b124..3fe7fa891de 100644 --- a/pytorch_transformers/file_utils.py +++ b/pytorch_transformers/file_utils.py @@ -9,6 +9,7 @@ import sys import json import logging import os +import six import shutil import tempfile import fnmatch @@ -47,8 +48,35 @@ except (AttributeError, ImportError): PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +WEIGHTS_NAME = "pytorch_model.bin" +TF_WEIGHTS_NAME = 'model.ckpt' +CONFIG_NAME = "config.json" + logger = logging.getLogger(__name__) # pylint: disable=invalid-name +if not six.PY2: + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = ''.join(docstr) + fn.__doc__ + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator +else: + # Not possible to update class docstrings on python2 + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator def url_to_filename(url, etag=None): """ diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py index 05ff5e5b33c..31c8fafaa90 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/pytorch_transformers/modeling_auto.py @@ -18,125 +18,22 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering -from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel -from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel -from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel -from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering -from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering -from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_distilbert import DistilBertConfig, DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification +from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel +from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel +from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel +from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification +from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings +from .modeling_utils import PreTrainedModel, SequenceSummary + +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -class AutoConfig(object): - r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class - that will be 
instantiated as one of the configuration classes of the library - when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` - class method. - - The `from_pretrained()` method take care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The base model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - This class cannot be instantiated using `__init__()` (throw an error). - """ - def __init__(self): - raise EnvironmentError("AutoConfig is designed to be instantiated " - "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a one of the configuration classes of the library - from a pre-trained model configuration. - - The configuration class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertConfig (DistilBERT model) - - contains `bert`: BertConfig (Bert model) - - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - - contains `xlnet`: XLNetConfig (XLNet model) - - contains `xlm`: XLMConfig (XLM model) - - contains `roberta`: RobertaConfig (RoBERTa model) - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: - - - If False, then this function returns just the final configuration object. 
- - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - - Examples:: - - config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') - config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - - """ - if 'distilbert' in pretrained_model_name_or_path: - return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'bert' in pretrained_model_name_or_path: - return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: - return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: - return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: - return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: - return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - - raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta'".format(pretrained_model_name_or_path)) - - class AutoModel(object): r""" :class:`~pytorch_transformers.AutoModel` is a generic model class diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 5c71bedba9d..c541d18da52 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -28,8 +28,9 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, - prune_linear_layer, add_start_docstrings) +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_bert import BertConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -49,23 +50,6 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", } -BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", -} - - def load_tf_weights_in_bert(model, config, tf_checkpoint_path): """ Load tf checkpoints in a pytorch model. """ @@ -149,77 +133,6 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} -class BertConfig(PretrainedConfig): - r""" - :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a - `BertModel`. - - - Arguments: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. 
- intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - """ - pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - **kwargs): - super(BertConfig, self).__init__(**kwargs) - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - - try: from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm except (ImportError, AttributeError) as e: diff --git a/pytorch_transformers/modeling_distilbert.py b/pytorch_transformers/modeling_distilbert.py index 280270381cb..29ecbfb846b 100644 --- a/pytorch_transformers/modeling_distilbert.py +++ b/pytorch_transformers/modeling_distilbert.py @@ -31,7 +31,9 @@ import numpy as np import torch import torch.nn as nn -from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_distilbert import DistilBertConfig +from .file_utils import add_start_docstrings import logging logger = logging.getLogger(__name__) @@ -42,69 +44,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" } -DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'distilbert-base-uncased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" -} - - -class DistilBertConfig(PretrainedConfig): - pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30522, - max_position_embeddings=512, - sinusoidal_pos_embds=True, - n_layers=6, - n_heads=12, - dim=768, - hidden_dim=4*768, - dropout=0.1, - attention_dropout=0.1, - activation='gelu', - initializer_range=0.02, - tie_weights_=True, - qa_dropout=0.1, - seq_classif_dropout=0.2, - **kwargs): - super(DistilBertConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.max_position_embeddings = max_position_embeddings - self.sinusoidal_pos_embds = sinusoidal_pos_embds - self.n_layers = n_layers - self.n_heads = n_heads - self.dim = dim - self.hidden_dim = hidden_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation = activation - self.initializer_range = initializer_range - self.tie_weights_ = tie_weights_ - self.qa_dropout = qa_dropout - self.seq_classif_dropout = seq_classif_dropout - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - @property - def hidden_size(self): - return self.dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers - ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### def gelu(x): diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index d16448beaac..42686411875 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -30,19 +30,15 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, - PreTrainedModel, prune_conv1d_layer, SequenceSummary, - add_start_docstrings) -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"} -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ 
Load tf checkpoints in a pytorch model @@ -102,120 +98,6 @@ def gelu(x): return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) -class GPT2Config(PretrainedConfig): - """Configuration class to store the configuration of a `GPT2Model`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ - pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__( - self, - vocab_size_or_config_json_file=50257, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=12, - n_head=12, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - - num_labels=1, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs - ): - """Constructs GPT2Config. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - layer_norm_epsilon: epsilon to use in the layer norm layers - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. 
- """ - super(GPT2Config, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - - class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super(Attention, self).__init__() @@ -336,9 +218,9 @@ class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super(Block, self).__init__() nx = config.n_embd - self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) - self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None): @@ -377,7 +259,7 @@ class GPT2PreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -469,7 +351,7 @@ class GPT2Model(GPT2PreTrainedModel): self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) - self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 4fbec7a768b..9936e720304 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -30,15 +30,13 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, - PreTrainedModel, prune_conv1d_layer, SequenceSummary, - add_start_docstrings) -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, 
prune_conv1d_layer, SequenceSummary +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} -OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"} def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): @@ -127,111 +125,6 @@ def swish(x): ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} -class OpenAIGPTConfig(PretrainedConfig): - """ - Configuration class to store the configuration of a `OpenAIGPTModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. - n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) - n_positions: Number of positional embeddings. - n_ctx: Size of the causal mask (usually same as n_positions). - n_embd: Dimensionality of the embeddings and hidden states. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - afn: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - resid_pdrop: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attn_pdrop: The dropout ratio for the attention - probabilities. - embd_pdrop: The dropout ratio for the embeddings. - layer_norm_epsilon: epsilon to use in the layer norm layers - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - predict_special_tokens: should we predict special tokens (when the model has a LM head) - """ - pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__( - self, - vocab_size_or_config_json_file=40478, - n_positions=512, - n_ctx=512, - n_embd=768, - n_layer=12, - n_head=12, - afn="gelu", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - predict_special_tokens=True, - - num_labels=1, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs - ): - """Constructs OpenAIGPTConfig. 
- """ - super(OpenAIGPTConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.afn = afn - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.predict_special_tokens = predict_special_tokens - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - - @property - def max_position_embeddings(self): - return self.n_positions - - @property - def hidden_size(self): - return self.n_embd - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super(Attention, self).__init__() @@ -346,9 +239,9 @@ class Block(nn.Module): super(Block, self).__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) - self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x, attention_mask=None, head_mask=None): attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask) @@ -380,7 +273,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: module.bias.data.zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index 0694e764158..41027330a85 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -22,14 +22,11 @@ import logging import torch import torch.nn as nn -import torch.nn.functional as F from torch.nn import CrossEntropyLoss, MSELoss -from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings, - BertLayerNorm, BertModel, - BertPreTrainedModel, gelu) - -from pytorch_transformers.modeling_utils import add_start_docstrings +from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu +from .configuration_roberta import RobertaConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -39,13 +36,6 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { 'roberta-large-mnli': 
"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", } -ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", -} - - class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. @@ -66,10 +56,6 @@ class RobertaEmbeddings(BertEmbeddings): position_ids=position_ids) -class RobertaConfig(BertConfig): - pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP - - ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 63303dae5e9..7ad9d10891f 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -34,18 +34,16 @@ import torch.nn.functional as F from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .modeling_bert import BertLayerNorm as LayerNorm +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_transfo_xl import TransfoXLConfig from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits -from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings) +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", } -TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", -} def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. @@ -175,143 +173,6 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): return model -class TransfoXLConfig(PretrainedConfig): - """Configuration class to store the configuration of a `TransfoXLModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. - cutoffs: cutoffs for the adaptive softmax - d_model: Dimensionality of the model's hidden states. - d_embed: Dimensionality of the embeddings - d_head: Dimensionality of the model's heads. - div_val: divident value for adapative input and softmax - pre_lnorm: apply LayerNorm to the input instead of the output - d_inner: Inner dimension in FF - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - tgt_len: number of tokens to predict - ext_len: length of the extended context - mem_len: length of the retained previous heads - same_length: use the same attn length for all tokens - proj_share_all_but_first: True to share all but first projs, False not to share. - attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 
- clamp_len: use the same pos embeddings after clamp_len - sample_softmax: number of samples in sampled softmax - adaptive: use adaptive softmax - tie_weight: tie the word embedding and softmax weights - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention probabilities. - untie_r: untie relative position biases - embd_pdrop: The dropout ratio for the embeddings. - init: parameter initializer to use - init_range: parameters initialized by U(-init_range, init_range). - proj_init_std: parameters initialized by N(0, init_std) - init_std: parameters initialized by N(0, init_std) - """ - pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=267735, - cutoffs=[20000, 40000, 200000], - d_model=1024, - d_embed=1024, - n_head=16, - d_head=64, - d_inner=4096, - div_val=4, - pre_lnorm=False, - n_layer=18, - tgt_len=128, - ext_len=0, - mem_len=1600, - clamp_len=1000, - same_length=True, - proj_share_all_but_first=True, - attn_type=0, - sample_softmax=-1, - adaptive=True, - tie_weight=True, - dropout=0.1, - dropatt=0.0, - untie_r=True, - init="normal", - init_range=0.01, - proj_init_std=0.01, - init_std=0.02, - **kwargs): - """Constructs TransfoXLConfig. - """ - super(TransfoXLConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.cutoffs = [] - self.cutoffs.extend(cutoffs) - self.tie_weight = tie_weight - if proj_share_all_but_first: - self.tie_projs = [False] + [True] * len(self.cutoffs) - else: - self.tie_projs = [False] + [False] * len(self.cutoffs) - self.d_model = d_model - self.d_embed = d_embed - self.d_head = d_head - self.d_inner = d_inner - self.div_val = div_val - self.pre_lnorm = pre_lnorm - self.n_layer = n_layer - self.n_head = n_head - self.tgt_len = tgt_len - self.ext_len = ext_len - self.mem_len = mem_len - self.same_length = same_length - self.attn_type = attn_type - self.clamp_len = clamp_len - self.sample_softmax = sample_softmax - self.adaptive = adaptive - self.dropout = dropout - self.dropatt = dropatt - self.untie_r = untie_r - self.init = init - self.init_range = init_range - self.proj_init_std = proj_init_std - self.init_std = init_std - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def max_position_embeddings(self): - return self.tgt_len + self.ext_len + self.mem_len - - @property - def vocab_size(self): - return self.n_token - - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - class PositionalEmbedding(nn.Module): def __init__(self, demb): super(PositionalEmbedding, self).__init__() @@ -347,7 +208,7 @@ class PositionwiseFF(nn.Module): nn.Dropout(dropout), ) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.pre_lnorm = 
pre_lnorm @@ -387,7 +248,7 @@ class MultiHeadAttn(nn.Module): self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.scale = 1 / (d_head ** 0.5) @@ -477,7 +338,7 @@ class RelMultiHeadAttn(nn.Module): self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model) self.scale = 1 / (d_head ** 0.5) @@ -1135,7 +996,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): mlen = mems[0].size(0) if mems is not None else 0 klen = mlen + qlen if self.same_length: - all_ones = word_emb.new_ones(qlen, klen) + all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) mask_len = klen - self.mem_len if mask_len > 0: mask_shift_len = qlen - mask_len @@ -1145,7 +1006,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: dec_attn_mask = torch.triu( - word_emb.new_ones(qlen, klen), diagonal=1+mlen)[:,:,None] + word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] hids = [] attentions = [] diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index d482addb83d..2fb4671674a 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -30,14 +30,11 @@ from torch import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F -from .file_utils import cached_path +from .configuration_utils import PretrainedConfig +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME logger = logging.getLogger(__name__) -CONFIG_NAME = "config.json" -WEIGHTS_NAME = "pytorch_model.bin" -TF_WEIGHTS_NAME = 'model.ckpt' - try: from torch.nn import Identity @@ -52,209 +49,6 @@ except ImportError: def forward(self, input): return input - -if not six.PY2: - def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = ''.join(docstr) + fn.__doc__ - return fn - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + ''.join(docstr) - return fn - return docstring_decorator -else: - # Not possible to update class docstrings on python2 - def add_start_docstrings(*docstr): - def docstring_decorator(fn): - return fn - return docstring_decorator - - def add_end_docstrings(*docstr): - def docstring_decorator(fn): - return fn - return docstring_decorator - - -class PretrainedConfig(object): - r""" Base class for all configuration classes. - Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. - - Note: - A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. - It only affects the model's configuration. - - Class attributes (overridden by derived classes): - - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. - - Parameters: - ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. - ``num_labels``: integer, default `2`. 
Number of classes to use when the model is a classification model (sequences/tokens) - ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. - ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. - ``torchscript``: string, default `False`. Is the model used with Torchscript. - """ - pretrained_config_archive_map = {} - - def __init__(self, **kwargs): - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) - self.output_attentions = kwargs.pop('output_attentions', False) - self.output_hidden_states = kwargs.pop('output_hidden_states', False) - self.torchscript = kwargs.pop('torchscript', False) - self.pruned_heads = kwargs.pop('pruned_heads', {}) - - def save_pretrained(self, save_directory): - """ Save a configuration object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. - """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, CONFIG_NAME) - - self.to_json_file(output_config_file) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. - - Parameters: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - - - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - return_unused_kwargs: (`optional`) bool: - - - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. 
- - Examples:: - - # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a - # derived class: BertConfig - config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') - config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - - """ - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - proxies = kwargs.pop('proxies', None) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) - - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - else: - config_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) - except EnvironmentError as e: - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file)) - else: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(cls.pretrained_config_archive_map.keys()), - config_file)) - raise e - if resolved_config_file == config_file: - logger.info("loading configuration file {}".format(config_file)) - else: - logger.info("loading configuration file {} from cache at {}".format( - config_file, resolved_config_file)) - - # Load config - config = cls.from_json_file(resolved_config_file) - - if hasattr(config, 'pruned_heads'): - config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) - - # Update config with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info("Model config %s", config) - if return_unused_kwargs: - return config, kwargs - else: - return config - - @classmethod - def from_dict(cls, json_object): - """Constructs a `Config` from a Python dictionary of parameters.""" - config = cls(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path): - """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: - writer.write(self.to_json_string()) - - class PreTrainedModel(nn.Module): r""" Base class for all models. 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 7e13f104950..34406f5b616 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -16,11 +16,8 @@ """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging import math -import sys -from io import open import itertools import numpy as np @@ -30,8 +27,9 @@ from torch import nn from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings, - prune_linear_layer, SequenceSummary, SQuADHead) +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead +from .configuration_xlm import XLMConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -47,164 +45,6 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", } -XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", -} - - -class XLMConfig(PretrainedConfig): - """Configuration class to store the configuration of a `XLMModel`. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLM, 'uni' for Transformer-XL - - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. 
- layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - dropatt: float, dropout rate on attention probabilities. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - """ - pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048 ** -0.5, - layer_norm_eps=1e-12, - init_std=0.02, - bos_index=0, - eos_index=1, - pad_index=2, - unk_index=3, - mask_index=5, - is_encoder=True, - - finetuning_task=None, - num_labels=2, - summary_type='first', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): - """Constructs XLMConfig. - """ - super(XLMConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_words = vocab_size_or_config_json_file - self.emb_dim = emb_dim - self.n_layers = n_layers - self.n_heads = n_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.use_lang_emb = use_lang_emb - self.layer_norm_eps = layer_norm_eps - self.bos_index = bos_index - self.eos_index = eos_index - self.pad_index = pad_index - self.unk_index = unk_index - self.mask_index = mask_index - self.is_encoder = is_encoder - self.max_position_embeddings = max_position_embeddings - self.embed_init_std = embed_init_std - self.init_std = init_std - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_proj_to_labels = summary_proj_to_labels - self.summary_first_dropout = summary_first_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def vocab_size(self): - return self.n_words - - @vocab_size.setter - def vocab_size(self, value): - self.n_words = value - - 
@property - def hidden_size(self): - return self.emb_dim - - @property - def num_attention_heads(self): - return self.n_heads - - @property - def num_hidden_layers(self): - return self.n_layers def create_sinusoidal_embeddings(n_pos, dim, out): diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 0dfb33a27ab..cc350bc9563 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -29,9 +29,9 @@ from torch import nn from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, - SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, - add_start_docstrings) +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits +from .configuration_xlnet import XLNetConfig +from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -40,10 +40,6 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", } -XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", -} def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): @@ -192,147 +188,6 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} -class XLNetConfig(PretrainedConfig): - """Configuration class to store the configuration of a ``XLNetModel``. - - Args: - vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for XLNet, 'uni' for Transformer-XL - - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - dropatt: float, dropout rate on attention probabilities. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. 
- Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - finetuning_task: name of the glue task on which the model was fine-tuned if any - """ - pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP - - def __init__(self, - vocab_size_or_config_json_file=32000, - d_model=1024, - n_layer=24, - n_head=16, - d_inner=4096, - ff_activation="gelu", - untie_r=True, - attn_type="bi", - - initializer_range=0.02, - layer_norm_eps=1e-12, - - dropout=0.1, - mem_len=None, - reuse_len=None, - bi_data=False, - clamp_len=-1, - same_length=False, - - finetuning_task=None, - num_labels=2, - summary_type='last', - summary_use_proj=True, - summary_activation='tanh', - summary_last_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): - """Constructs XLNetConfig. - """ - super(XLNetConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.d_model = d_model - self.n_layer = n_layer - self.n_head = n_head - assert d_model % n_head == 0 - self.d_head = d_model // n_head - self.ff_activation = ff_activation - self.d_inner = d_inner - self.untie_r = untie_r - self.attn_type = attn_type - - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - self.dropout = dropout - self.mem_len = mem_len - self.reuse_len = reuse_len - self.bi_data = bi_data - self.clamp_len = clamp_len - self.same_length = same_length - - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_last_dropout = summary_last_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - - @property - def max_position_embeddings(self): - return -1 - - @property - def vocab_size(self): - return self.n_token - - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value - - @property - def hidden_size(self): - return self.d_model - - @property - def num_attention_heads(self): - return self.n_head - - @property - def num_hidden_layers(self): - return self.n_layer - - try: from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm except (ImportError, AttributeError) as e: @@ -647,6 +502,12 @@ XLNET_INPUTS_DOCSTRING = r""" Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). 
+ The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and + the important thing is that they should be different for tokens which belong to different segments. + The model will compute relative segment differences from the given type indices: + 0 if the segment id of two tokens are the same, 1 if not. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: diff --git a/pytorch_transformers/tests/configuration_common_test.py b/pytorch_transformers/tests/configuration_common_test.py new file mode 100644 index 00000000000..8ee751153c1 --- /dev/null +++ b/pytorch_transformers/tests/configuration_common_test.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import shutil +import json +import random +import uuid + +import unittest +import logging + + +class ConfigTester(object): + def __init__(self, parent, config_class=None, **kwargs): + self.parent = parent + self.config_class = config_class + self.inputs_dict = kwargs + + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, 'vocab_size')) + self.parent.assertTrue(hasattr(config, 'hidden_size')) + self.parent.assertTrue(hasattr(config, 'num_attention_heads')) + self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + + def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + os.remove(json_file_path) + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def run_common_tests(self): + self.create_and_test_config_common_properties() + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py index 09d09b28fc2..dfdedbbe612 100644 --- a/pytorch_transformers/tests/modeling_auto_test.py +++ b/pytorch_transformers/tests/modeling_auto_test.py @@ -28,7 +28,8 @@ from pytorch_transformers import (AutoConfig, BertConfig, AutoModelForQuestionAnswering, BertForQuestionAnswering) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from 
.modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class AutoModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py index e1cce38479a..2919cc03368 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -26,7 +26,8 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM, BertForTokenClassification, BertForMultipleChoice) from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class BertModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 8b1a70fcf3c..c50d6678d8e 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -28,9 +28,9 @@ import logging import torch -from pytorch_transformers import PretrainedConfig, PreTrainedModel -from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from pytorch_transformers.modeling_gpt2 import GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP +from pytorch_transformers import (PretrainedConfig, PreTrainedModel, + BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) def _config_zero_init(config): diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py index 7b0b6b8266c..0d9f2311777 100644 --- a/pytorch_transformers/tests/modeling_distilbert_test.py +++ b/pytorch_transformers/tests/modeling_distilbert_test.py @@ -17,14 +17,12 @@ from __future__ import division from __future__ import print_function import unittest -import shutil -import pytest from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) -from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class DistilBertModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 1786ada54cd..2717805120e 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -24,7 +24,8 @@ import shutil from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class GPT2ModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 0fcb4b7d64d..dbef6c52eb8 
100644 --- a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -24,7 +24,8 @@ import shutil from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py index 0279f3756bf..69981af2227 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -24,7 +24,8 @@ import torch from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class RobertaModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py index e3c0fbcdf0c..f482c472022 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -16,9 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import unittest -import json import random import shutil import pytest @@ -28,7 +26,8 @@ import torch from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class TransfoXLModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py index 4308c18d45b..dcd09634770 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -23,7 +23,8 @@ import pytest from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class XLMModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py index 290c5766e2f..4445bc17ac4 100644 --- a/pytorch_transformers/tests/modeling_xlnet_test.py +++ b/pytorch_transformers/tests/modeling_xlnet_test.py @@ -28,7 +28,8 @@ import torch from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers.modeling_xlnet import 
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester class XLNetModelTest(CommonTestCases.CommonModelTester): diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 65f45c496c2..3da0494ac44 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -55,6 +55,22 @@ class CommonTestCases: def get_input_output_texts(self): raise NotImplementedError + def test_tokenizers_common_properties(self): + tokenizer = self.get_tokenizer() + attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token", + "pad_token", "cls_token", "mask_token"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + self.assertTrue(hasattr(tokenizer, attr + "_id")) + + self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) + self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids')) + + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", + "added_tokens_decoder"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + def test_save_and_load_tokenizer(self): # safety check on max_len default value so we are sure the test works tokenizer = self.get_tokenizer() diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 53b8d245b86..1e2cd59648d 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -162,58 +162,42 @@ class PreTrainedTokenizer(object): @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ - if self._bos_token is None: - logger.error("Using bos_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._bos_token) + return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self): """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ - if self._eos_token is None: - logger.error("Using eos_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._eos_token) + return self.convert_tokens_to_ids(self.eos_token) @property - def unk_token_is(self): + def unk_token_id(self): """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ - if self._unk_token is None: - logger.error("Using unk_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._unk_token) + return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self): """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - if self._sep_token is None: - logger.error("Using sep_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._sep_token) + return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. 
""" - if self._pad_token is None: - logger.error("Using pad_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._pad_token) + return self.convert_tokens_to_ids(self.pad_token) @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - if self._cls_token is None: - logger.error("Using cls_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._cls_token) + return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self): """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ - if self._mask_token is None: - logger.error("Using mask_token, but it is not set yet.") - return self.convert_tokens_to_ids(self._mask_token) + return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self): """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ - if self._additional_special_tokens is None: - logger.error("Using additional_special_tokens, but it is not set yet.") - return self.convert_tokens_to_ids(self._additional_special_tokens) + return self.convert_tokens_to_ids(self.additional_special_tokens) def __init__(self, max_len=None, **kwargs): self._bos_token = None @@ -653,6 +637,9 @@ class PreTrainedTokenizer(object): """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id (resp. a sequence of ids), using the vocabulary. """ + if tokens is None: + return None + if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): return self._convert_token_to_id_with_added_voc(tokens) @@ -666,6 +653,9 @@ class PreTrainedTokenizer(object): return ids def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token)