Merge remote-tracking branch 'upstream/master' into convert-back-to-tf

merging in latest changes from upstream
This commit is contained in:
chrislarson1 2019-06-19 22:56:20 -04:00
commit 0a4fb0da57
29 changed files with 2677 additions and 644 deletions

View File

@ -7,9 +7,11 @@ jobs:
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest ftfy spacy
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install spacy ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/ --runslow
- run: python -m pytest -sv tests/ --runslow --cov
- run: codecov
build_py2:
working_directory: ~/pytorch-pretrained-BERT
docker:
@ -17,10 +19,11 @@ jobs:
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest spacy
- run: sudo pip install ftfy==4.4.3
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install spacy ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/ --runslow
- run: python -m pytest -sv tests/ --runslow --cov
- run: codecov
workflows:
version: 2
build_and_test:

8
.coveragerc Normal file
View File

@ -0,0 +1,8 @@
[run]
source=pytorch_pretrained_bert
[report]
exclude_lines =
pragma: no cover
raise
except
register_parameter

136
README.md
View File

@ -309,6 +309,28 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == '.</w>'
```
And how to use `OpenAIGPTDoubleHeadsModel`
```python
# Load pre-trained model (weights)
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
model.eval()
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
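# NOTE: to stack both choices into a single tensor they must have the same length
# (pad the shorter sequence if needed); mc_token_ids below points to the last token of each choice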
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Predict the language modeling and multiple choice logits
with torch.no_grad():
lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
```
### Transformer-XL
Here is a quick-start example using the `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLLMHeadModel` classes with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes.
@ -456,6 +478,29 @@ predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])
```
And how to use `GPT2DoubleHeadsModel`
```python
# Load pre-trained model (weights)
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
model.eval()
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
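# NOTE: to stack both choices into a single tensor they must have the same length
# (pad the shorter sequence if needed); mc_token_ids below points to the last token of each choice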
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Predict the language modeling logits, multiple choice logits and past key/value states
with torch.no_grad():
lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
```
## Doc
Here is a detailed documentation of the classes in the package and how to use them:
@ -471,10 +516,12 @@ Here is a detailed documentation of the classes in the package and how to use th
### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as
### `from_pretrained()` method
To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method:
```python
model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)
model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *inputs, **kwargs)
```
where
@ -491,9 +538,13 @@ where
- `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
- `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
- `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
- `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters
- `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
- `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
- `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
- `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
- `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
- `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
- `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
- `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
- a path or url to a pretrained model archive containing:
@ -501,7 +552,12 @@ where
- `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can, for example, set `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
- `from_tf`: whether to load the weights from a locally saved TensorFlow checkpoint
- `state_dict`: an optional state dictionary (a collections.OrderedDict object) to use instead of Google's pre-trained weights
- `*inputs`, `**kwargs`: additional inputs for the specific Bert class (e.g. `num_labels` for `BertForSequenceClassification`)
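For example, a minimal sketch of loading a model with an explicit cache directory (the `local_rank` value and the `num_labels` argument are illustrative assumptions, not required settings):
```python
from pytorch_pretrained_bert import BertForSequenceClassification

local_rank = 0  # e.g. args.local_rank in a distributed training script
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    cache_dir='./pretrained_model_{}'.format(local_rank),  # per-process cache to avoid concurrent access
    num_labels=2)  # extra **kwargs forwarded to the model class
```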
`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
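As a quick sketch of the practical difference (exact tokenizations depend on the vocabulary files, so the outputs below are indicative):
```python
from pytorch_pretrained_bert import BertTokenizer

uncased_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # lowercases and strips accents
cased_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)  # preserves case and accents

print(uncased_tokenizer.tokenize("John Smith"))  # e.g. ['john', 'smith']
print(cased_tokenizer.tokenize("John Smith"))    # e.g. ['John', 'Smith']
```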
@ -527,6 +583,22 @@ model = GPT2Model.from_pretrained('gpt2')
```
#### Cache directory
`pytorch_pretrained_bert` saves the pretrained weights in a cache directory which is located at (in this order of priority):
- `cache_dir` optional argument to the `from_pretrained()` method (see above),
- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
- PyTorch cache home + `/pytorch_pretrained_bert/`
where the PyTorch cache home is defined by (in this order):
- shell environment variable `TORCH_HOME`,
- shell environment variable `XDG_CACHE_HOME` + `/torch/`,
- default: `~/.cache/torch/`
Usually, if you don't set any specific environment variable, the `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`.
You can always safely delete the `pytorch_pretrained_bert` cache, but the pretrained model weights and vocabulary files will then have to be re-downloaded from our S3.
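A minimal sketch of this lookup order (assumed defaults; the actual logic lives in `pytorch_pretrained_bert/file_utils.py`):
```python
import os

def resolve_cache_dir(cache_dir=None):
    if cache_dir is not None:  # 1. explicit from_pretrained(cache_dir=...)
        return cache_dir
    if 'PYTORCH_PRETRAINED_BERT_CACHE' in os.environ:  # 2. dedicated environment variable
        return os.environ['PYTORCH_PRETRAINED_BERT_CACHE']
    torch_home = os.getenv('TORCH_HOME',
                           os.path.join(os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))
    return os.path.join(os.path.expanduser(torch_home), 'pytorch_pretrained_bert')  # 3. default
```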
### Serialization best-practices
This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
@ -536,6 +608,13 @@ There are three types of files you need to save to be able to reload a fine-tune
- the configuration file of the model which is saved as a JSON file, and
- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
The default names of these files are as follows:
- the model weights file: `pytorch_model.bin`,
- the configuration file: `config.json`,
- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary),
- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`.
Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
```python
@ -627,6 +706,13 @@ These configuration classes contains a few utilities to load and save configurat
`BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large).
Instantiation:
The model can be instantiated with the following arguments:
- `config`: a `BertConfig` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
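For instance, a minimal sketch of instantiating the model with these flags (assuming, as documented above, that extra keyword arguments passed to `from_pretrained()` are forwarded to the model class):
```python
from pytorch_pretrained_bert import BertModel

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_attentions=True,      # also return per-layer attention weights
                                  keep_multihead_output=False) # set True to keep per-head outputs for head-importance analysis
model.eval()
```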
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@ -635,6 +721,7 @@ We detail them here. This model takes as *inputs*:
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
- `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked (nullified).
This model *outputs* a tuple composed of:
@ -752,6 +839,13 @@ where total_tokens_embeddings can be obtained as config.total_tokens_embeddings
`total_tokens_embeddings = config.vocab_size + config.n_special`
You should use the associated indices to index the embeddings.
Instantiation:
The model can be instantiated with the following arguments:
- `config`: an `OpenAIGPTConfig` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@ -762,9 +856,10 @@ We detail them here. This model takes as *inputs*:
- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked (nullified).
This model *outputs*:
- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
#### 10. `OpenAIGPTLMHeadModel`
@ -844,6 +939,13 @@ all_hidden_states = lower_hidden_states + [hidden_states]
`GPT2Model` is the OpenAI GPT-2 Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
Instantiation:
The model can be instantiated with the following arguments:
- `config`: a `GPT2Config` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@ -855,9 +957,10 @@ We detail them here. This model takes as *inputs*:
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
- `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below).
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked (nullified).
This model *outputs*:
- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
- `presents`: a list of pre-computed hidden-states (key and values in each attention block) as torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
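For instance, a minimal sketch of how `presents`/`past` can be chained across calls (assuming a `tokenizer` and `model` loaded as in the GPT-2 quick-start above):
```python
import torch

prompt_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Who was Jim Henson ?"))
tokens_tensor = torch.tensor([prompt_ids])
with torch.no_grad():
    hidden_states, presents = model(tokens_tensor)               # full pass over the prompt
    next_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(" Jim"))
    next_tensor = torch.tensor([next_ids])
    hidden_states, presents = model(next_tensor, past=presents)  # only the new token is processed
```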
#### 15. `GPT2LMHeadModel`
@ -1033,7 +1136,7 @@ An overview of the implemented schedules:
|-|-|
| [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models |
| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` |
| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#Fine-tuning-with-OpenAI-GPT-Transformer-XL-and-GPT-2) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` |
| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#openai-gpt-transformer-xl-and-gpt-2-running-the-examples) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` |
| [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`|
### Training large models: introduction, tools and examples
@ -1354,6 +1457,25 @@ The results were similar to the above FP32 results (actually slightly higher):
{"exact_match": 84.65468306527909, "f1": 91.238669287002}
```
Here is an example with the recent `bert-large-uncased-whole-word-masking`:
```bash
python -m torch.distributed.launch --nproc_per_node=8 \
run_squad.py \
--bert_model bert-large-uncased-whole-word-masking \
--do_train \
--do_predict \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--train_batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir /tmp/debug_squad/
```
## Notebooks
We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.

92
examples/bertology.py Normal file
View File

@ -0,0 +1,92 @@
#!/usr/bin/env python3
import argparse
import logging
from tqdm import trange
import torch
import torch.nn.functional as F
import numpy as np
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def run_model():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='gpt2',
help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--nsamples", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=-1)
parser.add_argument("--length", type=int, default=-1)
parser.add_argument("--temperature", type=float, default=1.0)
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
args = parser.parse_args()
print(args)
if args.batch_size == -1:
args.batch_size = 1
assert args.nsamples % args.batch_size == 0
np.random.seed(args.seed)
torch.random.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
model.to(device)
model.eval()
if args.length == -1:
args.length = model.config.n_ctx // 2
elif args.length > model.config.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
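# NOTE (assumption): `sample_sequence` is not defined in this file; the loops below
# assume the sampling helper of the same name from examples/run_gpt2.py is available in scope.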
while True:
context_tokens = []
if not args.unconditional:
raw_text = input("Model prompt >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("Model prompt >>> ")
context_tokens = enc.encode(raw_text)
generated = 0
for _ in range(args.nsamples // args.batch_size):
out = sample_sequence(
model=model, length=args.length,
context=context_tokens,
start_token=None,
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out[:, len(context_tokens):].tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
else:
generated = 0
for _ in range(args.nsamples // args.batch_size):
out = sample_sequence(
model=model, length=args.length,
context=None,
start_token=enc.encoder['<|endoftext|>'],
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out[:,1:].tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
if __name__ == '__main__':
run_model()

View File

@ -1,5 +1,6 @@
from argparse import ArgumentParser
from pathlib import Path
import os
import torch
import logging
import json
@ -12,6 +13,7 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
@ -325,8 +327,13 @@ def main():
# Save a trained model
logging.info("** ** * Saving fine-tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = args.output_dir / "pytorch_model.bin"
torch.save(model_to_save.state_dict(), str(output_model_file))
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
if __name__ == '__main__':

View File

@ -4,11 +4,11 @@ from tqdm import tqdm, trange
from tempfile import TemporaryDirectory
import shelve
from random import random, randrange, randint, shuffle, choice, sample
from random import random, randrange, randint, shuffle, choice
from pytorch_pretrained_bert.tokenization import BertTokenizer
import numpy as np
import json
import collections
class DocumentDatabase:
def __init__(self, reduce_memory=False):
@ -98,42 +98,77 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
else:
trunc_tokens.pop()
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list):
def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
with several refactors to clean it up and remove a lot of unnecessary variables."""
cand_indices = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indices.append(i)
# Whole Word Masking means that we mask all of the wordpieces
# corresponding to an original word. When a word has been split into
# WordPieces, the first token does not have any marker and any subsequent
# tokens are prefixed with ##. So whenever we see the ## token, we
# append it to the previous set of word indexes.
#
# Note that Whole Word Masking does *not* change the training code
# at all -- we still predict each WordPiece independently, softmaxed
# over the entire vocabulary.
if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
cand_indices[-1].append(i)
else:
cand_indices.append([i])
num_to_mask = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
shuffle(cand_indices)
mask_indices = sorted(sample(cand_indices, num_to_mask))
masked_token_labels = []
for index in mask_indices:
# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
masked_lms = []
covered_indexes = set()
for index_set in cand_indices:
if len(masked_lms) >= num_to_mask:
break
# If adding a whole-word mask would exceed the maximum number of
# predictions, then just skip this candidate.
if len(masked_lms) + len(index_set) > num_to_mask:
continue
is_any_index_covered = False
for index in index_set:
if index in covered_indexes:
is_any_index_covered = True
break
if is_any_index_covered:
continue
for index in index_set:
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
masked_token = choice(vocab_list)
masked_token_labels.append(tokens[index])
# Once we've saved the true label for that token, we can overwrite it with the masked version
tokens[index] = masked_token
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = choice(vocab_list)
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
tokens[index] = masked_token
assert len(masked_lms) <= num_to_mask
masked_lms = sorted(masked_lms, key=lambda x: x.index)
mask_indices = [p.index for p in masked_lms]
masked_token_labels = [p.label for p in masked_lms]
return tokens, mask_indices, masked_token_labels
def create_instances_from_document(
doc_database, doc_idx, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_list):
masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""This code is mostly a duplicate of the equivalent function from Google BERT's repo.
However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
@ -213,7 +248,7 @@ def create_instances_from_document(
segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]
tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_list)
tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)
instance = {
"tokens": tokens,
@ -235,9 +270,10 @@ def main():
parser.add_argument("--output_dir", type=Path, required=True)
parser.add_argument("--bert_model", type=str, required=True,
choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
"bert-base-multilingual", "bert-base-chinese"])
"bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
parser.add_argument("--do_lower_case", action="store_true")
parser.add_argument("--do_whole_word_mask", action="store_true",
help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument("--reduce_memory", action="store_true",
help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
@ -284,7 +320,7 @@ def main():
doc_instances = create_instances_from_document(
docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
vocab_list=vocab_list)
whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
doc_instances = [json.dumps(instance) for instance in doc_instances]
for instance in doc_instances:
epoch_file.write(instance + '\n')

View File

@ -29,6 +29,7 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
@ -614,9 +615,12 @@ def main():
# Save a trained model
logger.info("** ** * Saving fine-tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
if args.do_train:
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
def _truncate_seq_pair(tokens_a, tokens_b, max_length):

View File

@ -25,6 +25,7 @@ import random
import sys
import numpy as np
import math
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
@ -735,15 +736,6 @@ def main():
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
@ -762,8 +754,35 @@ def main():
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.do_train:
# Prepare data loader
train_examples = processor.get_train_examples(args.data_dir)
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
if output_mode == "classification":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif output_mode == "regression":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
@ -794,31 +813,14 @@ def main():
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
if output_mode == "classification":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif output_mode == "regression":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):

View File

@ -190,7 +190,7 @@ def main():
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
optimizer = OpenAIAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,

View File

@ -617,7 +617,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
@ -894,16 +894,6 @@ def main():
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
@ -921,8 +911,47 @@ def main():
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.do_train:
# Prepare data loader
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
@ -958,43 +987,13 @@ def main():
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
global_step = 0
logger.info("***** Running training *****")
logger.info(" Num orig examples = %d", len(train_examples))
logger.info(" Num split examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):

View File

@ -358,15 +358,6 @@ def main():
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
@ -384,13 +375,35 @@ def main():
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.do_train:
# Prepare data loader
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces a None grad that breaks apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
param_optimizer = [n for n in param_optimizer]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
@ -420,24 +433,12 @@ def main():
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
global_step = 0
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):

View File

@ -1,187 +1,30 @@
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import (
BertModel,
BertForNextSentencePrediction,
BertForMaskedLM,
BertForMultipleChoice,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
)
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
# A lot of models share the same param doc. Use a decorator
# to save typing
bert_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow
checkpoint
cache_dir: an optional path to a folder in which the pre-trained models
will be cached.
state_dict: an optional state dictionnary
(collections.OrderedDict object) to use instead of Google
pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def bertTokenizer(*args, **kwargs):
"""
Instantiate a BertTokenizer from a pre-trained/customized vocab file
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* bert-base-uncased
* bert-large-uncased
* bert-base-cased
* bert-large-cased
* bert-base-multilingual-uncased
* bert-base-multilingual-cased
* bert-base-chinese
Keyword args:
cache_dir: an optional path to a specific directory to download and cache
the pre-trained model weights.
Default: None
do_lower_case: Whether to lower case the input.
Only has an effect when do_wordpiece_only=False
Default: True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
Default: True
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
Example:
>>> sentence = 'Hello, World!'
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
>>> toks = tokenizer.tokenize(sentence)
['Hello', '##,', 'World', '##!']
>>> ids = tokenizer.convert_tokens_to_ids(toks)
[8667, 28136, 1291, 28125]
"""
tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(bert_docstring)
def bertModel(*args, **kwargs):
"""
BertModel is the basic BERT Transformer model with a layer of summed token,
position and sequence embeddings followed by a series of identical
self-attention blocks (12 for BERT-base, 24 for BERT-large).
"""
model = BertModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForNextSentencePrediction(*args, **kwargs):
"""
BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence
classification head.
"""
model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
"""
BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads
- the masked language modeling head, and
- the next sentence classification head.
"""
model = BertForPreTraining.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
"""
BertForMaskedLM includes the BertModel Transformer followed by the
(possibly) pre-trained masked language modeling head.
"""
model = BertForMaskedLM.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
"""
BertForSequenceClassification is a fine-tuning model that includes
BertModel and a sequence-level (sequence or pair of sequences) classifier
on top of the BertModel.
The sequence-level classifier is a linear layer that takes as input the
last hidden state of the first character in the input sequence
(see Figures 3a and 3b in the BERT paper).
"""
model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
"""
BertForMultipleChoice is a fine-tuning model that includes BertModel and a
linear layer on top of the BertModel.
"""
model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
"""
BertForQuestionAnswering is a fine-tuning model that includes BertModel
with a token-level classifiers on top of the full sequence of last hidden
states.
"""
model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
"""
BertForTokenClassification is a fine-tuning model that includes BertModel
and a token-level classifier on top of the BertModel.
The token-level classifier is a linear layer that takes as input the last
hidden state of the sequence.
"""
model = BertForTokenClassification.from_pretrained(*args, **kwargs)
return model
from hubconfs.bert_hubconf import (
bertTokenizer,
bertModel,
bertForNextSentencePrediction,
bertForPreTraining,
bertForMaskedLM,
bertForSequenceClassification,
bertForMultipleChoice,
bertForQuestionAnswering,
bertForTokenClassification
)
from hubconfs.gpt_hubconf import (
openAIGPTTokenizer,
openAIGPTModel,
openAIGPTLMHeadModel,
openAIGPTDoubleHeadsModel
)
from hubconfs.gpt2_hubconf import (
gpt2Tokenizer,
gpt2Model,
gpt2LMHeadModel,
gpt2DoubleHeadsModel
)
from hubconfs.transformer_xl_hubconf import (
transformerXLTokenizer,
transformerXLModel,
transformerXLLMHeadModel
)

360
hubconfs/bert_hubconf.py Normal file
View File

@ -0,0 +1,360 @@
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import (
BertModel,
BertForNextSentencePrediction,
BertForMaskedLM,
BertForMultipleChoice,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
)
# A lot of models share the same param doc. Use a decorator
# to save typing
bert_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
. `bert-base-german-cased`
. `bert-large-uncased-whole-word-masking`
. `bert-large-cased-whole-word-masking`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow
checkpoint
cache_dir: an optional path to a folder in which the pre-trained models
will be cached.
state_dict: an optional state dictionary
(collections.OrderedDict object) to use instead of Google
pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def bertTokenizer(*args, **kwargs):
"""
Instantiate a BertTokenizer from a pre-trained/customized vocab file
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* bert-base-uncased
* bert-large-uncased
* bert-base-cased
* bert-large-cased
* bert-base-multilingual-uncased
* bert-base-multilingual-cased
* bert-base-chinese
Keyword args:
cache_dir: an optional path to a specific directory to download and cache
the pre-trained model weights.
Default: None
do_lower_case: Whether to lower case the input.
Only has an effect when do_wordpiece_only=False
Default: True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
Default: True
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
Example:
>>> import torch
>>> sentence = 'Hello, World!'
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
>>> toks = tokenizer.tokenize(sentence)
['Hello', '##,', 'World', '##!']
>>> ids = tokenizer.convert_tokens_to_ids(toks)
[8667, 28136, 1291, 28125]
"""
tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(bert_docstring)
def bertModel(*args, **kwargs):
"""
BertModel is the basic BERT Transformer model with a layer of summed token,
position and sequence embeddings followed by a series of identical
self-attention blocks (12 for BERT-base, 24 for BERT-large).
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
>>> model.eval()
# Predict hidden states features for each layer
>>> with torch.no_grad():
encoded_layers, _ = model(tokens_tensor, segments_tensors)
"""
model = BertModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForNextSentencePrediction(*args, **kwargs):
"""
BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence
classification head.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForNextSentencePrediction
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
>>> model.eval()
# Predict the next sentence classification logits
>>> with torch.no_grad():
next_sent_classif_logits = model(tokens_tensor, segments_tensors)
"""
model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
"""
BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads
- the masked language modeling head, and
- the next sentence classification head.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForPreTraining
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
>>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
"""
model = BertForPreTraining.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
"""
BertForMaskedLM includes the BertModel Transformer followed by the
(possibly) pre-trained masked language modeling head.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> masked_index = 8
>>> tokenized_text[masked_index] = '[MASK]'
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForMaskedLM
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
>>> model.eval()
# Predict all tokens
>>> with torch.no_grad():
predictions = model(tokens_tensor, segments_tensors)
>>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
'henson'
"""
model = BertForMaskedLM.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
"""
BertForSequenceClassification is a fine-tuning model that includes
BertModel and a sequence-level (sequence or pair of sequences) classifier
on top of it. Note that the classification head is only initialized
and has to be trained.
The sequence-level classifier is a linear layer that takes as input the
last hidden state of the first token ([CLS]) in the input sequence
(see Figures 3a and 3b in the BERT paper).
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForSequenceClassification
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
>>> model.eval()
# Predict the sequence classification logits
>>> with torch.no_grad():
seq_classif_logits = model(tokens_tensor, segments_tensors)
# Or get the sequence classification loss
>>> labels = torch.tensor([1])
>>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
"""
BertForMultipleChoice is a fine-tuning model that includes BertModel and a
linear layer on top of it. Note that the multiple choice head is
only initialized and has to be trained.
Args:
num_choices: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
>>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
# Load bertForMultipleChoice
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
>>> model.eval()
# Predict the multiple choice logits
>>> with torch.no_grad():
multiple_choice_logits = model(tokens_tensor, segments_tensors)
# Or get the multiple choice loss
>>> labels = torch.tensor([1])
>>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
return model
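# A possible follow-up (illustration only, not part of the hub entry point): picking the
# highest-scoring choice from the `multiple_choice_logits` computed in the example above.
#   predicted_choice = torch.argmax(multiple_choice_logits[0]).item()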
@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
"""
BertForQuestionAnswering is a fine-tuning model that includes BertModel
with a token-level classifier on top of the full sequence of last hidden
states. Note that the classification head is only initialized
and has to be trained.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForQuestionAnswering
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
>>> model.eval()
# Predict the start and end positions logits
>>> with torch.no_grad():
start_logits, end_logits = model(tokens_tensor, segments_tensors)
# Or get the total loss, which is the sum of the CrossEntropy losses for the start and end token positions
>>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
# set model.train() before if training this loss
>>> total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
"""
model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
return model
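# A possible follow-up (illustration only, not part of the hub entry point): turning the
# start/end logits from the example above into a predicted answer span.
#   start_index = torch.argmax(start_logits[0]).item()
#   end_index = torch.argmax(end_logits[0]).item()
#   answer_tokens = tokenized_text[start_index:end_index + 1]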
@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
"""
BertForTokenClassification is a fine-tuning model that includes BertModel
and a token-level classifier on top of it. Note that the classification
head is only initialized and has to be trained.
The token-level classifier is a linear layer that takes as input the last
hidden state of the sequence.
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForTokenClassification
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
>>> model.eval()
# Predict the token classification logits
>>> with torch.no_grad():
classif_logits = model(tokens_tensor, segments_tensors)
# Or get the token classification loss
>>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
>>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForTokenClassification.from_pretrained(*args, **kwargs)
return model

168
hubconfs/gpt2_hubconf.py Normal file
View File

@ -0,0 +1,168 @@
from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
from pytorch_pretrained_bert.modeling_gpt2 import (
GPT2Model,
GPT2LMHeadModel,
GPT2DoubleHeadsModel
)
# A lot of models share the same param doc. Use a decorator
# to save typing
gpt2_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `gpt2`, `gpt2-medium`
- a path or url to a pretrained model archive containing:
. `gpt2_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
- a path or url to a pretrained model archive containing:
. `gpt2_config.json` a configuration file for the model
. a TensorFlow checkpoint with trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific GPT-2 class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def gpt2Tokenizer(*args, **kwargs):
"""
Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
Peculiarities:
- Byte-level BPE
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* gpt2
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying model's
sequence length.
Default: None
Example:
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
>>> text = "Who was Jim Henson ?"
>>> indexed_tokens = tokenizer.encode(tokenized_text)
"""
tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2Model(*args, **kwargs):
"""
gpt2Model is the basic OpenAI GPT-2 Transformer model: a stack of
identical masked self-attention blocks pre-trained on a large-scale
corpus with a language modeling objective.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
>>> text_1 = "Who was Jim Henson ?"
>>> text_2 = "Jim Henson was a puppeteer"
>>> indexed_tokens_1 = tokenizer.encode(text_1)
>>> indexed_tokens_2 = tokenizer.encode(text_2)
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load gpt2Model
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
>>> model.eval()
# Predict hidden states features for each layer
# past can be used to reuse precomputed hidden states in subsequent predictions
>>> with torch.no_grad():
hidden_states_1, past = model(tokens_tensor_1)
hidden_states_2, past = model(tokens_tensor_2, past=past)
"""
model = GPT2Model.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2LMHeadModel(*args, **kwargs):
"""
gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
>>> text_1 = "Who was Jim Henson ?"
>>> text_2 = "Jim Henson was a puppeteer"
>>> indexed_tokens_1 = tokenizer.encode(text_1)
>>> indexed_tokens_2 = tokenizer.encode(text_2)
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load gpt2LMHeadModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
>>> model.eval()
# Predict all tokens
# past can be used to reuse precomputed hidden states in subsequent predictions
>>> with torch.no_grad():
predictions_1, past = model(tokens_tensor_1)
predictions_2, past = model(tokens_tensor_2, past=past)
# Get the predicted last token
>>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
>>> predicted_token = tokenizer.decode([predicted_index])
>>> assert predicted_token == ' who'
"""
model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2DoubleHeadsModel(*args, **kwargs):
"""
gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
tied (pre-trained) language modeling head and a multiple choice
classification head (only initialized, not pre-trained).
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
>>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
>>> tokenized_text1 = tokenizer.tokenize(text1)
>>> tokenized_text2 = tokenizer.tokenize(text2)
>>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
>>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
>>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
>>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load gpt2DoubleHeadsModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
>>> model.eval()
# Predict the language modeling logits and the multiple choice logits
>>> with torch.no_grad():
lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
"""
model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
return model

186
hubconfs/gpt_hubconf.py Normal file
View File

@ -0,0 +1,186 @@
from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
from pytorch_pretrained_bert.modeling_openai import (
OpenAIGPTModel,
OpenAIGPTLMHeadModel,
OpenAIGPTDoubleHeadsModel
)
# Dependencies that are not specified in global hubconf.py
specific_dependencies = ['spacy', 'ftfy']
# A lot of models share the same param doc. Use a decorator
# to save typing
gpt_docstring = """
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embedding matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `openai-gpt`
- a path or url to a pretrained model archive containing:
. `openai_gpt_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
- a path or url to a pretrained model archive containing:
. `openai-gpt-config.json` a configuration file for the model
. a series of NumPy files containing OpenAI TensorFlow trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object)
to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def openAIGPTTokenizer(*args, **kwargs):
"""
Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
Peculiarities:
- lower-cases all inputs
- uses SpaCy's tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, and falls back to BERT's BasicTokenizer otherwise.
- argument special_tokens and function set_special_tokens:
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* openai-gpt
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying model's
sequence length.
Default: None
Example:
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
[763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
"""
tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTModel(*args, **kwargs):
"""
OpenAIGPTModel is the basic OpenAI GPT Transformer model: a stack of
identical masked self-attention blocks pre-trained on a large-scale
corpus with a language modeling objective.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> tokens_tensor = torch.tensor([indexed_tokens])
# Load openAIGPTModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
>>> model.eval()
# Predict hidden states features for each layer
>>> with torch.no_grad():
hidden_states = model(tokens_tensor)
"""
model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTLMHeadModel(*args, **kwargs):
"""
OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> tokens_tensor = torch.tensor([indexed_tokens])
# Load openAIGPTLMHeadModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
>>> model.eval()
# Predict all tokens
>>> with torch.no_grad():
predictions = model(tokens_tensor)
# Get the predicted last token
>>> predicted_index = torch.argmax(predictions[0, -1, :]).item()
>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
'.</w>'
"""
model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTDoubleHeadsModel(*args, **kwargs):
"""
OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
tied (pre-trained) language modeling head and a multiple choice
classification head (only initialized, not pre-trained).
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
>>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
>>> tokenized_text1 = tokenizer.tokenize(text1)
>>> tokenized_text2 = tokenizer.tokenize(text2)
>>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
>>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
>>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
>>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load openAIGPTDoubleHeadsModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
>>> model.eval()
# Predict the language modeling logits and the multiple choice logits
>>> with torch.no_grad():
lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
"""
model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
return model

View File

@ -0,0 +1,130 @@
from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
from pytorch_pretrained_bert.modeling_transfo_xl import (
TransfoXLModel,
TransfoXLLMHeadModel
)
# A lot of models share the same param doc. Use a decorator
# to save typing
transformer_xl_docstring = """
Transformer-XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:
- you don't need to specify position embedding indices
- the tokens in the vocabulary have to be sorted by decreasing frequency.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `transfo-xl-wt103`
- a path or url to a pretrained model archive containing:
. `transfo_xl_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
- a path or url to a pretrained model archive containing:
. `transfo_xl_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific TransformerXL class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def transformerXLTokenizer(*args, **kwargs):
"""
Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* transfo-xl-wt103
Example:
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
>>> text = "Who was Jim Henson ?"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
"""
tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLModel(*args, **kwargs):
"""
transformerXLModel is the basic Transformer XL model.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
# Prepare tokenized input
>>> text_1 = "Who was Jim Henson ?"
>>> text_2 = "Jim Henson was a puppeteer"
>>> tokenized_text_1 = tokenizer.tokenize(text_1)
>>> tokenized_text_2 = tokenizer.tokenize(text_2)
>>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
>>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load transformerXLModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')
>>> model.eval()
# Predict hidden states features for each layer
# We can re-use the memory cells in a subsequent call to attend a longer context
>>> with torch.no_grad():
hidden_states_1, mems_1 = model(tokens_tensor_1)
hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
"""
model = TransfoXLModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLLMHeadModel(*args, **kwargs):
"""
transformerXLLMHeadModel is the Transformer-XL model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
# Prepare tokenized input
>>> text_1 = "Who was Jim Henson ?"
>>> text_2 = "Jim Henson was a puppeteer"
>>> tokenized_text_1 = tokenizer.tokenize(text_1)
>>> tokenized_text_2 = tokenizer.tokenize(text_2)
>>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
>>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load transformerXLLMHeadModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
>>> model.eval()
# Predict hidden states features for each layer
# We can re-use the memory cells in a subsequent call to attend a longer context
>>> with torch.no_grad():
predictions_1, mems_1 = model(tokens_tensor_1)
predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
# Get the predicted last token
>>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
>>> assert predicted_token == 'who'
"""
model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
return model

View File

@ -15,7 +15,7 @@ from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
load_tf_weights_in_transfo_xl)
from .modeling_gpt2 import (GPT2Config, GPT2Model,
GPT2LMHeadModel, GPT2DoubleHeadsModel,
GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,
load_tf_weights_in_gpt2)
from .optimization import BertAdam

View File

@ -22,9 +22,6 @@ import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
@ -37,17 +34,57 @@ from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
}
PRETRAINED_CONFIG_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
}
BERT_CONFIG_NAME = 'bert_config.json'
TF_WEIGHTS_NAME = 'model.ckpt'
def prune_linear_layer(layer, index, dim=0):
""" Prune a linear layer (a model parameters) to keep only entries in index.
Return the pruned layer as a new layer with requires_grad=True.
Used to remove heads.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
if layer.bias is not None:
if dim == 1:
b = layer.bias.clone().detach()
else:
b = layer.bias[index].clone().detach()
new_size = list(layer.weight.size())
new_size[dim] = len(index)
new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None)
new_layer.weight.requires_grad = False
new_layer.weight.copy_(W.contiguous())
new_layer.weight.requires_grad = True
if layer.bias is not None:
new_layer.bias.requires_grad = False
new_layer.bias.copy_(b.contiguous())
new_layer.bias.requires_grad = True
return new_layer
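# Hedged usage sketch for prune_linear_layer (comments only, not executed at import time):
# keep the first two output units of a toy nn.Linear.
#   layer = nn.Linear(4, 3)
#   index = torch.tensor([0, 1])               # output rows (dim=0) to keep
#   pruned = prune_linear_layer(layer, index)  # -> nn.Linear(in_features=4, out_features=2)
#   assert torch.allclose(pruned.weight, layer.weight[index])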
def load_tf_weights_in_bert(model, tf_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
@ -278,12 +315,16 @@ class BertEmbeddings(nn.Module):
class BertSelfAttention(nn.Module):
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.output_attentions = output_attentions
self.keep_multihead_output = keep_multihead_output
self.multihead_output = None
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
@ -299,7 +340,7 @@ class BertSelfAttention(nn.Module):
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask):
def forward(self, hidden_states, attention_mask, head_mask=None):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
@ -321,10 +362,20 @@ class BertSelfAttention(nn.Module):
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
if self.keep_multihead_output:
self.multihead_output = context_layer
self.multihead_output.retain_grad()
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
if self.output_attentions:
return attention_probs, context_layer
return context_layer
@ -343,14 +394,35 @@ class BertSelfOutput(nn.Module):
class BertAttention(nn.Module):
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output_attentions = output_attentions
self.self = BertSelfAttention(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.output = BertSelfOutput(config)
def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
def prune_heads(self, heads):
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
for head in heads:
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
def forward(self, input_tensor, attention_mask, head_mask=None):
self_output = self.self(input_tensor, attention_mask, head_mask)
if self.output_attentions:
attentions, self_output = self_output
attention_output = self.output(self_output, input_tensor)
if self.output_attentions:
return attentions, attention_output
return attention_output
@ -384,33 +456,47 @@ class BertOutput(nn.Module):
class BertLayer(nn.Module):
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertLayer, self).__init__()
self.attention = BertAttention(config)
self.output_attentions = output_attentions
self.attention = BertAttention(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
def forward(self, hidden_states, attention_mask, head_mask=None):
attention_output = self.attention(hidden_states, attention_mask, head_mask)
if self.output_attentions:
attentions, attention_output = attention_output
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
if self.output_attentions:
return attentions, layer_output
return layer_output
class BertEncoder(nn.Module):
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertEncoder, self).__init__()
layer = BertLayer(config)
self.output_attentions = output_attentions
layer = BertLayer(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
all_attentions = []
for i, layer_module in enumerate(self.layer):
hidden_states = layer_module(hidden_states, attention_mask, head_mask[i])
if self.output_attentions:
attentions, hidden_states = hidden_states
all_attentions.append(attentions)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if self.output_attentions:
return all_attentions, all_encoder_layers
return all_encoder_layers
@ -541,6 +627,9 @@ class BertPreTrainedModel(nn.Module):
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
. `bert-base-german-cased`
. `bert-large-uncased-whole-word-masking`
. `bert-large-cased-whole-word-masking`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
@ -562,54 +651,86 @@ class BertPreTrainedModel(nn.Module):
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
archive_file = pretrained_model_name_or_path
if from_tf:
# Directly load from a TensorFlow checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME)
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
archive_file))
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
archive_file))
return None
if resolved_archive_file == archive_file:
logger.info("loading archive file {}".format(archive_file))
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
config_file))
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading archive file {} from cache at {}".format(
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
tempdir = None
if os.path.isdir(resolved_archive_file) or from_tf:
serialization_dir = resolved_archive_file
else:
# Extract archive to temp dir
tempdir = tempfile.mkdtemp()
logger.info("extracting archive file {} to temp dir {}".format(
resolved_archive_file, tempdir))
with tarfile.open(resolved_archive_file, 'r:gz') as archive:
archive.extractall(tempdir)
serialization_dir = tempdir
logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file))
### Switching to split config/weight files configuration
# tempdir = None
# if os.path.isdir(resolved_archive_file) or from_tf:
# serialization_dir = resolved_archive_file
# else:
# # Extract archive to temp dir
# tempdir = tempfile.mkdtemp()
# logger.info("extracting archive file {} to temp dir {}".format(
# resolved_archive_file, tempdir))
# with tarfile.open(resolved_archive_file, 'r:gz') as archive:
# archive.extractall(tempdir)
# serialization_dir = tempdir
# config_file = os.path.join(serialization_dir, CONFIG_NAME)
# if not os.path.exists(config_file):
# # Backward compatibility with old naming format
# config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
# Load config
config_file = os.path.join(serialization_dir, CONFIG_NAME)
if not os.path.exists(config_file):
# Backward compatibility with old naming format
config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
config = BertConfig.from_json_file(config_file)
config = BertConfig.from_json_file(resolved_config_file)
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf:
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict = torch.load(weights_path, map_location='cpu')
if tempdir:
# Clean up temp dir
shutil.rmtree(tempdir)
# weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict = torch.load(resolved_archive_file, map_location='cpu')
# if tempdir:
# # Clean up temp dir
# shutil.rmtree(tempdir)
if from_tf:
# Directly load from a TensorFlow checkpoint
weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
# weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
return load_tf_weights_in_bert(model, resolved_archive_file)
# Load from a PyTorch state_dict
old_keys = []
@ -662,7 +783,10 @@ class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -676,6 +800,9 @@ class BertModel(BertPreTrainedModel):
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by the `output_all_encoded_layers` argument:
@ -702,14 +829,29 @@ class BertModel(BertPreTrainedModel):
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertModel, self).__init__(config)
self.output_attentions = output_attentions
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.encoder = BertEncoder(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.pooler = BertPooler(config)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
def prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def get_multihead_outputs(self):
""" Gather all multi-head outputs.
Return: list (layers) of multihead module outputs with gradients
"""
return [layer.attention.self.multihead_output for layer in self.encoder.layer]
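# Hedged usage sketch (comments only): pruning heads and inspecting multi-head outputs.
# `config`, `input_ids`, `token_type_ids` and `input_mask` are assumed to be built as in
# the docstring example above, with a standard 12-layer BertConfig.
#   model = BertModel(config, keep_multihead_output=True)
#   model.prune_heads({0: [0, 2], 5: [1]})  # drop heads 0 and 2 of layer 0 and head 1 of layer 5
#   encoded_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
#   multihead_outputs = model.get_multihead_outputs()  # one context tensor per layer, gradients retained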
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
@ -730,14 +872,34 @@ class BertModel(BertPreTrainedModel):
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
# Prepare head mask if needed
# 1.0 in head_mask indicates we mask the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask is broadcast to shape num_hidden_layers x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (1.0 - head_mask)
else:
head_mask = [None] * self.config.num_hidden_layers
embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
output_all_encoded_layers=output_all_encoded_layers,
head_mask=head_mask)
if self.output_attentions:
all_attentions, encoded_layers = encoded_layers
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
if self.output_attentions:
return all_attentions, encoded_layers, pooled_output
return encoded_layers, pooled_output
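# Hedged usage sketch (comments only): masking the last two attention heads of every layer
# at inference time. Per the docstring above, 1.0 marks a head as masked; `model`, `config`
# and the input tensors are assumed to come from the docstring example.
#   head_mask = torch.zeros(config.num_attention_heads)
#   head_mask[-2:] = 1.0  # shape [num_heads], broadcast over all layers
#   encoded_layers, pooled_output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)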
@ -748,7 +910,10 @@ class BertForPreTraining(BertPreTrainedModel):
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -767,6 +932,8 @@ class BertForPreTraining(BertPreTrainedModel):
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
@ -791,15 +958,21 @@ class BertForPreTraining(BertPreTrainedModel):
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertForPreTraining, self).__init__(config)
self.bert = BertModel(config)
self.output_attentions = output_attentions
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None):
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False, head_mask=head_mask)
if self.output_attentions:
all_attentions, sequence_output, pooled_output = outputs
else:
sequence_output, pooled_output = outputs
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
if masked_lm_labels is not None and next_sentence_label is not None:
@ -808,8 +981,9 @@ class BertForPreTraining(BertPreTrainedModel):
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
else:
return prediction_scores, seq_relationship_score
elif self.output_attentions:
return all_attentions, prediction_scores, seq_relationship_score
return prediction_scores, seq_relationship_score
class BertForMaskedLM(BertPreTrainedModel):
@ -817,7 +991,10 @@ class BertForMaskedLM(BertPreTrainedModel):
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -833,6 +1010,12 @@ class BertForMaskedLM(BertPreTrainedModel):
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `masked_lm_labels` is not `None`:
@ -854,23 +1037,31 @@ class BertForMaskedLM(BertPreTrainedModel):
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertForMaskedLM, self).__init__(config)
self.bert = BertModel(config)
self.output_attentions = output_attentions
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False,
head_mask=head_mask)
if self.output_attentions:
all_attentions, sequence_output, _ = outputs
else:
sequence_output, _ = outputs
prediction_scores = self.cls(sequence_output)
if masked_lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
return masked_lm_loss
else:
return prediction_scores
elif self.output_attentions:
return all_attentions, prediction_scores
return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
@ -878,7 +1069,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -894,6 +1088,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `next_sentence_label` is not `None`:
@ -916,23 +1112,31 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertForNextSentencePrediction, self).__init__(config)
self.bert = BertModel(config)
self.output_attentions = output_attentions
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.cls = BertOnlyNSPHead(config)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False)
seq_relationship_score = self.cls( pooled_output)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False,
head_mask=head_mask)
if self.output_attentions:
all_attentions, _, pooled_output = outputs
else:
_, pooled_output = outputs
seq_relationship_score = self.cls(pooled_output)
if next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
return next_sentence_loss
else:
return seq_relationship_score
elif self.output_attentions:
return all_attentions, seq_relationship_score
return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
@ -941,7 +1145,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
@ -957,6 +1164,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
@ -980,16 +1189,22 @@ class BertForSequenceClassification(BertPreTrainedModel):
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False):
super(BertForSequenceClassification, self).__init__(config)
self.output_attentions = output_attentions
self.num_labels = num_labels
self.bert = BertModel(config)
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
if self.output_attentions:
all_attentions, _, pooled_output = outputs
else:
_, pooled_output = outputs
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
@ -997,8 +1212,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
elif self.output_attentions:
return all_attentions, logits
return logits
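# Illustrative sketch (not part of the library; the model name and token ids are assumptions):
# with output_attentions=True, BertForSequenceClassification prepends the per-layer attention
# maps to its return value.
def _example_bert_classifier_attentions():
    import torch
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                           num_labels=2,
                                                           output_attentions=True)
    model.eval()
    input_ids = torch.tensor([[101, 7592, 2088, 102]])  # toy token ids
    with torch.no_grad():
        all_attentions, logits = model(input_ids)
    # all_attentions: one tensor per layer of shape [batch_size, num_heads, seq_len, seq_len]
    return all_attentions, logits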
class BertForMultipleChoice(BertPreTrainedModel):
@ -1007,7 +1223,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
@ -1023,6 +1242,8 @@ class BertForMultipleChoice(BertPreTrainedModel):
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
@ -1045,19 +1266,25 @@ class BertForMultipleChoice(BertPreTrainedModel):
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_choices):
def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False):
super(BertForMultipleChoice, self).__init__(config)
self.output_attentions = output_attentions
self.num_choices = num_choices
self.bert = BertModel(config)
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
if self.output_attentions:
all_attentions, _, pooled_output = outputs
else:
_, pooled_output = outputs
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)
@ -1066,8 +1293,9 @@ class BertForMultipleChoice(BertPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
return loss
else:
return reshaped_logits
elif self.output_attentions:
return all_attentions, reshaped_logits
return reshaped_logits
class BertForTokenClassification(BertPreTrainedModel):
@ -1076,7 +1304,10 @@ class BertForTokenClassification(BertPreTrainedModel):
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
@ -1092,6 +1323,8 @@ class BertForTokenClassification(BertPreTrainedModel):
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
@ -1115,16 +1348,22 @@ class BertForTokenClassification(BertPreTrainedModel):
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False):
super(BertForTokenClassification, self).__init__(config)
self.output_attentions = output_attentions
self.num_labels = num_labels
self.bert = BertModel(config)
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
if self.output_attentions:
all_attentions, sequence_output, _ = outputs
else:
sequence_output, _ = outputs
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
@ -1139,8 +1378,9 @@ class BertForTokenClassification(BertPreTrainedModel):
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
elif self.output_attentions:
return all_attentions, logits
return logits
class BertForQuestionAnswering(BertPreTrainedModel):
@ -1149,7 +1389,10 @@ class BertForQuestionAnswering(BertPreTrainedModel):
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -1168,6 +1411,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
into account for computing the loss.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `start_positions` and `end_positions` are not `None`:
@ -1190,16 +1435,23 @@ class BertForQuestionAnswering(BertPreTrainedModel):
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertForQuestionAnswering, self).__init__(config)
self.bert = BertModel(config)
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.output_attentions = output_attentions
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
end_positions=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False,
head_mask=head_mask)
if self.output_attentions:
all_attentions, sequence_output, _ = outputs
else:
sequence_output, _ = outputs
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
@ -1221,5 +1473,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
return total_loss
else:
return start_logits, end_logits
elif self.output_attentions:
return all_attentions, start_logits, end_logits
return start_logits, end_logits

View File

@ -23,9 +23,6 @@ import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
@ -39,8 +36,34 @@ from .modeling import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
def prune_conv1d_layer(layer, index, dim=1):
""" Prune a Conv1D layer (a model parameters) to keep only entries in index.
A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
Return the pruned layer as a new layer with requires_grad=True.
Used to remove heads.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
if dim == 0:
b = layer.bias.clone().detach()
else:
b = layer.bias[index].clone().detach()
new_size = list(layer.weight.size())
new_size[dim] = len(index)
new_layer = Conv1D(new_size[1], new_size[0])
new_layer.weight.requires_grad = False
new_layer.weight.copy_(W.contiguous())
new_layer.weight.requires_grad = True
new_layer.bias.requires_grad = False
new_layer.bias.copy_(b.contiguous())
new_layer.bias.requires_grad = True
return new_layer
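# Illustrative sketch (not library code; layer sizes are assumptions): pruning the first
# attention head's 64 features out of a GPT-2 style output projection with prune_conv1d_layer.
def _example_prune_conv1d_layer():
    import torch
    n_embd, n_head = 768, 12
    head_size = n_embd // n_head              # 64 features per head
    c_proj = Conv1D(n_embd, n_embd)           # weight stored as [nx=768, nf=768]
    keep = torch.arange(head_size, n_embd)    # indices of the input features we keep (heads 1..11)
    pruned = prune_conv1d_layer(c_proj, keep, dim=0)
    # pruned.weight.shape == (704, 768): the input dimension lost one head's slice
    return pruned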
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model
@ -107,18 +130,24 @@ class GPT2Config(object):
def __init__(
self,
vocab_size_or_config_json_file=50257,
n_special=0,
n_positions=1024,
n_ctx=1024,
n_embd=768,
n_layer=12,
n_head=12,
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
predict_special_tokens=True
):
"""Constructs GPT2Config.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLS]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
@ -126,8 +155,14 @@ class GPT2Config(object):
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: should we predict special tokens (when the model has an LM head)
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
@ -137,19 +172,28 @@ class GPT2Config(object):
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.n_special = n_special
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.predict_special_tokens = predict_special_tokens
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property
def total_tokens_embeddings(self):
return self.vocab_size + self.n_special
@classmethod
def from_dict(cls, json_object):
"""Constructs a `GPT2Config` from a Python dictionary of parameters."""
@ -200,7 +244,7 @@ class Conv1D(nn.Module):
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
@ -209,10 +253,31 @@ class Attention(nn.Module):
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = output_attentions
self.keep_multihead_output = keep_multihead_output
self.multihead_output = None
self.c_attn = Conv1D(n_state * 3, nx)
self.c_proj = Conv1D(n_state, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def _attn(self, q, k, v):
def prune_heads(self, heads):
mask = torch.ones(self.n_head, self.split_size // self.n_head)
for head in heads:
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
# Update hyper params
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
self.n_head = self.n_head - len(heads)
def _attn(self, q, k, v, head_mask=None):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
@ -221,6 +286,14 @@ class Attention(nn.Module):
w = w * b - 1e4 * (1 - b)
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
# Mask heads if we want to
if head_mask is not None:
w = w * head_mask
if self.output_attentions:
return w, torch.matmul(w, v)
return torch.matmul(w, v)
def merge_heads(self, x):
@ -236,7 +309,7 @@ class Attention(nn.Module):
else:
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
def forward(self, x, layer_past=None):
def forward(self, x, layer_past=None, head_mask=None):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
@ -247,9 +320,19 @@ class Attention(nn.Module):
key = torch.cat((past_key, key), dim=-1)
value = torch.cat((past_value, value), dim=-2)
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
a = self._attn(query, key, value)
a = self._attn(query, key, value, head_mask)
if self.keep_multihead_output:
self.multihead_output = a
self.multihead_output.retain_grad()
if self.output_attentions:
attentions, a = a
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a)
if self.output_attentions:
return attentions, a, present
return a, present
@ -260,27 +343,35 @@ class MLP(nn.Module):
self.c_fc = Conv1D(n_state, nx)
self.c_proj = Conv1D(nx, n_state)
self.act = gelu
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, x):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
return h2
return self.dropout(h2)
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Block, self).__init__()
nx = config.n_embd
self.output_attentions = output_attentions
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.attn = Attention(nx, n_ctx, config, scale)
self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
def forward(self, x, layer_past=None):
a, present = self.attn(self.ln_1(x), layer_past=layer_past)
def forward(self, x, layer_past=None, head_mask=None):
output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
if self.output_attentions:
attentions, a, present = output_attn
else:
a, present = output_attn
x = x + a
m = self.mlp(self.ln_2(x))
x = x + m
if self.output_attentions:
return attentions, x, present
return x, present
@ -290,17 +381,20 @@ class GPT2LMHead(nn.Module):
def __init__(self, model_embeddings_weights, config):
super(GPT2LMHead, self).__init__()
self.n_embd = config.n_embd
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights):
self.vocab_size = config.vocab_size
self.predict_special_tokens = config.predict_special_tokens
embed_shape = model_embeddings_weights.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
self.predict_special_tokens = predict_special_tokens
self.decoder.weight = model_embeddings_weights # Tied weights
def forward(self, hidden_state):
# Truncated Language modeling logits (we remove the last token)
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(hidden_state)
if not self.predict_special_tokens:
lm_logits = lm_logits[..., :self.vocab_size]
return lm_logits
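# Illustrative sketch (shapes and values are assumptions): when predict_special_tokens is
# turned off, the LM head slices off the logits of the fine-tuning-only special tokens.
def _example_predict_special_tokens():
    import torch
    config = GPT2Config(n_special=3)
    wte = torch.nn.Embedding(config.total_tokens_embeddings, config.n_embd)
    lm_head = GPT2LMHead(wte.weight, config)
    lm_head.set_embeddings_weights(wte.weight, predict_special_tokens=False)
    hidden = torch.zeros(1, 5, config.n_embd)
    lm_logits = lm_head(hidden)
    # lm_logits.size(-1) == config.vocab_size: the 3 special-token logits are dropped
    return lm_logits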
@ -310,6 +404,7 @@ class GPT2MultipleChoiceHead(nn.Module):
def __init__(self, config):
super(GPT2MultipleChoiceHead, self).__init__()
self.n_embd = config.n_embd
self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation
self.linear = nn.Linear(config.n_embd, 1)
nn.init.normal_(self.linear.weight, std=0.02)
@ -323,6 +418,7 @@ class GPT2MultipleChoiceHead(nn.Module):
# (bsz, num_choices, 1, hidden_size)
multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
# (bsz, num_choices, hidden_size)
multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
# (bsz, num_choices)
return multiple_choice_logits
@ -345,9 +441,6 @@ class GPT2PreTrainedModel(nn.Module):
)
self.config = config
def set_tied(self):
pass
def init_weights(self, module):
""" Initialize the weights.
"""
@ -362,9 +455,7 @@ class GPT2PreTrainedModel(nn.Module):
module.bias.data.zero_()
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
):
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
"""
Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
@ -382,8 +473,17 @@ class GPT2PreTrainedModel(nn.Module):
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific GPT class
*inputs, **kwargs: additional input for the specific GPT2 class
"""
state_dict = kwargs.get('state_dict', None)
kwargs.pop('state_dict', None)
cache_dir = kwargs.get('cache_dir', None)
kwargs.pop('cache_dir', None)
from_tf = kwargs.get('from_tf', False)
kwargs.pop('from_tf', None)
num_special_tokens = kwargs.get('num_special_tokens', None)
kwargs.pop('num_special_tokens', None)
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
@ -393,16 +493,37 @@ class GPT2PreTrainedModel(nn.Module):
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file
)
)
return None
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file, config_file
if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
config_file
)
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
@ -475,16 +596,37 @@ class GPT2PreTrainedModel(nn.Module):
"Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
)
# Make sure we are still sharing the output and input embeddings after loading weights
model.set_tied()
# Add additional embeddings for special tokens if needed
# This step also makes sure we are still sharing the output and input embeddings after loading weights
model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
return model
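# Illustrative sketch (pre-trained shortcut name taken from the archive map above): from_pretrained
# now accepts num_special_tokens and resizes the tied embedding matrix accordingly.
def _example_from_pretrained_with_special_tokens():
    model = GPT2LMHeadModel.from_pretrained('gpt2', num_special_tokens=2)
    # model.transformer.wte.num_embeddings == model.config.vocab_size + 2
    return model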
class GPT2Model(GPT2PreTrainedModel):
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
GPT-2 uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embedding matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
config: a GPT2Config class instance with the configuration to build a new model
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@ -499,10 +641,12 @@ class GPT2Model(GPT2PreTrainedModel):
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs a tuple consisting of:
`hidden_states`: the encoded-hidden-states at the top of the model
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
torch.FloatTensors. They can be reused to speed up sequential decoding.
@ -519,17 +663,47 @@ class GPT2Model(GPT2PreTrainedModel):
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(GPT2Model, self).__init__(config)
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
self.output_attentions = output_attentions
self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
block = Block(config.n_ctx, config, scale=True)
self.drop = nn.Dropout(config.embd_pdrop)
block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.apply(self.init_weights)
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
def set_num_special_tokens(self, num_special_tokens):
" Update input embeddings with new embedding matrice if needed "
if self.config.n_special == num_special_tokens:
return
# Update config
self.config.n_special = num_special_tokens
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
old_embed = self.wte
self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
self.wte.to(old_embed.weight.device)
self.init_weights(self.wte)
# Copy word embeddings from the previous weights
self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
def prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def get_multihead_outputs(self):
""" Gather all multi-head outputs.
Return: list (layers) of multihead module outputs with gradients
"""
return [h.attn.multihead_output for h in self.h]
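# Illustrative sketch (layer/head indices and token ids are assumptions): removing attention
# heads and collecting the per-layer multi-head outputs retained by keep_multihead_output.
def _example_prune_and_inspect_heads():
    import torch
    model = GPT2Model.from_pretrained('gpt2', keep_multihead_output=True)
    model.eval()
    model.prune_heads({0: [0, 2], 5: [1]})        # drop heads 0 and 2 of layer 0, head 1 of layer 5
    input_ids = torch.tensor([[464, 3290, 318]])  # toy token ids
    hidden_states, presents = model(input_ids)
    multihead_outputs = model.get_multihead_outputs()  # one tensor per layer, gradients retained
    return hidden_states[-1], multihead_outputs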
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
if past is None:
past_length = 0
past = [None] * len(self.h)
@ -539,6 +713,21 @@ class GPT2Model(GPT2PreTrainedModel):
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicates we mask the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (1.0 - head_mask)
else:
head_mask = [None] * self.config.n_layer
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
@ -551,20 +740,38 @@ class GPT2Model(GPT2PreTrainedModel):
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
presents = []
for block, layer_past in zip(self.h, past):
hidden_states, present = block(hidden_states, layer_past)
all_attentions = []
all_hidden_states = []
for i, (block, layer_past) in enumerate(zip(self.h, past)):
all_hidden_states.append(hidden_states.view(*output_shape))
outputs = block(hidden_states, layer_past, head_mask[i])
if self.output_attentions:
attentions, hidden_states, present = outputs
all_attentions.append(attentions)
else:
hidden_states, present = outputs
presents.append(present)
hidden_states = self.ln_f(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape), presents
all_hidden_states.append(hidden_states.view(*output_shape))
if self.output_attentions:
return all_attentions, all_hidden_states, presents
return all_hidden_states, presents
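# Illustrative sketch (token ids are toy values): GPT2Model now returns the list of all hidden
# states, and head_mask can zero out individual attention heads at run time (1.0 masks a head).
def _example_head_mask_and_hidden_states():
    import torch
    model = GPT2Model.from_pretrained('gpt2')
    model.eval()
    input_ids = torch.tensor([[464, 3290, 318]])
    head_mask = torch.zeros(model.config.n_head)
    head_mask[0] = 1.0                      # mask head 0 in every layer
    with torch.no_grad():
        hidden_states, presents = model(input_ids, head_mask=head_mask)
    # hidden_states is a list of n_layer + 1 tensors; hidden_states[-1] is the top layer
    return hidden_states[-1], presents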
class GPT2LMHeadModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
Params:
config: a GPT2Config class instance with the configuration to build a new model
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@ -582,6 +789,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` is not `None`:
@ -604,30 +813,41 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(GPT2LMHeadModel, self).__init__(config)
self.transformer = GPT2Model(config)
self.transformer = GPT2Model(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
self.apply(self.init_weights)
def set_tied(self):
""" Make sure we are sharing the embeddings
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states, presents = transformer_output
else:
hidden_states, presents = transformer_output
hidden_states = hidden_states[-1]
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
# Shift so that tokens < n predict n
shift_logits = lm_logits[:, :-1].contiguous()
shift_labels = lm_labels[:, 1:].contiguous()
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
return loss
if self.transformer.output_attentions:
return all_attentions, lm_logits, presents
return lm_logits, presents
@ -635,7 +855,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
Params:
config: a GPT2Config class instance with the configuration to build a new model
`config`: a GPT2Config class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
@ -657,6 +880,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
(key and values in the attention blocks) to speed up sequential decoding
(this is the presents output of the model, cf. below).
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
@ -675,37 +900,49 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2LMHeadModel(config)
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(GPT2DoubleHeadsModel, self).__init__(config)
self.transformer = GPT2Model(config)
self.transformer = GPT2Model(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
self.multiple_choice_head = GPT2MultipleChoiceHead(config)
self.apply(self.init_weights)
def set_tied(self):
""" Make sure we are sharing the embeddings
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
position_ids=None, past=None, head_mask=None):
transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states, presents = transformer_output
else:
hidden_states, presents = transformer_output
hidden_states = hidden_states[-1]
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
losses = []
if lm_labels is not None:
shift_logits = lm_logits[:, :-1].contiguous()
shift_labels = lm_labels[:, 1:].contiguous()
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=-1)
losses.append(loss_fct(shift_logits.view(-1,
shift_logits.size(-1)), shift_labels.view(-1)))
losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
if losses:
return losses
if self.transformer.output_attentions:
return all_attentions, lm_logits, mc_logits, presents
return lm_logits, mc_logits, presents

View File

@ -23,9 +23,6 @@ import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
@ -36,6 +33,7 @@ from torch.nn.parameter import Parameter
from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
from .modeling import BertLayerNorm as LayerNorm
from .modeling_gpt2 import prune_conv1d_layer
logger = logging.getLogger(__name__)
@ -143,6 +141,7 @@ class OpenAIGPTConfig(object):
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
predict_special_tokens=True
):
"""Constructs OpenAIGPTConfig.
@ -165,6 +164,7 @@ class OpenAIGPTConfig(object):
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: should we predict special tokens (when the model has an LM head)
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
@ -186,6 +186,7 @@ class OpenAIGPTConfig(object):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.predict_special_tokens = predict_special_tokens
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
@ -253,7 +254,7 @@ class Conv1D(nn.Module):
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
@ -262,12 +263,31 @@ class Attention(nn.Module):
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = output_attentions
self.keep_multihead_output = keep_multihead_output
self.multihead_output = None
self.c_attn = Conv1D(n_state * 3, 1, nx)
self.c_proj = Conv1D(n_state, 1, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def _attn(self, q, k, v):
def prune_heads(self, heads):
mask = torch.ones(self.n_head, self.split_size // self.n_head)
for head in heads:
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
# Update hyper params
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
self.n_head = self.n_head - len(heads)
def _attn(self, q, k, v, head_mask=None):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
@ -278,6 +298,13 @@ class Attention(nn.Module):
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
# Mask heads if we want to
if head_mask is not None:
w = w * head_mask
if self.output_attentions:
return w, torch.matmul(w, v)
return torch.matmul(w, v)
def merge_heads(self, x):
@ -293,16 +320,25 @@ class Attention(nn.Module):
else:
return x.permute(0, 2, 1, 3)
def forward(self, x):
def forward(self, x, head_mask=None):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
a = self._attn(query, key, value)
a = self._attn(query, key, value, head_mask)
if self.keep_multihead_output:
self.multihead_output = a
self.multihead_output.retain_grad()
if self.output_attentions:
attentions, a = a
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a)
if self.output_attentions:
return attentions, a
return a
@ -322,19 +358,24 @@ class MLP(nn.Module):
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Block, self).__init__()
nx = config.n_embd
self.attn = Attention(nx, n_ctx, config, scale)
self.output_attentions = output_attentions
self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
def forward(self, x):
a = self.attn(x)
def forward(self, x, head_mask=None):
a = self.attn(x, head_mask=head_mask)
if self.output_attentions:
attentions, a = a
n = self.ln_1(x + a)
m = self.mlp(n)
h = self.ln_2(n + m)
if self.output_attentions:
return attentions, h
return h
@ -344,17 +385,21 @@ class OpenAIGPTLMHead(nn.Module):
def __init__(self, model_embeddings_weights, config):
super(OpenAIGPTLMHead, self).__init__()
self.n_embd = config.n_embd
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights):
self.vocab_size = config.vocab_size
self.predict_special_tokens = config.predict_special_tokens
embed_shape = model_embeddings_weights.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
self.predict_special_tokens = predict_special_tokens
embed_shape = model_embeddings_weights.shape
self.decoder.weight = model_embeddings_weights # Tied weights
def forward(self, hidden_state):
# Truncated Language modeling logits (we remove the last token)
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(hidden_state)
if not self.predict_special_tokens:
lm_logits = lm_logits[..., :self.vocab_size]
return lm_logits
@ -364,7 +409,6 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
def __init__(self, config):
super(OpenAIGPTMultipleChoiceHead, self).__init__()
self.n_embd = config.n_embd
# self.multiple_choice_token = multiple_choice_token
self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation
self.linear = nn.Linear(config.n_embd, 1)
@ -415,13 +459,8 @@ class OpenAIGPTPreTrainedModel(nn.Module):
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def set_num_special_tokens(self, num_special_tokens):
pass
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
):
def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs):
"""
Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
@ -434,14 +473,20 @@ class OpenAIGPTPreTrainedModel(nn.Module):
. `openai_gpt_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `openai-gpt-config.json` a configuration file for the model
. a series of NumPy files containing OpenAI TensorFlow trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
"""
state_dict = kwargs.get('state_dict', None)
kwargs.pop('state_dict', None)
cache_dir = kwargs.get('cache_dir', None)
kwargs.pop('cache_dir', None)
from_tf = kwargs.get('from_tf', False)
kwargs.pop('from_tf', None)
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
@ -451,16 +496,37 @@ class OpenAIGPTPreTrainedModel(nn.Module):
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file
)
)
return None
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file, config_file
if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
config_file
)
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
@ -560,7 +626,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@ -572,10 +641,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
`hidden_states`: the encoded-hidden-states at the top of the model
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
@ -590,17 +661,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTModel, self).__init__(config)
num_tokens = config.vocab_size + config.n_special
self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
self.output_attentions = output_attentions
self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop)
block = Block(config.n_ctx, config, scale=True)
block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
self.apply(self.init_weights)
# nn.init.normal_(self.embed.weight, std=0.02)
def set_num_special_tokens(self, num_special_tokens):
" Update input embeddings with new embedding matrice if needed "
@ -616,7 +687,20 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
# Copy word embeddings from the previous weights
self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
def forward(self, input_ids, position_ids=None, token_type_ids=None):
def prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def get_multihead_outputs(self):
""" Gather all multi-head outputs.
Return: list (layers) of multihead module outputs with gradients
"""
return [h.attn.multihead_output for h in self.h]
def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
if position_ids is None:
# This was used when we had a single embedding matrix for position and token embeddings
# start = self.config.vocab_size + self.config.n_special
@ -625,6 +709,21 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicates we mask the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (1.0 - head_mask)
else:
head_mask = [None] * self.config.n_layer
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
@ -636,13 +735,25 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
token_type_embeds = self.tokens_embed(token_type_ids)
else:
token_type_embeds = 0
# Add the position information to the input embeddings
# h = e.sum(dim=2)
hidden_states = inputs_embeds + position_embeds + token_type_embeds
for block in self.h:
hidden_states = block(hidden_states)
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape)
all_attentions = []
all_hidden_states = [hidden_states.view(*output_shape)]
for i, block in enumerate(self.h):
outputs = block(hidden_states, head_mask[i])
if self.output_attentions:
attentions, hidden_states = outputs
all_attentions.append(attentions)
else:
hidden_states = outputs
all_hidden_states.append(hidden_states.view(*output_shape))
if self.output_attentions:
return all_attentions, all_hidden_states
return all_hidden_states
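# Illustrative sketch (pre-trained shortcut name and token ids are assumptions): with
# output_attentions=True, OpenAIGPTModel prepends the per-layer attention maps to its outputs.
def _example_openai_gpt_attentions():
    import torch
    model = OpenAIGPTModel.from_pretrained('openai-gpt', output_attentions=True)
    model.eval()
    input_ids = torch.tensor([[616, 5751, 6404]])  # toy token ids
    with torch.no_grad():
        all_attentions, all_hidden_states = model(input_ids)
    # all_attentions: n_layer tensors of shape [batch_size, n_head, seq_len, seq_len]
    # all_hidden_states: n_layer + 1 tensors (embedding output first, top layer last)
    return all_attentions, all_hidden_states[-1]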
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
@ -666,7 +777,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attention weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@ -681,6 +795,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` is not `None`:
@ -701,21 +817,27 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTLMHeadModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states = hidden_states
hidden_states = hidden_states[-1]
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
# Shift so that tokens < n predict n
@ -726,6 +848,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
return loss
if self.transformer.output_attentions:
return all_attentions, lm_logits
return lm_logits
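Since the language-modeling head now accepts the same `head_mask` as the base model, masking heads at inference time might look like the sketch below; the token ids are arbitrary illustrations, and the mask convention (1.0 nullifies a head) is the one documented in this commit:

```python
import torch
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel

model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

# One row per layer, one column per head; 1.0 nullifies a head in this commit's convention.
head_mask = torch.zeros(model.config.n_layer, model.config.n_head)
head_mask[0, 1:] = 1.0  # keep only the first head of the first layer active

input_ids = torch.tensor([[481, 2585, 3566]])  # arbitrary token ids, for illustration only
with torch.no_grad():
    lm_logits = model(input_ids, head_mask=head_mask)
```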
@ -750,7 +874,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
@ -769,6 +896,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
is only computed for the labels set in [0, ..., total_tokens_embeddings]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
It's a mask used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
@ -785,27 +914,34 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTLMHeadModel(config)
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
```
"""
def __init__(self, config):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
position_ids=None, head_mask=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states = hidden_states
hidden_states = hidden_states[-1]
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
losses = []
@ -819,4 +955,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
if losses:
return losses
if self.transformer.output_attentions:
return all_attentions, lm_logits, mc_logits
return lm_logits, mc_logits
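The new `predict_special_tokens` flag threads through both heads. A hypothetical use, with the token count chosen purely for illustration, is adding delimiter embeddings for fine-tuning while keeping the LM head from scoring them:

```python
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel

model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

# Hypothetical: reserve three special-token embeddings (e.g. start / delimiter / classify)
# but exclude them from the language-modeling softmax.
model.set_num_special_tokens(3, predict_special_tokens=False)
```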

View File

@ -25,9 +25,6 @@ import copy
import json
import math
import logging
import tarfile
import tempfile
import shutil
import collections
import sys
from io import open
@ -888,8 +885,7 @@ class TransfoXLPreTrainedModel(nn.Module):
pass
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
from_tf=False, *inputs, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
"""
Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
@ -897,19 +893,25 @@ class TransfoXLPreTrainedModel(nn.Module):
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `transfo-xl`
. `transfo-xl-wt103`
- a path or url to a pretrained model archive containing:
. `transfo_xl_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `transfo_xl_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
*inputs, **kwargs: additional input for the specific TransformerXL class
"""
state_dict = kwargs.get('state_dict', None)
kwargs.pop('state_dict', None)
cache_dir = kwargs.get('cache_dir', None)
kwargs.pop('cache_dir', None)
from_tf = kwargs.get('from_tf', False)
kwargs.pop('from_tf', None)
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
@ -919,16 +921,37 @@ class TransfoXLPreTrainedModel(nn.Module):
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file
)
)
return None
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
archive_file, config_file))
if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find file {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
config_file
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))

View File

@ -114,10 +114,10 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
logit = self._compute_logit(hidden, self.out_layers[0].weight,
self.out_layers[0].bias, self.out_projs[0])
if target is not None:
output = -F.log_softmax(logit, dim=-1) \
out = -F.log_softmax(logit, dim=-1) \
.gather(1, target.unsqueeze(1)).squeeze(1)
else:
output = F.log_softmax(logit, dim=-1)
out = F.log_softmax(logit, dim=-1)
else:
# construct weights and biases
weights, biases = [], []
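The hunk is a pure rename (`output` to `out`); the expression itself is the standard gather-based negative log-likelihood, and the following self-contained check illustrates the identity it relies on:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logit = torch.randn(4, 10)           # [batch, vocab]
target = torch.randint(0, 10, (4,))  # [batch]

# Per-example NLL via log_softmax + gather, as in ProjectedAdaptiveLogSoftmax.
out = -F.log_softmax(logit, dim=-1).gather(1, target.unsqueeze(1)).squeeze(1)
reference = F.cross_entropy(logit, target, reduction='none')
assert torch.allclose(out, reference)  # identical up to floating-point noise
```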

View File

@ -34,6 +34,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
@ -43,6 +46,9 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
'bert-base-german-cased': 512,
'bert-large-uncased-whole-word-masking': 512,
'bert-large-cased-whole-word-masking': 512,
}
VOCAB_NAME = 'vocab.txt'
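Once both maps know about the new checkpoints, the shortcut names load like any other; network access to the URLs above is still required, and passing `do_lower_case=False` for the cased German model is my assumption rather than part of the diff:

```python
from pytorch_pretrained_bert import BertTokenizer

wwm_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
german_tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=False)
```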
@ -175,13 +181,18 @@ class BertTokenizer(object):
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))

View File

@ -37,9 +37,11 @@ logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
@ -91,7 +93,7 @@ class GPT2Tokenizer(object):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Instantiate a GPT2Tokenizer from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
@ -111,14 +113,19 @@ class GPT2Tokenizer(object):
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
@ -263,9 +270,14 @@ class GPT2Tokenizer(object):
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):
text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
if clean_up_tokenization_spaces:
text = text.replace('<unk>', '')
text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
return text
def save_vocabulary(self, vocab_path):
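A sketch of the new decode options on the GPT-2 tokenizer; the exact output strings depend on the BPE merges and are not verified here:

```python
from pytorch_pretrained_bert import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ids = tokenizer.encode("Hello , world !")

print(tokenizer.decode(ids, clean_up_tokenization_spaces=False))  # raw byte-level text
print(tokenizer.decode(ids))  # cleaned: spaces before punctuation removed, '<unk>' stripped
```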

View File

@ -101,14 +101,19 @@ class OpenAIGPTTokenizer(object):
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
@ -272,7 +277,7 @@ class OpenAIGPTTokenizer(object):
out_string = ''.join(tokens).replace('</w>', ' ').strip()
if clean_up_tokenization_spaces:
out_string = out_string.replace('<unk>', '')
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
return out_string

View File

@ -71,14 +71,19 @@ class TransfoXLTokenizer(object):
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))

View File

@ -41,6 +41,7 @@ class GPT2ModelTest(unittest.TestCase):
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_positions=33,
n_embd=32,
n_layer=5,
@ -58,6 +59,7 @@ class GPT2ModelTest(unittest.TestCase):
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
@ -69,7 +71,8 @@ class GPT2ModelTest(unittest.TestCase):
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
total_num_tokens = self.vocab_size + self.n_special
input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
position_ids = None
if self.use_position_ids:
@ -90,6 +93,7 @@ class GPT2ModelTest(unittest.TestCase):
config = GPT2Config(
vocab_size_or_config_json_file=self.vocab_size,
n_special=self.n_special,
n_positions=self.n_positions,
n_embd=self.n_embd,
n_layer=self.n_layer,
@ -111,8 +115,9 @@ class GPT2ModelTest(unittest.TestCase):
return outputs
def check_gpt2_model_output(self, result):
self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
self.parent.assertListEqual(
list(result["hidden_states"].size()),
list(result["hidden_states"][0].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
@ -129,11 +134,29 @@ class GPT2ModelTest(unittest.TestCase):
}
return outputs
def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = GPT2LMHeadModel(config, output_attentions=True)
model.eval()
loss = model(input_ids, position_ids, token_type_ids, lm_labels)
attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"presents": presents,
"attentions": attentions,
}
return outputs
def check_gpt2_lm_head_output(self, result):
total_voc = self.vocab_size
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertEqual(self.n_layer, len(result["presents"]))
self.parent.assertListEqual(
list(result["presents"][0].size()),
[2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head])
def check_gpt2_lm_head_loss_output(self, result):
self.parent.assertListEqual(
@ -156,8 +179,25 @@ class GPT2ModelTest(unittest.TestCase):
}
return outputs
def create_gpt2_double_heads_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = GPT2DoubleHeadsModel(config, output_attentions=True)
model.eval()
loss = model(input_ids, mc_token_ids,
lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"mc_logits": mc_logits,
"presents": presents,
"attentions": attentions,
}
return outputs
def check_gpt2_double_heads_output(self, result):
total_voc = self.vocab_size
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
@ -170,6 +210,98 @@ class GPT2ModelTest(unittest.TestCase):
[list(l.size()) for l in result["loss"]],
[[], []])
def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
model = model_class(config=config, keep_multihead_output=True)
model.eval()
head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device)
head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer
head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer
if isinstance(model, GPT2DoubleHeadsModel):
output = model(input_ids, mc_token_ids, head_mask=head_mask)
else:
output = model(input_ids, head_mask=head_mask)
if isinstance(model, GPT2Model):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output[:-1])
output = output.sum()
output.backward()
multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.n_layer)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[0][:, 0, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
self.parent.assertEqual(
len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[1].nonzero()),
multihead_outputs[1].numel())
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[-1][:, 0, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
model = model_class(config=config, keep_multihead_output=True)
model.eval()
transformer = model if isinstance(model, GPT2Model) else model.transformer
heads_to_prune = {0: list(range(1, self.n_head)),
-1: [0]}
transformer.prune_heads(heads_to_prune)
if isinstance(model, GPT2DoubleHeadsModel):
output = model(input_ids, mc_token_ids)
else:
output = model(input_ids)
if isinstance(model, GPT2Model):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output[:-1])
output = output.sum()
output.backward()
multihead_outputs = transformer.get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.n_layer)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size * self.n_choices, 1,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size * self.n_choices, self.n_head-1,
self.seq_length, self.n_embd // self.n_head])
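Outside the test harness, the same entry point can be called directly on a model. A sketch with arbitrary layer and head indices; per the checks above, pruned layers simply expose fewer heads afterwards:

```python
import torch
from pytorch_pretrained_bert import GPT2Model

model = GPT2Model.from_pretrained('gpt2')
model.eval()

# Same dictionary shape as heads_to_prune in the tests: layer index -> list of head indices.
model.prune_heads({0: [2, 3], 11: [0]})

input_ids = torch.tensor([[464, 3290, 318]])  # arbitrary token ids for illustration
with torch.no_grad():
    hidden_states, presents = model(input_ids)  # hidden_states: one tensor per block output
```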
def test_default(self):
self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
@ -208,6 +340,9 @@ class GPT2ModelTest(unittest.TestCase):
tester.check_gpt2_double_heads_output(output_result)
tester.check_gpt2_double_heads_loss_output(output_result)
tester.create_and_check_gpt2_for_headmasking(*config_and_inputs)
tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""

View File

@ -125,8 +125,9 @@ class OpenAIGPTModelTest(unittest.TestCase):
return outputs
def check_openai_model_output(self, result):
self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
self.parent.assertListEqual(
list(result["hidden_states"].size()),
list(result["hidden_states"][0].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
@ -182,6 +183,99 @@ class OpenAIGPTModelTest(unittest.TestCase):
[list(l.size()) for l in result["loss"]],
[[], []])
def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
model = model_class(config=config, keep_multihead_output=True)
model.eval()
head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device)
head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer
head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer
if isinstance(model, OpenAIGPTDoubleHeadsModel):
output = model(input_ids, mc_token_ids, head_mask=head_mask)
else:
output = model(input_ids, head_mask=head_mask)
if isinstance(model, OpenAIGPTModel):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output)
output = output.sum()
output.backward()
multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.n_layer)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[0][:, 0, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
self.parent.assertEqual(
len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[1].nonzero()),
multihead_outputs[1].numel())
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertEqual(
len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[-1][:, 0, :, :].nonzero()),
self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
model = model_class(config=config, keep_multihead_output=True)
model.eval()
transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer
heads_to_prune = {0: list(range(1, self.n_head)),
-1: [0]}
transformer.prune_heads(heads_to_prune)
if isinstance(model, OpenAIGPTDoubleHeadsModel):
output = model(input_ids, mc_token_ids)
else:
output = model(input_ids)
if isinstance(model, OpenAIGPTModel):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output)
output = output.sum()
output.backward()
multihead_outputs = transformer.get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.n_layer)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size * self.n_choices, 1,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size * self.n_choices, self.n_head,
self.seq_length, self.n_embd // self.n_head])
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size * self.n_choices, self.n_head-1,
self.seq_length, self.n_embd // self.n_head])
def test_default(self):
self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
@ -220,6 +314,9 @@ class OpenAIGPTModelTest(unittest.TestCase):
tester.check_openai_double_heads_output(output_result)
tester.check_openai_double_heads_loss_output(output_result)
tester.create_and_check_openai_for_headmasking(*config_and_inputs)
tester.create_and_check_openai_for_head_pruning(*config_and_inputs)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""

View File

@ -28,7 +28,7 @@ import torch
from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification)
BertForTokenClassification, BertForMultipleChoice)
from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
@ -56,6 +56,7 @@ class BertModelTest(unittest.TestCase):
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None):
self.parent = parent
self.batch_size = batch_size
@ -77,6 +78,7 @@ class BertModelTest(unittest.TestCase):
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
@ -92,9 +94,11 @@ class BertModelTest(unittest.TestCase):
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = BertModelTest.ids_tensor([self.batch_size], self.num_choices)
config = BertConfig(
vocab_size_or_config_json_file=self.vocab_size,
@ -109,14 +113,14 @@ class BertModelTest(unittest.TestCase):
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertModel(config=config)
model.eval()
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
@ -137,7 +141,7 @@ class BertModelTest(unittest.TestCase):
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForMaskedLM(config=config)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, token_labels)
@ -153,7 +157,7 @@ class BertModelTest(unittest.TestCase):
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForNextSentencePrediction(config=config)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
@ -170,7 +174,7 @@ class BertModelTest(unittest.TestCase):
[self.batch_size, 2])
def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForPreTraining(config=config)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
@ -191,7 +195,7 @@ class BertModelTest(unittest.TestCase):
[self.batch_size, 2])
def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForQuestionAnswering(config=config)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
@ -212,7 +216,7 @@ class BertModelTest(unittest.TestCase):
[self.batch_size, self.seq_length])
def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
@ -229,7 +233,7 @@ class BertModelTest(unittest.TestCase):
[self.batch_size, self.num_labels])
def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForTokenClassification(config=config, num_labels=self.num_labels)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, token_labels)
@ -246,6 +250,150 @@ class BertModelTest(unittest.TestCase):
[self.batch_size, self.seq_length, self.num_labels])
def create_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForMultipleChoice(config=config, num_choices=self.num_choices)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
loss = model(multiple_choice_inputs_ids,
multiple_choice_token_type_ids,
multiple_choice_input_mask,
choice_labels)
logits = model(multiple_choice_inputs_ids,
multiple_choice_token_type_ids,
multiple_choice_input_mask)
outputs = {
"loss": loss,
"logits": logits,
}
return outputs
def check_bert_for_multiple_choice(self, result):
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_choices])
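Read outside the tester, the new head behaves as a standalone module. A minimal shape-check sketch with a tiny hypothetical configuration, values chosen only to keep the example fast:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForMultipleChoice

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=4, intermediate_size=37)
model = BertForMultipleChoice(config, num_choices=4)
model.eval()

input_ids = torch.randint(0, 99, (2, 4, 7))   # [batch, num_choices, seq_len]
token_type_ids = torch.zeros_like(input_ids)
attention_mask = torch.ones_like(input_ids)

logits = model(input_ids, token_type_ids, attention_mask)
print(logits.shape)  # expected: torch.Size([2, 4]), one score per choice
```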
def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification):
if model_class in [BertForSequenceClassification,
BertForTokenClassification]:
model = model_class(config=config, num_labels=self.num_labels, output_attentions=True)
else:
model = model_class(config=config, output_attentions=True)
model.eval()
output = model(input_ids, token_type_ids, input_mask)
attentions = output[0]
self.parent.assertEqual(len(attentions), self.num_hidden_layers)
self.parent.assertListEqual(
list(attentions[0].size()),
[self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length])
def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification):
if model_class in [BertForSequenceClassification,
BertForTokenClassification]:
model = model_class(config=config,
num_labels=self.num_labels,
keep_multihead_output=True)
else:
model = model_class(config=config, keep_multihead_output=True)
model.eval()
head_mask = torch.zeros(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device)
head_mask[0, 1:-1] = 1.0 # Mask all but the first and last heads on the first layer
head_mask[-1, 1:] = 1.0 # Mask all but the first head on the last layer
output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)
if isinstance(model, BertModel):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output)
output = output.sum()
output.backward()
multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
self.parent.assertEqual(
len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[0][:, 0, :, :].nonzero()),
self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
self.parent.assertEqual(
len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
self.parent.assertEqual(
len(multihead_outputs[1].nonzero()),
multihead_outputs[1].numel())
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
self.parent.assertEqual(
len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
0)
self.parent.assertEqual(
len(multihead_outputs[-1][:, 0, :, :].nonzero()),
self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification):
if model_class in [BertForSequenceClassification,
BertForTokenClassification]:
model = model_class(config=config,
num_labels=self.num_labels,
keep_multihead_output=True)
else:
model = model_class(config=config, keep_multihead_output=True)
model.eval()
bert_model = model if isinstance(model, BertModel) else model.bert
heads_to_prune = {0: list(range(1, self.num_attention_heads)),
-1: [0]}
bert_model.prune_heads(heads_to_prune)
output = model(input_ids, token_type_ids, input_mask)
if isinstance(model, BertModel):
output = sum(t.sum() for t in output[0])
elif isinstance(output, (list, tuple)):
output = sum(t.sum() for t in output)
output = output.sum()
output.backward()
multihead_outputs = bert_model.get_multihead_outputs()
self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
self.parent.assertListEqual(
list(multihead_outputs[0].size()),
[self.batch_size, 1,
self.seq_length, self.hidden_size // self.num_attention_heads])
self.parent.assertListEqual(
list(multihead_outputs[1].size()),
[self.batch_size, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
self.parent.assertListEqual(
list(multihead_outputs[-1].size()),
[self.batch_size, self.num_attention_heads-1,
self.seq_length, self.hidden_size // self.num_attention_heads])
def test_default(self):
self.run_tester(BertModelTest.BertModelTester(self))
@ -300,6 +448,14 @@ class BertModelTest(unittest.TestCase):
tester.check_bert_for_token_classification_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_multiple_choice(*config_and_inputs)
tester.check_bert_for_multiple_choice(output_result)
tester.check_loss_output(output_result)
tester.create_and_check_bert_for_attentions(*config_and_inputs)
tester.create_and_check_bert_for_headmasking(*config_and_inputs)
tester.create_and_check_bert_for_head_pruning(*config_and_inputs)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""