diff --git a/.circleci/config.yml b/.circleci/config.yml index f42e3023afb..1bb3fd0877f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,33 +1,77 @@ version: 2 jobs: - build_py3: - working_directory: ~/pytorch-transformers + build_py3_torch_and_tf: + working_directory: ~/transformers docker: - image: circleci/python:3.5 resource_class: xlarge parallelism: 1 steps: - checkout + - run: sudo pip install torch + - run: sudo pip install tensorflow==2.0.0-rc0 - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov + - run: python -m pytest -sv ./transformers/tests/ --cov + - run: codecov + build_py3_torch: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - run: sudo pip install torch + - run: sudo pip install --progress-bar off . + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install tensorboardX scikit-learn + - run: python -m pytest -sv ./transformers/tests/ --cov - run: python -m pytest -sv ./examples/ - run: codecov - build_py2: - working_directory: ~/pytorch-transformers + build_py3_tf: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - run: sudo pip install tensorflow==2.0.0-rc0 + - run: sudo pip install --progress-bar off . + - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install tensorboardX scikit-learn + - run: python -m pytest -sv ./transformers/tests/ --cov + - run: codecov + build_py2_torch: + working_directory: ~/transformers resource_class: large parallelism: 1 docker: - image: circleci/python:2.7 steps: - checkout + - run: sudo pip install torch - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov + - run: python -m pytest -sv ./transformers/tests/ --cov + - run: codecov + build_py2_tf: + working_directory: ~/transformers + resource_class: large + parallelism: 1 + docker: + - image: circleci/python:2.7 + steps: + - checkout + - run: sudo pip install tensorflow==2.0.0-rc0 + - run: sudo pip install --progress-bar off . 
+ - run: sudo pip install pytest codecov pytest-cov + - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov deploy_doc: - working_directory: ~/pytorch-transformers + working_directory: ~/transformers docker: - image: circleci/python:3.5 steps: @@ -48,6 +92,9 @@ workflows: version: 2 build_and_test: jobs: - - build_py3 - - build_py2 + - build_py3_torch_and_tf + - build_py3_torch + - build_py3_tf + - build_py2_torch + - build_py2_tf - deploy_doc: *workflow_filters \ No newline at end of file diff --git a/.coveragerc b/.coveragerc index fa6c165a8a7..9a1103b8af3 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,5 @@ [run] -source=pytorch_transformers +source=transformers omit = # skip convertion scripts from testing for now */convert_* diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md index cf0c9a47576..8ce1bc8fdd5 100644 --- a/.github/ISSUE_TEMPLATE/migration.md +++ b/.github/ISSUE_TEMPLATE/migration.md @@ -1,6 +1,6 @@ --- name: "\U0001F4DA Migration from PyTorch-pretrained-Bert" -about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers +about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers --- ## 📚 Migration diff --git a/.gitignore b/.gitignore index d285d0ded93..e673ce5f47b 100644 --- a/.gitignore +++ b/.gitignore @@ -130,5 +130,5 @@ runs examples/runs # data -data +/data serialization_dir \ No newline at end of file diff --git a/README.md b/README.md index 9187250c194..2a88aaeaf7b 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,61 @@ -# 👾 PyTorch-Transformers +

+[Transformers logo and badges: Build, GitHub, Documentation, GitHub release]
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers) +State-of-the-art Natural Language Processing (NLP) for TensorFlow 2.0 and PyTorch. -PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). +🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures (BERT, GPT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with more than 32+ pretrained checkpoints in 100+ languages. -The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models: +Features +- As easy to use as pytorch-transformers +- As powerful and concise as Keras +- High performance on NLU and NLG tasks +- Low barrier to entry for educators and practitioners -1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. -2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. -7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5 -) by Victor Sanh, Lysandre Debut and Thomas Wolf. 
+State-of-the-art NLP for everyone +- Deep learning researchers +- Hands-on practitioners +- AI/ML/NLP teachers and educators + +Lower compute costs, smaller carbon footprint +- Researchers can share trained models instead of always retraining +- Practitioners can reduce compute time and production costs +- 8 architectures with over 30 pretrained models, some in more than 100 languages + +Choose the right framework for every part of a model's lifetime +- Train state-of-the-art models in 3 lines of code +- Deep interoperability between TensorFlow 2.0 and PyTorch models +- Move a single model between TF2.0/PyTorch frameworks at will +- Seamlessly pick the right framework for training, evaluation, production -These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html). | Section | Description | |-|-| | [Installation](#installation) | How to install the package | +| [Model architectures](#model-architectures) | Architectures (with pretrained weights) | | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities | | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | +| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-2.0-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | -| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers | -| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more | +| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers | +| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers | +| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more | ## Installation @@ -33,10 +63,10 @@ This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3 ### With pip -PyTorch-Transformers can be installed by pip as follows: +Transformers can be installed by pip as follows: ```bash -pip install pytorch-transformers +pip install transformers ``` ### From source @@ -49,14 +79,14 @@ pip install [--editable] . ### Tests -A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples). +A series of tests is included for the library and the example scripts. 
Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples). These tests can be run using `pytest` (install pytest if needed with `pip install pytest`). You can run the tests from the root of the cloned repository with the commands: ```bash -python -m pytest -sv ./pytorch_transformers/tests/ +python -m pytest -sv ./transformers/tests/ python -m pytest -sv ./examples/ ``` @@ -69,6 +99,22 @@ It contains an example of a conversion script from a Pytorch trained Transformer At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! +## Model architectures + +🤗 Transformers currently provides 8 NLU/NLG architectures: + +1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. +3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. +4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. +5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. +6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. +7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. +8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5 +) by Victor Sanh, Lysandre Debut and Thomas Wolf. 
+ +These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html). + ## Online demo **[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities. @@ -80,22 +126,25 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo ## Quick tour -Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/). +Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/). ```python import torch -from pytorch_transformers import * +from transformers import * -# PyTorch-Transformers has a unified API -# for 7 transformer architectures and 30 pretrained weights. +# Transformers has a unified API +# for 8 transformer architectures and 30 pretrained weights. # Model | Tokenizer | Pretrained weights shortcut -MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), - (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'), - (GPT2Model, GPT2Tokenizer, 'gpt2'), - (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'), - (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), - (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'), - (RobertaModel, RobertaTokenizer, 'roberta-base')] +MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), + (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'), + (GPT2Model, GPT2Tokenizer, 'gpt2'), + (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'), + (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), + (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'), + (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'), + (RobertaModel, RobertaTokenizer, 'roberta-base')] + +# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel` # Let's encode some text in a sequence of hidden-states using each model: for model_class, tokenizer_class, pretrained_weights in MODELS: @@ -141,6 +190,53 @@ tokenizer = tokenizer_class.from_pretrained('./directory/to/save/') # re-load # SOTA examples for GLUE, SQUAD, text generation... ``` +## Quick tour TF 2.0 training and PyTorch interoperability + +Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests. 
+ +```python +import tensorflow as tf +import tensorflow_datasets +from transformers import * + +# Load dataset, tokenizer, model from pretrained model/vocabulary +tokenizer = BertTokenizer.from_pretrained('bert-base-cased') +model = TFBertForSequenceClassification.from_pretrained('bert-base-cased') +data = tensorflow_datasets.load('glue/mrpc') + +# Prepare dataset for GLUE as a tf.data.Dataset instance +train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc') +valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc') +train_dataset = train_dataset.shuffle(100).batch(32).repeat(2) +valid_dataset = valid_dataset.batch(64) + +# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule +optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) +loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') +model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + +# Train and evaluate using tf.keras.Model.fit() +history = model.fit(train_dataset, epochs=2, steps_per_epoch=115, + validation_data=valid_dataset, validation_steps=7) + +# Load the TensorFlow model in PyTorch for inspection +model.save_pretrained('./save/') +pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) + +# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task +sentence_0 = "This research was consistent with his findings." +sentence_1 = "His findings were compatible with this research." +sentence_2 = "His findings were not compatible with this research." +inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') +inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') + +pred_1 = pytorch_model(**inputs_1)[0].argmax().item() +pred_2 = pytorch_model(**inputs_2)[0].argmax().item() +print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0") +print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0") +``` + ## Quick tour of the fine-tuning/usage scripts The library comprises several example scripts with SOTA performances for NLU and NLG tasks: @@ -299,19 +395,32 @@ python ./examples/run_generation.py \ --model_name_or_path=gpt2 \ ``` -## Migrating from pytorch-pretrained-bert to pytorch-transformers +## Migrating from pytorch-transformers to transformers -Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` +Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`. + +### Positional order of some models' keyword inputs (`attention_mask`, `token_type_ids`...) changed + +To be able to use Torchscript (see #1010, #1204 and #1195), the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed. + +If you used to call the models with keyword names for keyword arguments, e.g. `model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change. + +If you used to call the models with positional inputs for keyword arguments, e.g. `model(input_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments. 
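+
+As a minimal sketch of the two calling styles (the `bert-base-uncased` checkpoint and the example sentence below are only illustrative):
+
+```python
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
+attention_mask = torch.ones_like(input_ids)   # no padding, so every token is attended to
+token_type_ids = torch.zeros_like(input_ids)  # a single segment
+
+# Keyword arguments are matched by name, so this call is unaffected by the reordering:
+outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+# Positional arguments are matched by position, so check a call like this against the new signature:
+outputs = model(input_ids, attention_mask, token_type_ids)
+```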
+ + +## Migrating from pytorch-pretrained-bert to transformers + +Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`. ### Models always output `tuples` -The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. +The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. -The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/). +The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/). In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`. -Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model: +Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model: ```python # Let's load our model @@ -320,11 +429,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased') # If you used to have this line in pytorch-pretrained-bert: loss = model(input_ids, labels=labels) -# Now just use this line in pytorch-transformers to extract the loss from the output tuple: +# Now just use this line in transformers to extract the loss from the output tuple: outputs = model(input_ids, labels=labels) loss = outputs[0] -# In pytorch-transformers you can also have access to the logits: +# In transformers you can also have access to the logits: loss, logits = outputs[:2] # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation) @@ -339,7 +448,7 @@ Breaking change in the `from_pretrained()`method: 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules. -2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes. +2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. 
We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes. Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before. @@ -396,7 +505,7 @@ for batch in train_data: loss.backward() optimizer.step() -### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this: +### In Transformers, optimizer and schedules are split and instantiated like this: optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler ### and used like this: @@ -411,4 +520,4 @@ for batch in train_data: ## Citation -At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project. +At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project. diff --git a/docker/Dockerfile b/docker/Dockerfile index 1a6c6f06f94..fed834ff88e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext -RUN pip install pytorch_transformers +RUN pip install transformers WORKDIR /workspace \ No newline at end of file diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js index 4adf2a4672d..1d5827c0fa8 100644 --- a/docs/source/_static/js/custom.js +++ b/docs/source/_static/js/custom.js @@ -16,7 +16,7 @@ function addIcon() { function addCustomFooter() { const customFooter = document.createElement("div"); const questionOrIssue = document.createElement("div"); - questionOrIssue.innerHTML = "Stuck? Read our Blog posts or Create an issue"; + questionOrIssue.innerHTML = "Stuck? Read our Blog posts or Create an issue"; customFooter.appendChild(questionOrIssue); customFooter.classList.add("footer"); diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst index 0ea5b53d807..c3d1b2f8b83 100644 --- a/docs/source/bertology.rst +++ b/docs/source/bertology.rst @@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu * accessing all the attention weights for each head of BERT/GPT/GPT-2, * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650. -To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE. +To help you understand and use these features, we have added a specific example script: `bertology.py `_ which extracts information and prunes a model pre-trained on GLUE. 
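+
+A minimal sketch of these two features (the ``bert-base-uncased`` checkpoint and the heads pruned below are arbitrary choices for illustration, not what ``bertology.py`` itself does):
+
+.. code-block:: python
+
+    import torch
+    from transformers import BertModel, BertTokenizer
+
+    # Ask the model to also return all attention weights in its output tuple
+    model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
+    outputs = model(input_ids)
+    attentions = outputs[-1]  # one tensor per layer, each of shape (batch, heads, seq_len, seq_len)
+
+    # Prune heads 0 and 2 of layer 0 and head 1 of layer 2
+    model.prune_heads({0: [0, 2], 2: [1]})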
diff --git a/docs/source/conf.py b/docs/source/conf.py index c847dee8066..ae9cc67e2c1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,7 @@ sys.path.insert(0, os.path.abspath('../..')) # -- Project information ----------------------------------------------------- -project = u'pytorch-transformers' +project = u'transformers' copyright = u'2019, huggingface' author = u'huggingface' @@ -109,7 +109,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'pytorch-transformersdoc' +htmlhelp_basename = 'transformersdoc' # -- Options for LaTeX output ------------------------------------------------ @@ -136,7 +136,7 @@ latex_elements = { # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation', + (master_doc, 'transformers.tex', u'transformers Documentation', u'huggingface', 'manual'), ] @@ -146,7 +146,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation', + (master_doc, 'transformers', u'transformers Documentation', [author], 1) ] @@ -157,8 +157,8 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation', - author, 'pytorch-transformers', 'One line description of project.', + (master_doc, 'transformers', u'transformers Documentation', + author, 'transformers', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst index 8441c9b1f71..8d805b0ed11 100644 --- a/docs/source/converting_tensorflow_models.rst +++ b/docs/source/converting_tensorflow_models.rst @@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor BERT ^^^^ -You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google `_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py `_ script. +You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google `_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py `_ script. This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py `_\ , `run_bert_classifier.py `_ and `run_bert_squad.py `_\ ). 
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - pytorch_transformers bert \ + transformers bert \ $BERT_BASE_DIR/bert_model.ckpt \ $BERT_BASE_DIR/bert_config.json \ $BERT_BASE_DIR/pytorch_model.bin @@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model, export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights - pytorch_transformers gpt \ + transformers gpt \ $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \ $PYTORCH_DUMP_OUTPUT \ [OPENAI_GPT_CONFIG] @@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights - pytorch_transformers gpt2 \ + transformers gpt2 \ $OPENAI_GPT2_CHECKPOINT_PATH \ $PYTORCH_DUMP_OUTPUT \ [OPENAI_GPT2_CONFIG] @@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint - pytorch_transformers transfo_xl \ + transformers transfo_xl \ $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \ $PYTORCH_DUMP_OUTPUT \ [TRANSFO_XL_CONFIG] @@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config - pytorch_transformers xlnet \ + transformers xlnet \ $TRANSFO_XL_CHECKPOINT_PATH \ $TRANSFO_XL_CONFIG_PATH \ $PYTORCH_DUMP_OUTPUT \ @@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model: export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint - pytorch_transformers xlm \ + transformers xlm \ $XLM_CHECKPOINT_PATH \ $PYTORCH_DUMP_OUTPUT \ diff --git a/docs/source/imgs/transformers_logo_name.png b/docs/source/imgs/transformers_logo_name.png new file mode 100644 index 00000000000..5e4c2dcf575 Binary files /dev/null and b/docs/source/imgs/transformers_logo_name.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b451707d60..a205b0b3147 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,7 @@ -Pytorch-Transformers +Transformers ================================================================================================================================================ -PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). +Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models: @@ -12,7 +12,7 @@ The library currently contains PyTorch implementations, pre-trained model weight 5. `XLNet `_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau. 7. `RoBERTa `_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -8. 
`DistilBERT `_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. +8. `DistilBERT `_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `_ by Victor Sanh, Lysandre Debut and Thomas Wolf. .. toctree:: :maxdepth: 2 diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 6512a0cef3b..51f7eb520d8 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,7 +1,7 @@ Installation ================================================ -PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0 +Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0 With pip ^^^^^^^^ @@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows: .. code-block:: bash - pip install pytorch-transformers + pip install transformers From source ^^^^^^^^^^^ @@ -19,15 +19,15 @@ To install from source, clone the repository and install with: .. code-block:: bash - git clone https://github.com/huggingface/pytorch-transformers.git - cd pytorch-transformers + git clone https://github.com/huggingface/transformers.git + cd transformers pip install [--editable] . Tests ^^^^^ -An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder `_ and examples tests in the `examples folder `_. +An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder `_ and examples tests in the `examples folder `_. Tests can be run using `pytest` (install pytest if needed with `pip install pytest`). @@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands: .. code-block:: bash - python -m pytest -sv ./pytorch_transformers/tests/ + python -m pytest -sv ./transformers/tests/ python -m pytest -sv ./examples/ diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst index 5181874c1a2..2131433759c 100644 --- a/docs/source/main_classes/configuration.rst +++ b/docs/source/main_classes/configuration.rst @@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa ``PretrainedConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.PretrainedConfig +.. autoclass:: transformers.PretrainedConfig :members: diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst index ba61afadf09..d22467f9072 100644 --- a/docs/source/main_classes/model.rst +++ b/docs/source/main_classes/model.rst @@ -11,5 +11,5 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav ``PreTrainedModel`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.PreTrainedModel +.. autoclass:: transformers.PreTrainedModel :members: diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst index 70fefb7c6d5..ff0c9e6929c 100644 --- a/docs/source/main_classes/optimizer_schedules.rst +++ b/docs/source/main_classes/optimizer_schedules.rst @@ -9,7 +9,7 @@ The ``.optimization`` module provides: ``AdamW`` ~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.AdamW +.. 
autoclass:: transformers.AdamW :members: Schedules @@ -18,11 +18,11 @@ Schedules Learning Rate Schedules ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: pytorch_transformers.ConstantLRSchedule +.. autoclass:: transformers.ConstantLRSchedule :members: -.. autoclass:: pytorch_transformers.WarmupConstantSchedule +.. autoclass:: transformers.WarmupConstantSchedule :members: .. image:: /imgs/warmup_constant_schedule.png @@ -30,7 +30,7 @@ Learning Rate Schedules :alt: -.. autoclass:: pytorch_transformers.WarmupCosineSchedule +.. autoclass:: transformers.WarmupCosineSchedule :members: .. image:: /imgs/warmup_cosine_schedule.png @@ -38,7 +38,7 @@ Learning Rate Schedules :alt: -.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule +.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule :members: .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png @@ -47,7 +47,7 @@ Learning Rate Schedules -.. autoclass:: pytorch_transformers.WarmupLinearSchedule +.. autoclass:: transformers.WarmupLinearSchedule :members: .. image:: /imgs/warmup_linear_schedule.png diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst index 12ca5522dea..c33eb458292 100644 --- a/docs/source/main_classes/tokenizer.rst +++ b/docs/source/main_classes/tokenizer.rst @@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading ``PreTrainedTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.PreTrainedTokenizer +.. autoclass:: transformers.PreTrainedTokenizer :members: diff --git a/docs/source/migration.md b/docs/source/migration.md index 9cfcaade138..553a79c82b0 100644 --- a/docs/source/migration.md +++ b/docs/source/migration.md @@ -1,17 +1,17 @@ # Migrating from pytorch-pretrained-bert -Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` +Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers` ### Models always output `tuples` -The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. +The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. -The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/). +The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/). In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`. 
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model: +Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model: ```python # Let's load our model @@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased') # If you used to have this line in pytorch-pretrained-bert: loss = model(input_ids, labels=labels) -# Now just use this line in pytorch-transformers to extract the loss from the output tuple: +# Now just use this line in transformers to extract the loss from the output tuple: outputs = model(input_ids, labels=labels) loss = outputs[0] -# In pytorch-transformers you can also have access to the logits: +# In transformers you can also have access to the logits: loss, logits = outputs[:2] # And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation) @@ -96,7 +96,7 @@ for batch in train_data: loss.backward() optimizer.step() -### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this: +### In Transformers, optimizer and schedules are splitted and instantiated like this: optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler ### and used like this: diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst index 7b56eabafe6..4b900d8e557 100644 --- a/docs/source/model_doc/auto.rst +++ b/docs/source/model_doc/auto.rst @@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di ``AutoConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.AutoConfig +.. autoclass:: transformers.AutoConfig :members: ``AutoModel`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.AutoModel +.. autoclass:: transformers.AutoModel :members: ``AutoTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.AutoTokenizer +.. autoclass:: transformers.AutoTokenizer :members: diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst index cbce74e73bf..9f2caf3e80d 100644 --- a/docs/source/model_doc/bert.rst +++ b/docs/source/model_doc/bert.rst @@ -4,69 +4,69 @@ BERT ``BertConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertConfig +.. autoclass:: transformers.BertConfig :members: ``BertTokenizer`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertTokenizer +.. autoclass:: transformers.BertTokenizer :members: ``BertModel`` ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertModel +.. autoclass:: transformers.BertModel :members: ``BertForPreTraining`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForPreTraining +.. autoclass:: transformers.BertForPreTraining :members: ``BertForMaskedLM`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForMaskedLM +.. autoclass:: transformers.BertForMaskedLM :members: ``BertForNextSentencePrediction`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForNextSentencePrediction +.. autoclass:: transformers.BertForNextSentencePrediction :members: ``BertForSequenceClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
autoclass:: pytorch_transformers.BertForSequenceClassification +.. autoclass:: transformers.BertForSequenceClassification :members: ``BertForMultipleChoice`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForMultipleChoice +.. autoclass:: transformers.BertForMultipleChoice :members: ``BertForTokenClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForTokenClassification +.. autoclass:: transformers.BertForTokenClassification :members: ``BertForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.BertForQuestionAnswering +.. autoclass:: transformers.BertForQuestionAnswering :members: diff --git a/docs/source/model_doc/distilbert.rst b/docs/source/model_doc/distilbert.rst index 141d3e151ff..de1ac736750 100644 --- a/docs/source/model_doc/distilbert.rst +++ b/docs/source/model_doc/distilbert.rst @@ -4,40 +4,40 @@ DistilBERT ``DistilBertConfig`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertConfig +.. autoclass:: transformers.DistilBertConfig :members: ``DistilBertTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertTokenizer +.. autoclass:: transformers.DistilBertTokenizer :members: ``DistilBertModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertModel +.. autoclass:: transformers.DistilBertModel :members: ``DistilBertForMaskedLM`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertForMaskedLM +.. autoclass:: transformers.DistilBertForMaskedLM :members: ``DistilBertForSequenceClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification +.. autoclass:: transformers.DistilBertForSequenceClassification :members: ``DistilBertForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering +.. autoclass:: transformers.DistilBertForQuestionAnswering :members: diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst index 26762ae011f..39995a98fc3 100644 --- a/docs/source/model_doc/gpt.rst +++ b/docs/source/model_doc/gpt.rst @@ -4,33 +4,33 @@ OpenAI GPT ``OpenAIGPTConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.OpenAIGPTConfig +.. autoclass:: transformers.OpenAIGPTConfig :members: ``OpenAIGPTTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer +.. autoclass:: transformers.OpenAIGPTTokenizer :members: ``OpenAIGPTModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.OpenAIGPTModel +.. autoclass:: transformers.OpenAIGPTModel :members: ``OpenAIGPTLMHeadModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel +.. autoclass:: transformers.OpenAIGPTLMHeadModel :members: ``OpenAIGPTDoubleHeadsModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel +.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel :members: diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst index a49d1b42582..92decb14de8 100644 --- a/docs/source/model_doc/gpt2.rst +++ b/docs/source/model_doc/gpt2.rst @@ -4,33 +4,33 @@ OpenAI GPT2 ``GPT2Config`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.GPT2Config +.. autoclass:: transformers.GPT2Config :members: ``GPT2Tokenizer`` ~~~~~~~~~~~~~~~~~~~~~ -.. 
autoclass:: pytorch_transformers.GPT2Tokenizer +.. autoclass:: transformers.GPT2Tokenizer :members: ``GPT2Model`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.GPT2Model +.. autoclass:: transformers.GPT2Model :members: ``GPT2LMHeadModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.GPT2LMHeadModel +.. autoclass:: transformers.GPT2LMHeadModel :members: ``GPT2DoubleHeadsModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel +.. autoclass:: transformers.GPT2DoubleHeadsModel :members: diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst index e2de917e352..5351e018cd7 100644 --- a/docs/source/model_doc/roberta.rst +++ b/docs/source/model_doc/roberta.rst @@ -4,33 +4,33 @@ RoBERTa ``RobertaConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.RobertaConfig +.. autoclass:: transformers.RobertaConfig :members: ``RobertaTokenizer`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.RobertaTokenizer +.. autoclass:: transformers.RobertaTokenizer :members: ``RobertaModel`` ~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.RobertaModel +.. autoclass:: transformers.RobertaModel :members: ``RobertaForMaskedLM`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.RobertaForMaskedLM +.. autoclass:: transformers.RobertaForMaskedLM :members: ``RobertaForSequenceClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.RobertaForSequenceClassification +.. autoclass:: transformers.RobertaForSequenceClassification :members: diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst index 88cca450ee6..c8a9cc7d993 100644 --- a/docs/source/model_doc/transformerxl.rst +++ b/docs/source/model_doc/transformerxl.rst @@ -5,26 +5,26 @@ Transformer XL ``TransfoXLConfig`` ~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.TransfoXLConfig +.. autoclass:: transformers.TransfoXLConfig :members: ``TransfoXLTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.TransfoXLTokenizer +.. autoclass:: transformers.TransfoXLTokenizer :members: ``TransfoXLModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.TransfoXLModel +.. autoclass:: transformers.TransfoXLModel :members: ``TransfoXLLMHeadModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel +.. autoclass:: transformers.TransfoXLLMHeadModel :members: diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst index 217952ea5e2..344371ad52f 100644 --- a/docs/source/model_doc/xlm.rst +++ b/docs/source/model_doc/xlm.rst @@ -4,38 +4,38 @@ XLM ``XLMConfig`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMConfig +.. autoclass:: transformers.XLMConfig :members: ``XLMTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMTokenizer +.. autoclass:: transformers.XLMTokenizer :members: ``XLMModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMModel +.. autoclass:: transformers.XLMModel :members: ``XLMWithLMHeadModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMWithLMHeadModel +.. autoclass:: transformers.XLMWithLMHeadModel :members: ``XLMForSequenceClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMForSequenceClassification +.. 
autoclass:: transformers.XLMForSequenceClassification :members: ``XLMForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLMForQuestionAnswering +.. autoclass:: transformers.XLMForQuestionAnswering :members: diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst index e388934c568..a4715062783 100644 --- a/docs/source/model_doc/xlnet.rst +++ b/docs/source/model_doc/xlnet.rst @@ -4,40 +4,40 @@ XLNet ``XLNetConfig`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetConfig +.. autoclass:: transformers.XLNetConfig :members: ``XLNetTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetTokenizer +.. autoclass:: transformers.XLNetTokenizer :members: ``XLNetModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetModel +.. autoclass:: transformers.XLNetModel :members: ``XLNetLMHeadModel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetLMHeadModel +.. autoclass:: transformers.XLNetLMHeadModel :members: ``XLNetForSequenceClassification`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetForSequenceClassification +.. autoclass:: transformers.XLNetForSequenceClassification :members: ``XLNetForQuestionAnswering`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering +.. autoclass:: transformers.XLNetForQuestionAnswering :members: diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst index 7e214fa00a5..fe669e8e47f 100644 --- a/docs/source/notebooks.rst +++ b/docs/source/notebooks.rst @@ -1,16 +1,16 @@ Notebooks ================================================ -We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. +We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. * - The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. + The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. * - The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. + The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. 
In the given example, we get a standard deviation of 2.5e-7 between the models. * - The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. + The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. Please follow the instructions given in the notebooks to run and modify them. diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index d6e273797f4..0e55767d761 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. | | | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD | -| | | (see details of fine-tuning in the `example section `__). | +| | | (see details of fine-tuning in the `example section `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters | | | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD | -| | | (see `details of fine-tuning in the example section `__) | +| | | (see `details of fine-tuning in the example section `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | The ``bert-base-cased`` model fine-tuned on MRPC | -| | | (see `details of fine-tuning in the example section `__) | +| | | (see `details of fine-tuning in the example section `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | OpenAI GPT English model | @@ -120,4 +120,4 @@ Here is the full list of the currently provided pretrained models together with | | | (see `details `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -.. `__ \ No newline at end of file +.. 
`__ \ No newline at end of file diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index f037a95a3a9..c7ad5c79304 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -2,7 +2,7 @@ ## Philosophy -PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models. +Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models. The library was designed with two strong goals in mind: @@ -39,7 +39,7 @@ The library is build around three type of classes for each models: All these classes can be instantiated from pretrained instances and saved locally using two methods: -- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user, +- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user, - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`. We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts: @@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t ```python import torch -from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM +from transformers import BertTokenizer, BertModel, BertForMaskedLM # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows import logging @@ -106,7 +106,7 @@ model.to('cuda') with torch.no_grad(): # See the models docstrings for the detail of the inputs outputs = model(tokens_tensor, token_type_ids=segments_tensors) - # PyTorch-Transformers models always output tuples. + # Transformers models always output tuples. 
# See the models docstrings for the detail of all the outputs # In our case, the first element is the hidden state of the last layer of the Bert model encoded_layers = outputs[0] @@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer` ```python import torch -from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel +from transformers import GPT2Tokenizer, GPT2LMHeadModel # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows import logging diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst index 7117d7ffa68..0b0b600ec1e 100644 --- a/docs/source/serialization.rst +++ b/docs/source/serialization.rst @@ -45,7 +45,7 @@ where * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ ) - If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here `__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ). + If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here `__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ). * ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information). @@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to .. code-block:: python - from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME + from transformers import WEIGHTS_NAME, CONFIG_NAME output_dir = "./models/" diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst index 5811572c073..fd1eeb53635 100644 --- a/docs/source/torchscript.rst +++ b/docs/source/torchscript.rst @@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab Pytorch's two modules `JIT and TRACE `_ allow the developer to export their model to be re-used in other programs, such as efficiency-oriented C++ programs. -We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can +We have provided an interface that allows the export of `transformers` models to TorchScript so that they can be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that they can be exported, and what to be mindful of when using these models with TorchScript. @@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename `` .. 
code-block:: python - from pytorch_transformers import BertModel, BertTokenizer, BertConfig + from transformers import BertModel, BertTokenizer, BertConfig import torch enc = BertTokenizer.from_pretrained("bert-base-uncased") diff --git a/examples/README.md b/examples/README.md index a41c117078a..fb5de20a2a1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,7 @@ similar API between the different models. ## Language model fine-tuning -Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py). +Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py). Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa @@ -75,7 +75,7 @@ python run_lm_finetuning.py \ ## Language generation -Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py). +Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py). Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you @@ -91,7 +91,7 @@ python run_generation.py \ ## GLUE -Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py). +Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py). Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. @@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606 ## SQuAD -Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py). +Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py). 
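All of these example scripts share the same basic fine-tuning loop: build the model and tokenizer with `from_pretrained()`, optimize with `AdamW` under a `WarmupLinearSchedule`, and read the loss as the first element of the output tuple. A condensed sketch of that pattern follows (assuming transformers v2.0.0; the two-sentence dataset, padding length and hyper-parameters are placeholders, and attention masks are omitted for brevity even though the real scripts build them from the padding):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, WarmupLinearSchedule, BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Placeholder data: two short sentences with dummy labels, padded to a fixed length of 16.
encodings = [tokenizer.encode(s, add_special_tokens=True, max_length=16)
             for s in ["This is great.", "This is terrible."]]
encodings = [ids + [0] * (16 - len(ids)) for ids in encodings]
loader = DataLoader(TensorDataset(torch.tensor(encodings), torch.tensor([1, 0])), batch_size=2)

num_epochs = 3
num_training_steps = len(loader) * num_epochs
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=num_training_steps)

model.train()
for _ in range(num_epochs):
    for batch_input_ids, batch_labels in loader:
        outputs = model(batch_input_ids, labels=batch_labels)
        loss = outputs[0]                 # model outputs are always tuples; the loss comes first
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()                  # step the schedule after the optimizer, as in run_glue.py
        model.zero_grad()
```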
#### Fine-tuning on SQuAD diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index 1c9fba8ee83..661c1c305b7 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -39,7 +39,7 @@ import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) -from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, +from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME, WarmupLinearSchedule) diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 495f40cec96..58aec25877e 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -35,10 +35,10 @@ from tqdm import tqdm, trange from tensorboardX import SummaryWriter -from pytorch_transformers import (WEIGHTS_NAME, BertConfig, +from transformers import (WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer) -from pytorch_transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, WarmupLinearSchedule logger = logging.getLogger(__name__) @@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer): # inputs.update({'cls_index': batch[5], # 'p_mask': batch[6]}) outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training @@ -647,7 +647,7 @@ def main(): if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py index 4c99777b982..f5375269b88 100644 --- a/examples/contrib/run_transfo_xl.py +++ b/examples/contrib/run_transfo_xl.py @@ -28,7 +28,7 @@ import math import torch -from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer +from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', diff --git a/examples/distillation/README.md b/examples/distillation/README.md index 12d9165536f..4cddbd3a2ec 100644 --- a/examples/distillation/README.md +++ b/examples/distillation/README.md @@ -22,11 +22,11 @@ Here's the updated results on the dev sets of GLUE: This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. -**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. 
For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details. +**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details. ## How to use DistilBERT -PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): +Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT): - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters. - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score). diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index c22ee3b3978..79755b81e0c 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -29,7 +29,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.optim import AdamW -from pytorch_transformers import WarmupLinearSchedule +from transformers import WarmupLinearSchedule from utils import logger from dataset import Dataset diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index de9e39fff3b..eb4af08b0f4 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -20,7 +20,7 @@ import pickle import random import time import numpy as np -from pytorch_transformers import BertTokenizer, RobertaTokenizer +from transformers import BertTokenizer, RobertaTokenizer import logging logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', diff --git a/examples/distillation/scripts/extract_for_distil.py b/examples/distillation/scripts/extract_for_distil.py index 43554d1c9f2..2e7e5c73d8a 100644 --- a/examples/distillation/scripts/extract_for_distil.py +++ b/examples/distillation/scripts/extract_for_distil.py @@ -15,7 +15,7 @@ """ Preprocessing script before training DistilBERT. 
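The two DistilBERT checkpoints described above load like any other model in the library. A small illustrative sketch (assuming transformers v2.0.0), running extractive question answering with the SQuAD-distilled checkpoint on a toy passage:

```python
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
model.eval()

question = "Who was Jim Henson?"
context = "Jim Henson was a puppeteer who created the Muppets."
input_ids = tokenizer.encode(question, context, add_special_tokens=True)

with torch.no_grad():
    # DistilBERT does not use token_type_ids; the QA head returns start and end logits.
    start_logits, end_logits = model(torch.tensor([input_ids]))[:2]

start = int(torch.argmax(start_logits))
end = int(torch.argmax(end_logits)) + 1
print(" ".join(tokenizer.convert_ids_to_tokens(input_ids[start:end])))   # WordPiece tokens of the predicted span
```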
""" -from pytorch_transformers import BertForMaskedLM, RobertaForMaskedLM +from transformers import BertForMaskedLM, RobertaForMaskedLM import torch import argparse diff --git a/examples/distillation/train.py b/examples/distillation/train.py index 5cbb7e2dcde..f0255d08fe5 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -23,8 +23,8 @@ import shutil import numpy as np import torch -from pytorch_transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM -from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig +from transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM +from transformers import DistilBertForMaskedLM, DistilBertConfig from distiller import Distiller from utils import git_log, logger, init_gpu_params, set_seed diff --git a/examples/run_bertology.py b/examples/run_bertology.py index f11b73b54f0..f37358359dd 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse from torch.utils.data.distributed import DistributedSampler from torch.nn import CrossEntropyLoss, MSELoss -from pytorch_transformers import (WEIGHTS_NAME, +from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer, XLMConfig, XLMForSequenceClassification, XLMTokenizer, XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer) diff --git a/examples/run_generation.py b/examples/run_generation.py index a2a8f291031..9e98a9e8704 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -26,12 +26,12 @@ import torch import torch.nn.functional as F import numpy as np -from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig +from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig -from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer -from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer -from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer -from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer +from transformers import GPT2LMHeadModel, GPT2Tokenizer +from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer +from transformers import XLNetLMHeadModel, XLNetTokenizer +from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', diff --git a/examples/run_glue.py b/examples/run_glue.py index e20f6d84c46..71dad0edbfb 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler from tensorboardX import SummaryWriter from tqdm import tqdm, trange -from pytorch_transformers import (WEIGHTS_NAME, BertConfig, +from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer, RobertaConfig, RobertaForSequenceClassification, @@ -39,12 +39,17 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig, XLMConfig, XLMForSequenceClassification, XLMTokenizer, XLNetConfig, XLNetForSequenceClassification, - XLNetTokenizer) + XLNetTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer) -from pytorch_transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, WarmupLinearSchedule -from utils_glue import (compute_metrics, convert_examples_to_features, - output_modes, 
processors) +from transformers import glue_compute_metrics as compute_metrics +from transformers import glue_output_modes as output_modes +from transformers import glue_processors as processors +from transformers import glue_convert_examples_to_features as convert_examples_to_features logger = logging.getLogger(__name__) @@ -55,6 +60,7 @@ MODEL_CLASSES = { 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) } @@ -128,10 +134,10 @@ def train(args, train_dataset, model, tokenizer): batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids + 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM, DistilBERT and RoBERTa don't use segment_ids 'labels': batch[3]} outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training @@ -148,8 +154,8 @@ def train(args, train_dataset, model, tokenizer): tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: - scheduler.step() # Update learning rate schedule optimizer.step() + scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 @@ -218,7 +224,7 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids + 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM, DistilBERT and RoBERTa don't use segment_ids 'labels': batch[3]} outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -272,15 +278,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode, - cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, + features = convert_examples_to_features(examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -291,14 +296,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset @@ -478,7 +483,7 @@ def main(): checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 4d14fe7ebb5..7ccf4c3cb7f 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -35,11 +35,12 @@ from torch.utils.data.distributed import DistributedSampler from tensorboardX import SummaryWriter from tqdm import tqdm, trange -from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule, +from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule, BertConfig, BertForMaskedLM, BertTokenizer, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, - RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) + RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, + DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer) logger = logging.getLogger(__name__) @@ -49,7 +50,8 @@ MODEL_CLASSES = { 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), 'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'roberta': 
(RobertaConfig, RobertaForMaskedLM, RobertaTokenizer) + 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer) } @@ -73,7 +75,7 @@ class TextDataset(Dataset): tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) while len(tokenized_text) >= block_size: # Truncate in block of block_size - self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size])) + self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size])) tokenized_text = tokenized_text[block_size:] # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you @@ -186,7 +188,7 @@ def train(args, train_dataset, model, tokenizer): labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training @@ -380,7 +382,7 @@ def main(): parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() - if args.model_type in ["bert", "roberta"] and not args.mlm: + if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm: raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " "flag (masked language modeling).") if args.eval_data_file is None and args.do_eval: @@ -479,7 +481,7 @@ def main(): checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 05f9a48f502..54f3a8a9040 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler from tensorboardX import SummaryWriter from tqdm import tqdm, trange -from pytorch_transformers import (WEIGHTS_NAME, BertConfig, +from transformers import (WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer, XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer, RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) -from pytorch_transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, WarmupLinearSchedule from utils_multiple_choice import (convert_examples_to_features, processors) @@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer): 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids 'labels': batch[3]} outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu 
> 1: loss = loss.mean() # mean() to average on multi-gpu parallel training @@ -508,7 +508,7 @@ def main(): checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" @@ -524,7 +524,7 @@ def main(): checkpoints = [args.output_dir] # if args.eval_all_checkpoints: # can not use this to do test!! # checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - # logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging + # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" diff --git a/examples/run_squad.py b/examples/run_squad.py index cc4eda306cc..0c0fbf29636 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -32,14 +32,15 @@ from tqdm import tqdm, trange from tensorboardX import SummaryWriter -from pytorch_transformers import (WEIGHTS_NAME, BertConfig, +from transformers import (WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, - XLNetTokenizer) + XLNetTokenizer, + DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from pytorch_transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, WarmupLinearSchedule from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -59,6 +60,7 @@ MODEL_CLASSES = { 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) } def set_seed(args): @@ -140,7 +142,7 @@ def train(args, train_dataset, model, tokenizer): inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) outputs = model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training @@ -508,7 +510,7 @@ def main(): checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py new file mode 100644 index 00000000000..3a867f80a8e --- /dev/null +++ b/examples/run_tf_glue.py @@ -0,0 +1,48 @@ +import tensorflow as tf +import 
tensorflow_datasets +from transformers import * + +# Load dataset, tokenizer, model from pretrained model/vocabulary +tokenizer = BertTokenizer.from_pretrained('bert-base-cased') +model = TFBertForSequenceClassification.from_pretrained('bert-base-cased') +data = tensorflow_datasets.load('glue/mrpc') + +# Prepare dataset for GLUE as a tf.data.Dataset instance +train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc') +valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc') +train_dataset = train_dataset.shuffle(100).batch(32).repeat(2) +valid_dataset = valid_dataset.batch(64) + +# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule +optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) +loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') +model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + +# Train and evaluate using tf.keras.Model.fit() +history = model.fit(train_dataset, epochs=2, steps_per_epoch=115, + validation_data=valid_dataset, validation_steps=7) + +>>> Train for 115 steps, validate for 7 steps +>>> Epoch 1/2 +>>> 115/115 [==============================] - 53s 459ms/step - loss: 0.6033 - accuracy: 0.6712 - val_loss: 0.4964 - val_accuracy: 0.7647 +>>> Epoch 2/2 +>>> 115/115 [==============================] - 33s 289ms/step - loss: 0.4141 - accuracy: 0.8160 - val_loss: 0.3914 - val_accuracy: 0.8382 + +# Load the TensorFlow model in PyTorch for inspection +model.save_pretrained('./save/') +pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) + +# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task +sentence_0 = "This research was consistent with his findings." +sentence_1 = "His findings were compatible with this research." +sentence_2 = "His findings were not compatible with this research." 
+inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') +inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') + +pred_1 = pytorch_model(**inputs_1)[0].argmax().item() +pred_2 = pytorch_model(**inputs_2)[0].argmax().item() +print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0") +print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0") +>>> sentence_1 is a paraphrase of sentence_0 +>>> sentence_2 is not a paraphrase of sentence_0 \ No newline at end of file diff --git a/examples/utils_squad.py b/examples/utils_squad.py index 34a0c9cc02b..b990ecc8420 100644 --- a/examples/utils_squad.py +++ b/examples/utils_squad.py @@ -24,7 +24,7 @@ import math import collections from io import open -from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores diff --git a/hubconf.py b/hubconf.py index d9aaa6b53aa..3fa354ed5ad 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,7 +1,7 @@ -from pytorch_transformers import ( +from transformers import ( AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering ) -from pytorch_transformers.file_utils import add_start_docstrings +from transformers.file_utils import add_start_docstrings dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] @@ -11,12 +11,12 @@ def config(*args, **kwargs): # Using torch.hub ! import torch - config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. - config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json') - config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) + config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache. + config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json') + config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True - config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) + config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} @@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs): # Using torch.hub ! 
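The hub entry points documented in these docstrings can also be combined. A small sketch (assuming the `huggingface/transformers` hub repository is reachable and a working PyTorch install) that loads both a tokenizer and a model and runs a forward pass:

```python
import torch

# Both entry points download weights/vocabulary from S3 on first use and cache them locally.
tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')
model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')
model.eval()

input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
with torch.no_grad():
    last_hidden_state = model(input_ids)[0]   # models return tuples; hidden states come first
print(last_hidden_state.shape)                # torch.Size([1, sequence_length, 768])
```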
import torch - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. - tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache. + tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` """ @@ -45,13 +45,13 @@ def model(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ @@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModelWithLMHead.from_pretrained(*args, **kwargs) @@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ @@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs): # Using torch.hub ! import torch - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading + model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache. + model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py deleted file mode 100644 index f12a5ebea7c..00000000000 --- a/pytorch_transformers/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ -__version__ = "1.2.0" -# Work around to update TensorFlow's absl.logging threshold which alters the -# default Python logging output behavior when present. -# see: https://github.com/abseil/abseil-py/issues/99 -# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 -try: - import absl.logging - absl.logging.set_verbosity('info') - absl.logging.set_stderrthreshold('info') - absl.logging._warn_preinit_stderr = False -except: - pass - -# Tokenizer -from .tokenization_utils import (PreTrainedTokenizer) -from .tokenization_auto import AutoTokenizer -from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer -from .tokenization_openai import OpenAIGPTTokenizer -from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) -from .tokenization_gpt2 import GPT2Tokenizer -from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE -from .tokenization_xlm import XLMTokenizer -from .tokenization_roberta import RobertaTokenizer -from .tokenization_distilbert import DistilBertTokenizer - -# Configurations -from .configuration_utils import PretrainedConfig -from .configuration_auto import AutoConfig -from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - -# Modeling -from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) -from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, - AutoModelWithLMHead) - -from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, - BertForMaskedLM, BertForNextSentencePrediction, - BertForSequenceClassification, BertForMultipleChoice, - BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - 
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel, - load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice, - load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlm import (XLMPreTrainedModel , XLMModel, - XLMWithLMHeadModel, XLMForSequenceClassification, - XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, - RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, - DistilBertForSequenceClassification, DistilBertForQuestionAnswering, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - -# Optimization -from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, - WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) - -# Files and general utilities -from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, - cached_path, add_start_docstrings, add_end_docstrings, - WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) diff --git a/requirements.txt b/requirements.txt index 01dca79d23b..9c43abc6d76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ -# PyTorch -torch>=1.0.0 # progress bars in model download and training scripts tqdm # Accessing files from S3 directly. diff --git a/setup.py b/setup.py index c31bf6eaeb3..34cb89560e3 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ To create the package for pypi. (pypi suggest using twine as other methods upload files via plaintext.) Check that you can install it in a virtualenv by running: - pip install -i https://testpypi.python.org/pypi pytorch-transformers + pip install -i https://testpypi.python.org/pypi transformers 6. 
Upload the final version to actual pypi: twine upload dist/* -r pypi @@ -37,8 +37,8 @@ from io import open from setuptools import find_packages, setup setup( - name="pytorch_transformers", - version="1.2.0", + name="transformers", + version="2.0.0", author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors", author_email="thomas@huggingface.co", description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM", @@ -46,11 +46,10 @@ setup( long_description_content_type="text/markdown", keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU', license='Apache', - url="https://github.com/huggingface/pytorch-transformers", + url="https://github.com/huggingface/transformers", packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), - install_requires=['torch>=1.0.0', - 'numpy', + install_requires=['numpy', 'boto3', 'requests', 'tqdm', @@ -59,7 +58,7 @@ setup( 'sacremoses'], entry_points={ 'console_scripts': [ - "pytorch_transformers=pytorch_transformers.__main__:main", + "transformers=transformers.__main__:main", ] }, # python_requires='>=3.5.0', diff --git a/transformers/__init__.py b/transformers/__init__.py new file mode 100644 index 00000000000..39370cb327a --- /dev/null +++ b/transformers/__init__.py @@ -0,0 +1,164 @@ +__version__ = "2.0.0" + +# Work around to update TensorFlow's absl.logging threshold which alters the +# default Python logging output behavior when present. +# see: https://github.com/abseil/abseil-py/issues/99 +# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 +try: + import absl.logging + absl.logging.set_verbosity('info') + absl.logging.set_stderrthreshold('info') + absl.logging._warn_preinit_stderr = False +except: + pass + +import logging + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +# Files and general utilities +from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, add_start_docstrings, add_end_docstrings, + WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, + is_tf_available, is_torch_available) + +from .data import (is_sklearn_available, + InputExample, InputFeatures, DataProcessor, + glue_output_modes, glue_convert_examples_to_features, + glue_processors, glue_tasks_num_labels) + +if is_sklearn_available(): + from .data import glue_compute_metrics + +# Tokenizers +from .tokenization_utils import (PreTrainedTokenizer) +from .tokenization_auto import AutoTokenizer +from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE +from .tokenization_xlm import XLMTokenizer +from .tokenization_roberta import RobertaTokenizer +from .tokenization_distilbert import DistilBertTokenizer + +# Configurations +from .configuration_utils import PretrainedConfig +from .configuration_auto import AutoConfig +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2Config, 
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + +# Modeling +if is_torch_available(): + from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) + from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, + AutoModelWithLMHead) + + from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering, + load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, + load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, + GPT2LMHeadModel, GPT2DoubleHeadsModel, + load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, + XLNetForSequenceClassification, XLNetForQuestionAnsweringSimple, + XLNetForQuestionAnswering, + load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_xlm import (XLMPreTrainedModel , XLMModel, + XLMWithLMHeadModel, XLMForSequenceClassification, + XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, + DistilBertForSequenceClassification, DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + + # Optimization + from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, + WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) + + +# TensorFlow +if is_tf_available(): + from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary + from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, + TFAutoModelWithLMHead) + + from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, + TFBertModel, TFBertForPreTraining, + TFBertForMaskedLM, TFBertForNextSentencePrediction, + TFBertForSequenceClassification, TFBertForMultipleChoice, + TFBertForTokenClassification, TFBertForQuestionAnswering, + load_bert_pt_weights_in_tf2, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer, + TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel, + load_gpt2_pt_weights_in_tf2, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer, + TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, + load_openai_gpt_pt_weights_in_tf2, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + + from 
.modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, + TFTransfoXLModel, TFTransfoXLLMHeadModel, + load_transfo_xl_pt_weights_in_tf2, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer, + TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + load_xlnet_pt_weights_in_tf2, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer, + TFXLMModel, TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + load_xlm_pt_weights_in_tf2, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer, + TFRobertaModel, TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + load_roberta_pt_weights_in_tf2, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) + + from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer, + TFDistilBertModel, TFDistilBertForMaskedLM, + TFDistilBertForSequenceClassification, + TFDistilBertForQuestionAnswering, + load_distilbert_pt_weights_in_tf2, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + +# TF 2.0 <=> PyTorch conversion utilities +if is_tf_available() and is_torch_available(): + from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name, + load_pytorch_checkpoint_in_tf2_model, + load_pytorch_weights_in_tf2_model, + load_pytorch_model_in_tf2_model, + load_tf2_checkpoint_in_pytorch_model, + load_tf2_weights_in_pytorch_model, + load_tf2_model_in_pytorch_model) + +if not is_tf_available() and not is_torch_available(): + logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." 
+ "Models won't be available and only tokenizers, configuration" + "and file/data utilities can be used.") diff --git a/pytorch_transformers/__main__.py b/transformers/__main__.py similarity index 63% rename from pytorch_transformers/__main__.py rename to transformers/__main__.py index b047fa74473..31dbd24908b 100644 --- a/pytorch_transformers/__main__.py +++ b/transformers/__main__.py @@ -3,36 +3,37 @@ def main(): import sys if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: print( - "Should be used as one of: \n" - ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" - ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" - ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" - ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" - ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" - ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") + "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" + "It should be used as one of: \n" + ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" + ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" + ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" + ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" + ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" + ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") else: if sys.argv[1] == "bert": try: - from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch except ImportError: - print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. 
Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) != 5: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") else: PYTORCH_DUMP_OUTPUT = sys.argv.pop() TF_CONFIG = sys.argv.pop() TF_CHECKPOINT = sys.argv.pop() convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "gpt": - from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch + from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") + print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") else: OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] @@ -45,15 +46,15 @@ def main(): PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "transfo_xl": try: - from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch except ImportError: - print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") + print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: if 'ckpt' in sys.argv[2].lower(): TF_CHECKPOINT = sys.argv[2] @@ -69,16 +70,16 @@ def main(): convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) elif sys.argv[1] == "gpt2": try: - from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch except ImportError: - print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. 
Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 4 or len(sys.argv) > 5: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") + print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") else: TF_CHECKPOINT = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] @@ -89,16 +90,16 @@ def main(): convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) elif sys.argv[1] == "xlnet": try: - from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch + from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch except ImportError: - print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions.") raise if len(sys.argv) < 5 or len(sys.argv) > 6: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") + print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") else: TF_CHECKPOINT = sys.argv[2] TF_CONFIG = sys.argv[3] @@ -113,11 +114,11 @@ def main(): PYTORCH_DUMP_OUTPUT, FINETUNING_TASK) elif sys.argv[1] == "xlm": - from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch + from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch if len(sys.argv) != 4: # pylint: disable=line-too-long - print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") + print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") else: XLM_CHECKPOINT_PATH = sys.argv[2] PYTORCH_DUMP_OUTPUT = sys.argv[3] diff --git a/pytorch_transformers/configuration_auto.py b/transformers/configuration_auto.py similarity index 97% rename from pytorch_transformers/configuration_auto.py rename to transformers/configuration_auto.py index 9e35f85dc74..74dda59fcfa 100644 --- a/pytorch_transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) class AutoConfig(object): - r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class + r""":class:`~transformers.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` class method. @@ -76,7 +76,7 @@ class AutoConfig(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 
cache_dir: (`optional`) string: diff --git a/pytorch_transformers/configuration_bert.py b/transformers/configuration_bert.py similarity index 96% rename from pytorch_transformers/configuration_bert.py rename to transformers/configuration_bert.py index 7fff3e5d058..122a2c9aab3 100644 --- a/pytorch_transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class BertConfig(PretrainedConfig): r""" - :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a + :class:`~transformers.BertConfig` is the configuration class to store the configuration of a `BertModel`. @@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig): intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. hidden_dropout_prob: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention diff --git a/pytorch_transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py similarity index 98% rename from pytorch_transformers/configuration_distilbert.py rename to transformers/configuration_distilbert.py index b8929eedec7..2a8a149acfc 100644 --- a/pytorch_transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig): def __init__(self, vocab_size_or_config_json_file=30522, max_position_embeddings=512, - sinusoidal_pos_embds=True, + sinusoidal_pos_embds=False, n_layers=6, n_heads=12, dim=768, diff --git a/pytorch_transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py similarity index 100% rename from pytorch_transformers/configuration_gpt2.py rename to transformers/configuration_gpt2.py diff --git a/pytorch_transformers/configuration_openai.py b/transformers/configuration_openai.py similarity index 100% rename from pytorch_transformers/configuration_openai.py rename to transformers/configuration_openai.py diff --git a/pytorch_transformers/configuration_roberta.py b/transformers/configuration_roberta.py similarity index 100% rename from pytorch_transformers/configuration_roberta.py rename to transformers/configuration_roberta.py diff --git a/pytorch_transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py similarity index 80% rename from pytorch_transformers/configuration_transfo_xl.py rename to transformers/configuration_transfo_xl.py index 2e966ee55cf..d55a6adbe69 100644 --- a/pytorch_transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -95,10 +95,43 @@ class TransfoXLConfig(PretrainedConfig): init_range=0.01, proj_init_std=0.01, init_std=0.02, + layer_norm_epsilon=1e-5, **kwargs): """Constructs TransfoXLConfig. 
""" super(TransfoXLConfig, self).__init__(**kwargs) + self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + self.layer_norm_epsilon = layer_norm_epsilon if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)): @@ -106,39 +139,7 @@ class TransfoXLConfig(PretrainedConfig): json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.cutoffs = [] - self.cutoffs.extend(cutoffs) - self.tie_weight = tie_weight - if proj_share_all_but_first: - self.tie_projs = [False] + [True] * len(self.cutoffs) - else: - self.tie_projs = [False] + [False] * len(self.cutoffs) - self.d_model = d_model - self.d_embed = d_embed - self.d_head = d_head - self.d_inner = d_inner - self.div_val = div_val - self.pre_lnorm = pre_lnorm - self.n_layer = n_layer - self.n_head = n_head - self.tgt_len = tgt_len - self.ext_len = ext_len - self.mem_len = mem_len - self.same_length = same_length - self.attn_type = attn_type - self.clamp_len = clamp_len - self.sample_softmax = sample_softmax - self.adaptive = adaptive - self.dropout = dropout - self.dropatt = dropatt - self.untie_r = untie_r - self.init = init - self.init_range = init_range - self.proj_init_std = proj_init_std - self.init_std = init_std - else: + elif not isinstance(vocab_size_or_config_json_file, int): raise ValueError("First argument must be either a vocabulary size (int)" " or the path to a pretrained model config file (str)") diff --git a/pytorch_transformers/configuration_utils.py b/transformers/configuration_utils.py similarity index 94% rename from pytorch_transformers/configuration_utils.py rename to transformers/configuration_utils.py index 7efc735d413..8a23be4ff60 100644 --- a/pytorch_transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -54,11 +54,12 @@ class PretrainedConfig(object): self.output_attentions = kwargs.pop('output_attentions', False) self.output_hidden_states = kwargs.pop('output_hidden_states', False) self.torchscript = kwargs.pop('torchscript', False) + self.use_bfloat16 = kwargs.pop('use_bfloat16', False) self.pruned_heads = kwargs.pop('pruned_heads', {}) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. + can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. 
""" assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" @@ -66,16 +67,17 @@ class PretrainedConfig(object): output_config_file = os.path.join(save_directory, CONFIG_NAME) self.to_json_file(output_config_file) + logger.info("Configuration saved in {}".format(output_config_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. + r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir: (`optional`) string: @@ -174,7 +176,7 @@ class PretrainedConfig(object): """Constructs a `Config` from a Python dictionary of parameters.""" config = cls(vocab_size_or_config_json_file=-1) for key, value in json_object.items(): - config.__dict__[key] = value + setattr(config, key, value) return config @classmethod diff --git a/pytorch_transformers/configuration_xlm.py b/transformers/configuration_xlm.py similarity index 98% rename from pytorch_transformers/configuration_xlm.py rename to transformers/configuration_xlm.py index ab251c8939e..fa3a5f40f69 100644 --- a/pytorch_transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig): dropout: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. max_position_embeddings: The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). @@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig): layer_norm_eps: The epsilon used by LayerNorm. dropout: float, dropout rate. - dropatt: float, dropout rate on attention probabilities. init: str, the initialization scheme, either "normal" or "uniform". init_range: float, initialize the parameters with a uniform distribution in [-init_range, init_range]. Only effective when init="uniform". diff --git a/pytorch_transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py similarity index 97% rename from pytorch_transformers/configuration_xlnet.py rename to transformers/configuration_xlnet.py index 204d44aa728..0dbf5188499 100644 --- a/pytorch_transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig): dropout: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - dropatt: The dropout ratio for the attention - probabilities. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps: The epsilon used by LayerNorm. dropout: float, dropout rate. 
- dropatt: float, dropout rate on attention probabilities. init: str, the initialization scheme, either "normal" or "uniform". init_range: float, initialize the parameters with a uniform distribution in [-init_range, init_range]. Only effective when init="uniform". @@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig): n_layer=24, n_head=16, d_inner=4096, + max_position_embeddings=512, ff_activation="gelu", untie_r=True, attn_type="bi", @@ -112,7 +110,7 @@ class XLNetConfig(PretrainedConfig): with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): - self.__dict__[key] = value + setattr(config, key, value) elif isinstance(vocab_size_or_config_json_file, int): self.n_token = vocab_size_or_config_json_file self.d_model = d_model diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py similarity index 96% rename from pytorch_transformers/convert_tf_checkpoint_to_pytorch.py rename to transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index d382d3588e2..75808811efe 100755 --- a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from __future__ import print_function import argparse import torch -from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert +from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert import logging logging.basicConfig(level=logging.INFO) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py similarity index 99% rename from pytorch_transformers/convert_pytorch_checkpoint_to_tf.py rename to transformers/convert_bert_pytorch_checkpoint_to_original_tf.py index 15fd6bf5acf..35866caac48 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -20,7 +20,7 @@ import argparse import torch import numpy as np import tensorflow as tf -from pytorch_transformers import BertModel +from transformers import BertModel def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py similarity index 98% rename from pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py rename to transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index eb5b3009b4c..e2328c08ca7 100755 --- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from io import open import torch -from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, +from transformers import (CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2) diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py similarity index 98% rename from pytorch_transformers/convert_openai_checkpoint_to_pytorch.py rename to transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index 5eecdd9648c..13ebecf2fd0 100755 --- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -21,7 +21,7 @@ from io import open 
import torch -from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, +from transformers import (CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py new file mode 100644 index 00000000000..c5f7650b507 --- /dev/null +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -0,0 +1,233 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Convert pytorch checkpoints to TensorFlow """ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import tensorflow as tf + +from transformers import is_torch_available, cached_path + +from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP) + +if is_torch_available(): + import torch + import numpy as np + from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, RobertaForSequenceClassification, 
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = ( + None, None, None, None, + None, None, + None, None, + None, None, + None, None, + None, None, + None, None, None, + None, None, None,) + + +import logging +logging.basicConfig(level=logging.INFO) + +MODEL_CLASSES = { + 'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'roberta': (RobertaConfig, TFRobertaForMaskedLM, load_roberta_pt_weights_in_tf2, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, load_distilbert_pt_weights_in_tf2, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), +} + +def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): + if model_type not in MODEL_CLASSES: + raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys()))) + + config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] + + # Initialise TF model + if config_file in aws_config_map: + config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models) + config = config_class.from_json_file(config_file) 
+ config.output_hidden_states = True + config.output_attentions = True + print("Building TensorFlow model from configuration: {}".format(str(config))) + tf_model = model_class(config) + + # Load weights from tf checkpoint + if pytorch_checkpoint_path in aws_model_maps: + pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models) + tf_model = loading_fct(tf_model, pytorch_checkpoint_path) + + if compare_with_pt_model: + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) # build the network + + pt_model = pt_model_class.from_pretrained(None, + config=config, + state_dict=torch.load(pytorch_checkpoint_path, + map_location='cpu')) + pt_inputs = torch.tensor(inputs_list) + with torch.no_grad(): + pto = pt_model(pt_inputs) + + np_pt = pto[0].detach().numpy() + np_tf = tfo[0].numpy() + diff = np.amax(np.abs(np_pt - np_tf)) + print("Max absolute difference between models outputs {}".format(diff)) + assert diff <= 2e-2, "Error, model absolute difference is >2e-2" + + # Save pytorch-model + print("Save TensorFlow model to {}".format(tf_dump_path)) + tf_model.save_weights(tf_dump_path, save_format='h5') + + +def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, + compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False): + assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" + + if args_model_type is None: + model_types = list(MODEL_CLASSES.keys()) + else: + model_types = [args_model_type] + + for j, model_type in enumerate(model_types, start=1): + print("=" * 100) + print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type)) + print("=" * 100) + if model_type not in MODEL_CLASSES: + raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys()))) + + config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] + + if model_shortcut_names_or_path is None: + model_shortcut_names_or_path = list(aws_model_maps.keys()) + if config_shortcut_names_or_path is None: + config_shortcut_names_or_path = model_shortcut_names_or_path + + for i, (model_shortcut_name, config_shortcut_name) in enumerate( + zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1): + print("-" * 100) + if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name: + if not only_convert_finetuned_models: + print(" Skipping finetuned checkpoint {}".format(model_shortcut_name)) + continue + model_type = model_shortcut_name + elif only_convert_finetuned_models: + print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name)) + continue + print(" Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type)) + print("-" * 100) + + if config_shortcut_name in aws_config_map: + config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models) + else: + config_file = cached_path(config_shortcut_name, force_download=not use_cached_models) + + if model_shortcut_name in aws_model_maps: + model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models) + else: + model_file = cached_path(model_shortcut_name, force_download=not 
use_cached_models) + + convert_pt_checkpoint_to_tf(model_type, + model_file, + config_file, + os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), + compare_with_pt_model=compare_with_pt_model) + os.remove(config_file) + os.remove(model_file) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output Tensorflow dump file.") + parser.add_argument("--model_type", + default = None, + type = str, + help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys()))) + parser.add_argument("--pytorch_checkpoint_path", + default = None, + type = str, + help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. " + "If not given, will download and convert all the checkpoints from AWS.") + parser.add_argument("--config_file", + default = None, + type = str, + help = "The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture. If not given and " + "--pytorch_checkpoint_path is not given or is a shortcut name" + "use the configuration associated to the shortcut name on the AWS") + parser.add_argument("--compare_with_pt_model", + action='store_true', + help = "Compare Tensorflow and PyTorch model predictions.") + parser.add_argument("--use_cached_models", + action='store_true', + help = "Use cached models if possible instead of updating to latest checkpoint versions.") + parser.add_argument("--only_convert_finetuned_models", + action='store_true', + help = "Only convert finetuned models.") + args = parser.parse_args() + + # if args.pytorch_checkpoint_path is not None: + # convert_pt_checkpoint_to_tf(args.model_type.lower(), + # args.pytorch_checkpoint_path, + # args.config_file if args.config_file is not None else args.pytorch_checkpoint_path, + # args.tf_dump_path, + # compare_with_pt_model=args.compare_with_pt_model, + # use_cached_models=args.use_cached_models) + # else: + convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None, + args.tf_dump_path, + model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None, + compare_with_pt_model=args.compare_with_pt_model, + use_cached_models=args.use_cached_models, + only_convert_finetuned_models=args.only_convert_finetuned_models) diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py similarity index 98% rename from pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py rename to transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 9f74254daa8..35f01e9907b 100644 --- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -23,12 +23,12 @@ import torch from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer -from pytorch_transformers import (BertConfig, BertEncoder, +from transformers import (BertConfig, BertEncoder, BertIntermediate, BertLayer, BertModel, BertOutput, BertSelfAttention, BertSelfOutput) -from pytorch_transformers import (RobertaEmbeddings, +from transformers import (RobertaEmbeddings, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel) 
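[Editor's note: a minimal, illustrative sketch of how the new transformers/convert_pytorch_checkpoint_to_tf2.py module introduced above might be driven programmatically. This is not part of the patch; it assumes both PyTorch and TensorFlow 2.0 (rc0) are installed, and the shortcut name and output path are placeholders. The call mirrors the convert_pt_checkpoint_to_tf signature defined in the diff.]

# Illustrative only: convert a PyTorch BERT checkpoint to a TF 2.0 HDF5 weights file
# using the converter added in this patch. Requires torch and tensorflow>=2.0.
from transformers.convert_pytorch_checkpoint_to_tf2 import convert_pt_checkpoint_to_tf

convert_pt_checkpoint_to_tf(
    model_type="bert",                               # key into the MODEL_CLASSES mapping above
    pytorch_checkpoint_path="bert-base-uncased",     # shortcut name, resolved through the AWS model archive map
    config_file="bert-base-uncased",                 # likewise resolved through the config archive map
    tf_dump_path="./bert-base-uncased-tf_model.h5",  # written via tf_model.save_weights(..., save_format='h5')
    compare_with_pt_model=True,                      # re-runs the PyTorch model and checks max abs diff <= 2e-2
)

[The script can also be invoked as a module, e.g. `python -m transformers.convert_pytorch_checkpoint_to_tf2 --model_type bert --pytorch_checkpoint_path bert-base-uncased --tf_dump_path ./tf_dumps`; note that in the committed code path only convert_all_pt_checkpoints_to_tf is called, so --tf_dump_path must be an existing directory.]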
diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py similarity index 94% rename from pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py rename to transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index b310b73453c..a5ff4ed22c1 100755 --- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py +++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -23,12 +23,12 @@ from io import open import torch -import pytorch_transformers.tokenization_transfo_xl as data_utils +import transformers.tokenization_transfo_xl as data_utils -from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME -from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, +from transformers import CONFIG_NAME, WEIGHTS_NAME +from transformers import (TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl) -from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) +from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) if sys.version_info[0] == 2: import cPickle as pickle diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py similarity index 84% rename from pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py rename to transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index d6a3cd89e7e..91133ef56af 100755 --- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -23,8 +23,8 @@ from io import open import torch import numpy -from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME -from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES +from transformers import CONFIG_NAME, WEIGHTS_NAME +from transformers.tokenization_xlm import VOCAB_FILES_NAMES import logging logging.basicConfig(level=logging.INFO) @@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p # Load checkpoint chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') - model = chkpt['model'] + state_dict = chkpt['model'] + + # We have the base model one level deeper than the original XLM repository + two_levels_state_dict = {} + for k, v in state_dict.items(): + if 'pred_layer' in k: + two_levels_state_dict[k] = v + else: + two_levels_state_dict['transformer.' 
+ k] = v config = chkpt['params'] config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) @@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) - torch.save(model, pytorch_weights_dump_path) + torch.save(two_levels_state_dict, pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py similarity index 98% rename from pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py rename to transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index a36fa514b59..3669d9944cb 100755 --- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -22,7 +22,7 @@ import os import argparse import torch -from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, +from transformers import (CONFIG_NAME, WEIGHTS_NAME, XLNetConfig, XLNetLMHeadModel, XLNetForQuestionAnswering, XLNetForSequenceClassification, diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py new file mode 100644 index 00000000000..e910d6da2ea --- /dev/null +++ b/transformers/data/__init__.py @@ -0,0 +1,6 @@ +from .processors import InputExample, InputFeatures, DataProcessor +from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features + +from .metrics import is_sklearn_available +if is_sklearn_available(): + from .metrics import glue_compute_metrics diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py new file mode 100644 index 00000000000..c9ebaac38df --- /dev/null +++ b/transformers/data/metrics/__init__.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import sys +import logging + +logger = logging.getLogger(__name__) + +try: + from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import matthews_corrcoef, f1_score + _has_sklearn = True +except (AttributeError, ImportError) as e: + logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") + _has_sklearn = False + +def is_sklearn_available(): + return _has_sklearn + +if _has_sklearn: + + def simple_accuracy(preds, labels): + return (preds == labels).mean() + + + def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + + def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + + def glue_compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py new file mode 100644 index 00000000000..af38c54beba --- /dev/null +++ b/transformers/data/processors/__init__.py @@ -0,0 +1,3 @@ +from .utils import InputExample, InputFeatures, DataProcessor +from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features + diff --git a/examples/utils_glue.py b/transformers/data/processors/glue.py similarity index 55% rename from examples/utils_glue.py rename to transformers/data/processors/glue.py index 3e3f1046727..3010ce98409 100644 --- a/examples/utils_glue.py +++ b/transformers/data/processors/glue.py @@ -13,79 +13,124 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" BERT classification fine-tuning: utilities to work with GLUE tasks """ +""" GLUE processors and helpers """ -from __future__ import absolute_import, division, print_function - -import csv import logging import os -import sys -from io import open -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score +from .utils import DataProcessor, InputExample, InputFeatures +from ...file_utils import is_tf_available + +if is_tf_available(): + import tensorflow as tf logger = logging.getLogger(__name__) +def glue_convert_examples_to_features(examples, tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): + """ + Loads a data file into a list of `InputBatch`s + """ + is_tf_dataset = False + if is_tf_available() and isinstance(examples, tf.data.Dataset): + is_tf_dataset = True -class InputExample(object): - """A single training/test example for simple sequence classification.""" + if task is not None: + processor = glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = glue_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. + label_map = {label: i for i, label in enumerate(label_list)} - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + if is_tf_dataset: + example = InputExample(example['idx'].numpy(), + example['sentence1'].numpy().decode('utf-8'), + example['sentence2'].numpy().decode('utf-8'), + str(example['label'].numpy())) + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + truncate_first_sequence=True # We're truncating the first sequence in priority + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] -class InputFeatures(object): - """A single set of features of data.""" + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id + # Zero-pad up to the sequence length. 
+ padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label)) - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() + if is_tf_available() and is_tf_dataset: + def gen(): + for ex in features: + yield ({'input_ids': ex.input_ids, + 'attention_mask': ex.attention_mask, + 'token_type_ids': ex.token_type_ids}, + ex.label) - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines + return tf.data.Dataset.from_generator(gen, + ({'input_ids': tf.int32, + 'attention_mask': tf.int32, + 'token_type_ids': tf.int32}, + tf.int64), + ({'input_ids': tf.TensorShape([None]), + 'attention_mask': tf.TensorShape([None]), + 'token_type_ids': tf.TensorShape([None])}, + tf.TensorShape([]))) + + return features class MrpcProcessor(DataProcessor): @@ -302,7 +347,7 @@ class QnliProcessor(DataProcessor): def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") def get_labels(self): @@ -387,198 +432,19 @@ class WnliProcessor(DataProcessor): InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples +glue_tasks_num_labels = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, 
+ "rte": 2, + "wnli": 2, +} -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode, - cls_token_at_end=False, - cls_token='[CLS]', - cls_token_segment_id=1, - sep_token='[SEP]', - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - mask_padding_with_zero=True): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ - - label_map = {label : i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = 3 if sep_token_extra else 2 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def simple_accuracy(preds, labels): - return (preds == labels).mean() - - -def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) - return { - "acc": acc, - "f1": f1, - "acc_and_f1": (acc + f1) / 2, - } - - -def pearson_and_spearman(preds, labels): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } - - -def compute_metrics(task_name, preds, labels): - assert len(preds) == len(labels) - if task_name == "cola": - return {"mcc": matthews_corrcoef(labels, preds)} - elif task_name == "sst-2": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mrpc": - return acc_and_f1(preds, labels) - elif task_name == "sts-b": - return pearson_and_spearman(preds, labels) - elif task_name == "qqp": - return acc_and_f1(preds, labels) - elif task_name == "mnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mnli-mm": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "qnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "rte": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "wnli": - return {"acc": simple_accuracy(preds, labels)} - else: - raise KeyError(task_name) - -processors = { +glue_processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, @@ -591,7 +457,7 @@ processors = { "wnli": WnliProcessor, } -output_modes = { +glue_output_modes = { "cola": "classification", "mnli": "classification", "mnli-mm": "classification", @@ -603,15 +469,3 @@ output_modes = { "rte": "classification", "wnli": "classification", } - -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py new file mode 100644 index 00000000000..a6163720548 --- /dev/null +++ b/transformers/data/processors/utils.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import sys +import copy +import json + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. 
+ Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, attention_mask, token_type_ids, label): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.label = label + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines diff --git a/pytorch_transformers/file_utils.py b/transformers/file_utils.py similarity index 89% rename from pytorch_transformers/file_utils.py rename to transformers/file_utils.py index 3fe7fa891de..47fdb6e8bac 100644 --- a/pytorch_transformers/file_utils.py +++ b/transformers/file_utils.py @@ -23,6 +23,24 @@ from botocore.exceptions import ClientError import requests from tqdm import tqdm +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +try: + import tensorflow as tf + assert int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name + +try: + import torch + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) +except ImportError: + _torch_available = False # pylint: disable=invalid-name + + try: from torch.hub import _get_torch_home torch_cache_home = _get_torch_home() @@ -30,7 +48,7 @@ except ImportError: torch_cache_home = os.path.expanduser( os.getenv('TORCH_HOME', os.path.join( os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) -default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers') +default_cache_path = os.path.join(torch_cache_home, 'transformers') try: from urllib.parse import urlparse @@ 
-47,12 +65,18 @@ except (AttributeError, ImportError): default_cache_path)) PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility WEIGHTS_NAME = "pytorch_model.bin" +TF2_WEIGHTS_NAME = 'tf_model.h5' TF_WEIGHTS_NAME = 'model.ckpt' CONFIG_NAME = "config.json" -logger = logging.getLogger(__name__) # pylint: disable=invalid-name +def is_torch_available(): + return _torch_available + +def is_tf_available(): + return _tf_available if not six.PY2: def add_start_docstrings(*docstr): @@ -83,6 +107,9 @@ def url_to_filename(url, etag=None): Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, delimited by a period. + If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name + so that TF 2.0 can identify it as a HDF5 file + (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ url_bytes = url.encode('utf-8') url_hash = sha256(url_bytes) @@ -93,6 +120,9 @@ def url_to_filename(url, etag=None): etag_hash = sha256(etag_bytes) filename += '.' + etag_hash.hexdigest() + if url.endswith('.h5'): + filename += '.h5' + return filename @@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None): Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. """ if cache_dir is None: - cache_dir = PYTORCH_TRANSFORMERS_CACHE + cache_dir = TRANSFORMERS_CACHE if sys.version_info[0] == 3 and isinstance(cache_dir, Path): cache_dir = str(cache_dir) @@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N force_download: if True, re-dowload the file even if it's already cached in the cache dir. """ if cache_dir is None: - cache_dir = PYTORCH_TRANSFORMERS_CACHE + cache_dir = TRANSFORMERS_CACHE if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): url_or_filename = str(url_or_filename) if sys.version_info[0] == 3 and isinstance(cache_dir, Path): @@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): If it's not there, download it. Then return the path to the cached file. """ if cache_dir is None: - cache_dir = PYTORCH_TRANSFORMERS_CACHE + cache_dir = TRANSFORMERS_CACHE if sys.version_info[0] == 3 and isinstance(cache_dir, Path): cache_dir = str(cache_dir) if sys.version_info[0] == 2 and not isinstance(cache_dir, str): diff --git a/pytorch_transformers/modeling_auto.py b/transformers/modeling_auto.py similarity index 90% rename from pytorch_transformers/modeling_auto.py rename to transformers/modeling_auto.py index 31c8fafaa90..b76a883b19c 100644 --- a/pytorch_transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -36,7 +36,7 @@ logger = logging.getLogger(__name__) class AutoModel(object): r""" - :class:`~pytorch_transformers.AutoModel` is a generic model class + :class:`~transformers.AutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method. @@ -84,23 +84,23 @@ class AutoModel(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
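The new availability helpers above are meant to let downstream code guard framework-specific imports. A small sketch of the intended usage (the print statements are only illustrative):

    from transformers.file_utils import is_tf_available, is_torch_available

    if is_torch_available():
        import torch
        print("PyTorch backend:", torch.__version__)
    if is_tf_available():
        import tensorflow as tf
        print("TensorFlow 2.0 backend:", tf.__version__)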
- - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -120,7 +120,7 @@ class AutoModel(object): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 
+ - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -157,7 +157,7 @@ class AutoModel(object): class AutoModelWithLMHead(object): r""" - :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class + :class:`~transformers.AutoModelWithLMHead` is a generic model class that will be instantiated as one of the language modeling model classes of the library when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` class method. @@ -208,23 +208,23 @@ class AutoModelWithLMHead(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. 
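The `state_dict` argument described above is not covered by the docstring examples; a hedged sketch of that path, assuming weights were previously saved to a hypothetical ./my_model_directory/:

    import torch
    from transformers import AutoModelWithLMHead

    # Any state dict with matching parameter names works; the path is illustrative.
    state_dict = torch.load('./my_model_directory/pytorch_model.bin', map_location='cpu')
    model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', state_dict=state_dict)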
cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -244,7 +244,7 @@ class AutoModelWithLMHead(object): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -281,7 +281,7 @@ class AutoModelWithLMHead(object): class AutoModelForSequenceClassification(object): r""" - :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class + :class:`~transformers.AutoModelForSequenceClassification` is a generic model class that will be instantiated as one of the sequence classification model classes of the library when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` class method. @@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. 
Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object): class AutoModelForQuestionAnswering(object): r""" - :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class + :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class that will be instantiated as one of the question answering model classes of the library when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` class method. @@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
- - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 
+ - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: diff --git a/pytorch_transformers/modeling_bert.py b/transformers/modeling_bert.py similarity index 97% rename from pytorch_transformers/modeling_bert.py rename to transformers/modeling_bert.py index dc3700d26bf..51e407d0a64 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -118,19 +118,24 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): def gelu(x): - """Implementation of the gelu activation function. + """ Original Implementation of the gelu activation function in Google Bert repo when initialy created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) +def gelu_new(x): + """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). + Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) def swish(x): return x * torch.sigmoid(x) -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new} BertLayerNorm = torch.nn.LayerNorm @@ -191,7 +196,7 @@ class BertSelfAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward(self, hidden_states, attention_mask, head_mask=None): + def forward(self, hidden_states, attention_mask=None, head_mask=None): mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) @@ -203,8 +208,9 @@ class BertSelfAttention(nn.Module): # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / math.sqrt(self.attention_head_size) - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
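The modeling_bert.py hunk above keeps the exact erf-based GELU and adds the tanh approximation under the name gelu_new. Restating both definitions standalone, they agree to within roughly 1e-3, and ACT2FN simply maps activation names to these callables:

    import math
    import torch

    def gelu(x):
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

    def gelu_new(x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    x = torch.linspace(-3, 3, steps=7)
    assert torch.allclose(gelu(x), gelu_new(x), atol=1e-3)

    # Same lookup-by-name idea as the ACT2FN dict in the hunk above.
    act = {"gelu": gelu, "gelu_new": gelu_new}["gelu_new"]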
attention_probs = nn.Softmax(dim=-1)(attention_scores) @@ -271,7 +277,7 @@ class BertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - def forward(self, input_tensor, attention_mask, head_mask=None): + def forward(self, input_tensor, attention_mask=None, head_mask=None): self_outputs = self.self(input_tensor, attention_mask, head_mask) attention_output = self.output(self_outputs[0], input_tensor) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them @@ -314,7 +320,7 @@ class BertLayer(nn.Module): self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - def forward(self, hidden_states, attention_mask, head_mask=None): + def forward(self, hidden_states, attention_mask=None, head_mask=None): attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = attention_outputs[0] intermediate_output = self.intermediate(attention_output) @@ -330,7 +336,7 @@ class BertEncoder(nn.Module): self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states, attention_mask, head_mask=None): + def forward(self, hidden_states, attention_mask=None, head_mask=None): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): @@ -476,9 +482,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ BERT_INPUTS_DOCSTRING = r""" @@ -502,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r""" Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. 
Mask values selected in ``[0, 1]``: diff --git a/pytorch_transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py similarity index 99% rename from pytorch_transformers/modeling_distilbert.py rename to transformers/modeling_distilbert.py index c5cc44be750..2425ab5f479 100644 --- a/pytorch_transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r""" https://medium.com/huggingface/distilbert-8cf3380435b5 Parameters: - config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" diff --git a/pytorch_transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py similarity index 97% rename from pytorch_transformers/modeling_gpt2.py rename to transformers/modeling_gpt2.py index 85c09e240f8..bc852240220 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -280,9 +280,9 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Inputs: @@ -290,9 +290,9 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: Indices of input sequence tokens in the vocabulary. GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.GPT2Tokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
**past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model @@ -367,6 +367,13 @@ class GPT2Model(GPT2PreTrainedModel): self.h[layer].attn.prune_heads(heads) def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + if position_ids is not None: + position_ids = position_ids.view(-1, input_shape[-1]) + if past is None: past_length = 0 past = [None] * len(self.h) @@ -378,6 +385,7 @@ class GPT2Model(GPT2PreTrainedModel): # Attention mask. if attention_mask is not None: + attention_mask = attention_mask.view(-1, input_shape[-1]) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] @@ -407,14 +415,9 @@ class GPT2Model(GPT2PreTrainedModel): else: head_mask = [None] * self.config.n_layer - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_ids.size(-1)) - position_ids = position_ids.view(-1, position_ids.size(-1)) - inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) token_type_embeds = self.wte(token_type_ids) else: token_type_embeds = 0 @@ -490,7 +493,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): Examples:: import torch - from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel + from transformers import GPT2Tokenizer, GPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') @@ -586,7 +589,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Examples:: import torch - from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') diff --git a/pytorch_transformers/modeling_openai.py b/transformers/modeling_openai.py similarity index 98% rename from pytorch_transformers/modeling_openai.py rename to transformers/modeling_openai.py index 4b02baf2f4b..2827bf11e50 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: @@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: Indices of input sequence tokens in the vocabulary. GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. 
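The broadcasting comment in the GPT2Model hunk above is the key to how the 2D padding mask becomes an additive bias on the attention scores. A standalone sketch (the -10000.0 fill value follows the library's usual convention and is assumed here):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])        # (batch_size, seq_length); 0 = padding
    extended = attention_mask[:, None, None, :].float()  # (batch_size, 1, 1, seq_length)
    extended = (1.0 - extended) * -10000.0               # 0.0 where attended, -10000.0 where masked
    # `extended` broadcasts against attention scores of shape
    # (batch_size, num_heads, seq_length, seq_length) before the softmax.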
- Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.BPT2Tokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: diff --git a/pytorch_transformers/modeling_roberta.py b/transformers/modeling_roberta.py similarity index 97% rename from pytorch_transformers/modeling_roberta.py rename to transformers/modeling_roberta.py index 9b30bcd4be3..04ffbecc168 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -77,9 +77,9 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" @@ -102,8 +102,8 @@ ROBERTA_INPUTS_DOCSTRING = r""" RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: @@ -361,9 +361,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel): ``token_type_ids: 0 0 0 0 0 0 0`` - Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: Segment token indices to indicate first and second portions of the inputs. The second dimension of the input (`num_choices`) indicates the number of choices to score. diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py new file mode 100644 index 00000000000..a8f1de047f7 --- /dev/null +++ b/transformers/modeling_tf_auto.py @@ -0,0 +1,501 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering +from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel +from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel +from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel +from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple +from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple +from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification +from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification + +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + + +class TFAutoModel(object): + r""" + :class:`~transformers.TFAutoModel` is a generic model class + that will be instantiated as one of the base model classes of the library + when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: TFDistilBertModel (DistilBERT model) + - contains `roberta`: TFRobertaModel (RoBERTa model) + - contains `bert`: TFBertModel (Bert model) + - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model) + - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model) + - contains `xlnet`: TFXLNetModel (XLNet model) + - contains `xlm`: TFXLMModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("TFAutoModel is designed to be instantiated " + "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the base model classes of the library + from a pre-trained model configuration. 
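As the constructor guard above shows, the TF auto classes are not meant to be built directly. A tiny illustration, assuming TensorFlow 2.0 is installed so the TF classes are importable from the top-level package:

    from transformers import TFAutoModel

    try:
        TFAutoModel()
    except EnvironmentError as exc:
        print(exc)  # points to TFAutoModel.from_pretrained(...) instead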
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: TFDistilBertModel (DistilBERT model) + - contains `roberta`: TFRobertaModel (RoBERTa model) + - contains `bert`: TFTFBertModel (Bert model) + - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model) + - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model) + - contains `xlnet`: TFXLNetModel (XLNet model) + - contains `xlm`: TFXLMModel (XLM model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. + + from_pt: (`Optional`) Boolean + Set to True if the Checkpoint is a PyTorch checkpoint. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = TFAutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = TFAutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class TFAutoModelWithLMHead(object): + r""" + :class:`~transformers.TFAutoModelWithLMHead` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
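Note that the dispatch above is order-sensitive substring matching: 'distilbert' and 'roberta' are tested before 'bert', so identifiers containing both resolve to the more specific class. A rough standalone illustration of the same idea:

    def resolve(name):
        # Mirrors the ordering of the elif chain above (illustrative subset only).
        for key, cls_name in [('distilbert', 'TFDistilBertModel'),
                              ('roberta', 'TFRobertaModel'),
                              ('bert', 'TFBertModel')]:
            if key in name:
                return cls_name
        raise ValueError("Unrecognized model identifier in {}".format(name))

    assert resolve('distilbert-base-uncased') == 'TFDistilBertModel'
    assert resolve('roberta-base') == 'TFRobertaModel'
    assert resolve('bert-base-cased') == 'TFBertModel'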
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) + - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model) + - contains `bert`: TFBertForMaskedLM (Bert model) + - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: TFXLNetLMHeadModel (XLNet model) + - contains `xlm`: TFXLMWithLMHeadModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " + "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) + - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model) + - contains `bert`: TFBertForMaskedLM (Bert model) + - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: TFXLNetLMHeadModel (XLNet model) + - contains `xlm`: TFXLMWithLMHeadModel (XLM model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. + + from_pt: (`Optional`) Boolean + Set to True if the Checkpoint is a PyTorch checkpoint. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. 
+ + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class TFAutoModelForSequenceClassification(object): + r""" + :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class + that will be instantiated as one of the sequence classification model classes of the library + when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) + - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model) + - contains `bert`: TFBertForSequenceClassification (Bert model) + - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model) + - contains `xlm`: TFXLMForSequenceClassification (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " + "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration. 
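Beyond loading, a minimal end-to-end sketch of the TF 2.0 LM-head path dispatched above; the tokenizer call follows the library's existing encode() API, and the output shape comment is indicative only:

    import tensorflow as tf
    from transformers import BertTokenizer, TFAutoModelWithLMHead

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')  # resolves to TFBertForMaskedLM

    input_ids = tf.constant([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])
    outputs = model(input_ids)
    prediction_scores = outputs[0]  # (batch_size, sequence_length, vocab_size)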
+
+
+class TFAutoModelForSequenceClassification(object):
+    r"""
+        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
+        that will be instantiated as one of the sequence classification model classes of the library
+        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: TFBertForSequenceClassification (Bert model)
+            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
+            "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the sequence classification model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: TFBertForSequenceClassification (Bert model)
+            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
+
+        The returned model is a TF 2.0 Keras model; Dropout modules are deactivated by default
+        and are only active when the model is called with ``training=True``.
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+
+            from_pt: (`optional`) boolean:
+                Set to True if the checkpoint is a PyTorch checkpoint.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). The behavior differs depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done).
+                - If a configuration is not provided, ``kwargs`` will first be passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
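+
+
+# Editorial sketch (not part of the upstream code): because the classes returned above are
+# tf.keras.Model sub-classes, a typical fine-tuning loop can rely on the standard Keras API.
+# `train_dataset` and the optimizer settings below are placeholders.
+#
+#   import tensorflow as tf
+#   model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
+#   optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+#   loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+#   model.compile(optimizer=optimizer, loss=loss)
+#   model.fit(train_dataset, epochs=2)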
+
+
+class TFAutoModelForQuestionAnswering(object):
+    r"""
+        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
+        that will be instantiated as one of the question answering model classes of the library
+        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: TFBertForQuestionAnswering (Bert model)
+            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: TFBertForQuestionAnswering (Bert model)
+            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
+
+        The returned model is a TF 2.0 Keras model; Dropout modules are deactivated by default
+        and are only active when the model is called with ``training=True``.
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
+
+            from_pt: (`optional`) boolean:
+                Set to True if the checkpoint is a PyTorch checkpoint.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+ In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py new file mode 100644 index 00000000000..d763ca991e2 --- /dev/null +++ b/transformers/modeling_tf_bert.py @@ -0,0 +1,1044 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 BERT model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .configuration_bert import BertConfig +from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + + +TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", +} + + +def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) + return 
load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initialy created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + +def gelu_new(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + +def swish(x): + return x * tf.sigmoid(x) + + +ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), + "gelu_new": tf.keras.layers.Activation(gelu_new)} + + +class TFBertEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, config, **kwargs): + super(TFBertEmbeddings, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name='position_embeddings') + self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name='token_type_embeddings') + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range)) + super(TFBertEmbeddings, self).build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. 
+ + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids = inputs + + seq_length = tf.shape(input_ids)[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(tf.shape(input_ids), 0) + + words_embeddings = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. + """ + batch_size = tf.shape(inputs)[0] + length = tf.shape(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFBertSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertSelfAttention, self).__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + assert config.hidden_size % config.num_attention_heads == 0 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = tf.keras.layers.Dense(self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name='query') + self.key = tf.keras.layers.Dense(self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name='key') + self.value = tf.keras.layers.Dense(self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name='value') + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + batch_size = tf.shape(hidden_states)[0] + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" 
and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores + attention_scores = attention_scores / tf.math.sqrt(dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + context_layer = tf.reshape(context_layer, + (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class TFBertSelfOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertSelfOutput, self).__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertAttention, self).__init__(**kwargs) + self.self_attention = TFBertSelfAttention(config, name='self') + self.dense_output = TFBertSelfOutput(config, name='output') + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + input_tensor, attention_mask, head_mask = inputs + + self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training) + attention_output = self.dense_output([self_outputs[0], input_tensor], training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertIntermediate(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertIntermediate, self).__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.intermediate_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class TFBertOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertOutput, self).__init__(**kwargs) + 
self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertLayer, self).__init__(**kwargs) + self.attention = TFBertAttention(config, name='attention') + self.intermediate = TFBertIntermediate(config, name='intermediate') + self.bert_output = TFBertOutput(config, name='output') + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output([intermediate_output, attention_output], training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertEncoder, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)] + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # outputs, (hidden states), (attentions) + + +class TFBertPooler(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertPooler, self).__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation='tanh', + name='dense') + + def call(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + return pooled_output + + +class TFBertPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertPredictionHeadTransform, self).__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class TFBertLMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super(TFBertLMPredictionHead, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + self.transform = TFBertPredictionHeadTransform(config, name='transform') + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), + initializer='zeros', + trainable=True, + name='bias') + super(TFBertLMPredictionHead, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +class TFBertMLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super(TFBertMLMHead, self).__init__(**kwargs) + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions') + + def call(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class TFBertNSPHead(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertNSPHead, self).__init__(**kwargs) + self.seq_relationship = tf.keras.layers.Dense(2, + kernel_initializer=get_initializer(config.initializer_range), + name='seq_relationship') + + def call(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class TFBertMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFBertMainLayer, self).__init__(**kwargs) + self.num_hidden_layers = config.num_hidden_layers + + self.embeddings = TFBertEmbeddings(config, name='embeddings') + self.encoder = TFBertEncoder(config, name='encoder') + self.pooler = TFBertPooler(config, name='pooler') + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + assert len(inputs) <= 5, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 5, "Too many inputs." + else: + input_ids = inputs + + if attention_mask is None: + attention_mask = tf.fill(tf.shape(input_ids), 1) + if token_type_ids is None: + token_type_ids = tf.fill(tf.shape(input_ids), 0) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if not head_mask is None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids], training=training) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +class TFBertPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. 
+ """ + config_class = BertConfig + pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_pt_weights = load_bert_pt_weights_in_tf2 + base_model_prefix = "bert" + + +BERT_START_DOCSTRING = r""" The BERT model was proposed in + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`tf.keras.Model`: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model + + Note on the model inputs: + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : + + - a single Tensor with input_ids only and nothing else: `model(inputs_ids) + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associaed to the input names given in the docstring: + `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. 
+ Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertModel(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertModel + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertModel.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertModel, self).__init__(config, *inputs, **kwargs) + self.bert = TFBertMainLayer(config, name='bert') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForPreTraining(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ```tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **seq_relationship_scores**: ```tf.Tensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ```tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ```tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForPreTraining + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name='bert') + self.nsp = TFBertNSPHead(config, name='nsp___cls') + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + seq_relationship_score = self.nsp(pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForMaskedLM(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMaskedLM + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name='bert') + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + return outputs # prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForNextSentencePrediction(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **seq_relationship_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForNextSentencePrediction + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + seq_relationship_scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name='bert') + self.nsp = TFBertNSPHead(config, name='nsp___cls') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + seq_relationship_score = self.nsp(pooled_output) + + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForSequenceClassification(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForSequenceClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name='bert') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForMultipleChoice(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **classification_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMultipleChoice + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices + outputs = model(input_ids) + classification_scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name='bert') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(1, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + assert len(inputs) <= 5, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 5, "Too many inputs." + else: + input_ids = inputs + + num_choices = tf.shape(input_ids)[1] + seq_length = tf.shape(input_ids)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + + outputs = self.bert(flat_inputs, training=training) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForTokenClassification(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForTokenClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name='bert') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class TFBertForQuestionAnswering(TFBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
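+
+    A minimal decoding sketch (greedy argmax over the scores, ignoring constraints such as ``end >= start``)::
+
+        answer_start = tf.argmax(start_scores, axis=1)
+        answer_end = tf.argmax(end_scores, axis=1)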
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForQuestionAnswering + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name='bert') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='qa_outputs') + + def call(self, inputs, **kwargs): + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py new file mode 100644 index 00000000000..2a917a30a45 --- /dev/null +++ b/transformers/modeling_tf_distilbert.py @@ -0,0 +1,745 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 DistilBERT model +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import copy +import sys +from io import open + +import itertools + +import numpy as np +import tensorflow as tf + +from .configuration_distilbert import DistilBertConfig +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + + +TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5" +} + + +### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initialy created. 
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + +def gelu_new(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + +def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + tf_inputs = [inputs_list, attns_list] + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + +class TFEmbeddings(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFEmbeddings, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + self.dim = config.dim + self.initializer_range = config.initializer_range + self.word_embeddings = TFSharedEmbeddings(config.vocab_size, + config.dim, + initializer_range=config.initializer_range, + name='word_embeddings') # padding_idx=0) + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, + config.dim, + embeddings_initializer=get_initializer(config.initializer_range), + name='position_embeddings') + if config.sinusoidal_pos_embds: + raise NotImplementedError + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.dim], + initializer=get_initializer(self.initializer_range)) + super(TFEmbeddings, self).build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """ + Parameters + ---------- + input_ids: tf.Tensor(bs, max_seq_length) + The token ids to embed. 
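+        position_ids: tf.Tensor(bs, max_seq_length), optional
+            The position ids to embed; when omitted, positions default to ``tf.range(seq_length)`` (see below).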
+ + Outputs + ------- + embeddings: tf.Tensor(bs, max_seq_length, dim) + The embedded tokens (plus position embeddings, no token_type embeddings) + """ + if not isinstance(inputs, (tuple, list)): + input_ids = inputs + position_ids = None + else: + input_ids, position_ids = inputs + + seq_length = tf.shape(input_ids)[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + + word_embeddings = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + + embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. + """ + batch_size = tf.shape(inputs)[0] + length = tf.shape(inputs)[1] + + x = tf.reshape(inputs, [-1, self.dim]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFMultiHeadSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFMultiHeadSelfAttention, self).__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.output_attentions = config.output_attentions + + assert self.dim % self.n_heads == 0 + + self.q_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="q_lin") + self.k_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="k_lin") + self.v_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="v_lin") + self.out_lin = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="out_lin") + + self.pruned_heads = set() + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + """ + Parameters + ---------- + query: tf.Tensor(bs, seq_length, dim) + key: tf.Tensor(bs, seq_length, dim) + value: tf.Tensor(bs, seq_length, dim) + mask: tf.Tensor(bs, seq_length) + + Outputs + ------- + weights: tf.Tensor(bs, n_heads, seq_length, seq_length) + Attention weights + context: tf.Tensor(bs, seq_length, dim) + Contextualized layer. 
Optional: only if `output_attentions=True` + """ + query, key, value, mask, head_mask = inputs + bs, q_length, dim = shape_list(query) + k_length = shape_list(key)[1] + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + assert 2 <= len(tf.shape(mask)) <= 3 + causal = (len(tf.shape(mask)) == 3) + mask_reshape = [bs, 1, 1, k_length] + + def shape(x): + """ separate heads """ + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """ group heads """ + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + scores = scores - 1e30 * (1.0 - mask) + + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if self.output_attentions: + return (context, weights) + else: + return (context,) + +class TFFFN(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFFFN, self).__init__(**kwargs) + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.lin1 = tf.keras.layers.Dense(config.hidden_dim, + kernel_initializer=get_initializer(config.initializer_range), + name="lin1") + self.lin2 = tf.keras.layers.Dense(config.dim, + kernel_initializer=get_initializer(config.initializer_range), + name="lin2") + assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) + self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu + + def call(self, input, training=False): + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + return x + + +class TFTransformerBlock(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFTransformerBlock, self).__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.hidden_dim = config.hidden_dim + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation = config.activation + self.output_attentions = config.output_attentions + + assert config.dim % config.n_heads == 0 + + self.attention = TFMultiHeadSelfAttention(config, name="attention") + self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") + + self.ffn = TFFFN(config, name="ffn") + self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") + + def call(self, inputs, training=False): # removed: src_enc=None, src_len=None + """ + Parameters + ---------- + x: tf.Tensor(bs, seq_length, dim) + attn_mask: 
tf.Tensor(bs, seq_length) + + Outputs + ------- + sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length) + The attention weights + ffn_output: tf.Tensor(bs, seq_length, dim) + The output of the transformer block contextualization. + """ + x, attn_mask, head_mask = inputs + + # Self-Attention + sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training) + if self.output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + # assert type(sa_output) == tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) + ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if self.output_attentions: + output = (sa_weights,) + output + return output + + +class TFTransformer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFTransformer, self).__init__(**kwargs) + self.n_layers = config.n_layers + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i)) + for i in range(config.n_layers)] + + def call(self, inputs, training=False): + """ + Parameters + ---------- + x: tf.Tensor(bs, seq_length, dim) + Input sequence embedded. + attn_mask: tf.Tensor(bs, seq_length) + Attention mask on the sequence. + + Outputs + ------- + hidden_state: tf.Tensor(bs, seq_length, dim) + Sequence of hiddens states in the last (top) layer + all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. 
+ Optional: only if output_hidden_states=True + all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + x, attn_mask, head_mask = inputs + + all_hidden_states = () + all_attentions = () + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training) + hidden_state = layer_outputs[-1] + + if self.output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1 + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + outputs = (hidden_state,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class TFDistilBertMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFDistilBertMainLayer, self).__init__(**kwargs) + self.num_hidden_layers = config.num_hidden_layers + + self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings + self.transformer = TFTransformer(config, name="transformer") # Encoder + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + def call(self, inputs, attention_mask=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + head_mask = inputs[2] if len(inputs) > 2 else head_mask + assert len(inputs) <= 3, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 3, "Too many inputs." + else: + input_ids = inputs + + if attention_mask is None: + attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length) + attention_mask = tf.cast(attention_mask, dtype=tf.float32) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + + embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim) + tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) + + return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + + +### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +class TFDistilBertPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
+    """
+    config_class = DistilBertConfig
+    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_distilbert_pt_weights_in_tf2
+    base_model_prefix = "distilbert"
+
+
+DISTILBERT_START_DOCSTRING = r"""
+    DistilBERT is a small, fast, cheap and light Transformer model
+    trained by distilling Bert base. It has 40% fewer parameters than
+    `bert-base-uncased`, runs 60% faster while preserving over 95% of
+    Bert's performance as measured on the GLUE language understanding benchmark.
+
+    Here are the differences between the interface of Bert and DistilBert:
+
+    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
+    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
+
+    For more information on DistilBERT, please refer to our
+    `detailed blog post`_
+
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
+
+    .. _`detailed blog post`:
+        https://medium.com/huggingface/distilbert-8cf3380435b5
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+DISTILBERT_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
+
+            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+ **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class TFDistilBertModel(TFDistilBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import DistilBertTokenizer, TFDistilBertModel + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = TFDistilBertModel.from_pretrained('distilbert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs) + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings + + def call(self, inputs, **kwargs): + outputs = self.distilbert(inputs, **kwargs) + return outputs + + +class TFDistilBertLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super(TFDistilBertLMHead, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), + initializer='zeros', + trainable=True, + name='bias') + super(TFDistilBertLMHead, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.vocab_size = config.vocab_size
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.vocab_transform = tf.keras.layers.Dense(config.dim,
+                                                     kernel_initializer=get_initializer(config.initializer_range),
+                                                     name="vocab_transform")
+        self.act = tf.keras.layers.Activation(gelu)
+        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
+        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
+
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+
+        hidden_states = distilbert_output[0]                            # (bs, seq_length, dim)
+        prediction_logits = self.vocab_transform(hidden_states)         # (bs, seq_length, dim)
+        prediction_logits = self.act(prediction_logits)                 # (bs, seq_length, dim)
+        prediction_logits = self.vocab_layer_norm(prediction_logits)    # (bs, seq_length, dim)
+        prediction_logits = self.vocab_projector(prediction_logits)
+
+        outputs = (prediction_logits,) + distilbert_output[1:]
+        return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.pre_classifier = tf.keras.layers.Dense(config.dim,
+                                                    kernel_initializer=get_initializer(config.initializer_range),
+                                                    activation='relu',
+                                                    name="pre_classifier")
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name="classifier")
+        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
+
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+
+        hidden_state = distilbert_output[0]                                  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]                                   # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)                   # (bs, dim)
+        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))  # (bs, dim)
+        logits = self.classifier(pooled_output)                              # (bs, num_labels)
+
+        outputs = (logits,) + distilbert_output[1:]
+        return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
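+
+    Both score tensors come from a single ``Dense(config.num_labels)`` projection (``config.num_labels``
+    is expected to be 2) whose output is split along the last axis, roughly::
+
+        logits = qa_outputs(hidden_states)                # (bs, seq_length, 2)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)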
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        start_scores, end_scores = outputs[:2]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+
+        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
+        assert config.num_labels == 2
+        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
+
+    def call(self, inputs, **kwargs):
+        distilbert_output = self.distilbert(inputs, **kwargs)
+
+        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
+        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)                              # (bs, max_query_len, 2)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        outputs = (start_logits, end_logits,) + distilbert_output[1:]
+        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
new file mode 100644
index 00000000000..e958c2cbf17
--- /dev/null
+++ b/transformers/modeling_tf_gpt2.py
@@ -0,0 +1,613 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 OpenAI GPT-2 model. 
""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, + TFSequenceSummary, shape_list, get_initializer) +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + +TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"} + + +def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +def gelu(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +class TFAttention(tf.keras.layers.Layer): + def __init__(self, nx, n_ctx, config, scale=False, **kwargs): + super(TFAttention, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.n_ctx = n_ctx + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.pruned_heads = set() + + def prune_heads(self, heads): + pass + + @staticmethod + def causal_attention_mask(nd, ns, dtype): + """1's in the lower triangle, counting from the lower right corner. + Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. + """ + i = tf.range(nd)[:,None] + j = tf.range(ns) + m = i >= j - ns + nd + return tf.cast(m, dtype) + + def _attn(self, inputs, training=False): + q, k, v, attention_mask, head_mask = inputs + # q, k, v have shape [batch, heads, sequence, features] + w = tf.matmul(q, k, transpose_b=True) + if self.scale: + dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores + w = w / tf.math.sqrt(dk) + + # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
+ _, _, nd, ns = shape_list(w) + b = self.causal_attention_mask(nd, ns, dtype=w.dtype) + b = tf.reshape(b, [1, 1, nd, ns]) + w = w * b - 1e4 * (1 - b) + + if attention_mask is not None: + # Apply the attention mask + w = w + attention_mask + + w = tf.nn.softmax(w, axis=-1) + w = self.attn_dropout(w, training=training) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + outputs = [tf.matmul(w, v)] + if self.output_attentions: + outputs.append(w) + return outputs + + def merge_heads(self, x): + x = tf.transpose(x, [0, 2, 1, 3]) + x_shape = shape_list(x) + new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] + return tf.reshape(x, new_x_shape) + + def split_heads(self, x): + x_shape = shape_list(x) + new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) + + def call(self, inputs, training=False): + x, layer_past, attention_mask, head_mask = inputs + + x = self.c_attn(x) + query, key, value = tf.split(x, 3, axis=2) + query = self.split_heads(query) + key = self.split_heads(key) + value = self.split_heads(value) + if layer_past is not None: + past_key, past_value = tf.unstack(layer_past, axis=1) + key = tf.concat([past_key, key], axis=-2) + value = tf.concat([past_value, value], axis=-2) + present = tf.stack([key, value], axis=1) + + attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) + a = attn_outputs[0] + + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a, training=training) + + outputs = [a, present] + attn_outputs[1:] + return outputs # a, present, (attentions) + + +class TFMLP(tf.keras.layers.Layer): + def __init__(self, n_state, config, **kwargs): + super(TFMLP, self).__init__(**kwargs) + nx = config.n_embd + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.act = gelu + self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) + + def call(self, x, training=False): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + h2 = self.dropout(h2, training=training) + return h2 + + +class TFBlock(tf.keras.layers.Layer): + def __init__(self, n_ctx, config, scale=False, **kwargs): + super(TFBlock, self).__init__(**kwargs) + nx = config.n_embd + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') + self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') + self.mlp = TFMLP(4 * nx, config, name='mlp') + + def call(self, inputs, training=False): + x, layer_past, attention_mask, head_mask = inputs + + a = self.ln_1(x) + output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training) + a = output_attn[0] # output_attn: a, present, (attentions) + x = x + a + + m = self.ln_2(x) + m = self.mlp(m, training=training) + x = x + m + + outputs = [x] + output_attn[1:] + return outputs # x, present, (attentions) + + +class TFGPT2MainLayer(tf.keras.layers.Layer): + def __init__(self, config, *inputs, **kwargs): + super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs) + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.num_hidden_layers = config.n_layer + self.vocab_size = config.vocab_size + self.n_embd 
= config.n_embd + + self.wte = TFSharedEmbeddings(config.vocab_size, + config.hidden_size, + initializer_range=config.initializer_range, + name='wte') + self.wpe = tf.keras.layers.Embedding(config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name='wpe') + self.drop = tf.keras.layers.Dropout(config.embd_pdrop) + self.h = [TFBlock(config.n_ctx, + config, + scale=True, + name='h_._{}'.format(i)) for i in range(config.n_layer)] + self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + raise NotImplementedError + + def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + past = inputs[1] if len(inputs) > 1 else past + attention_mask = inputs[2] if len(inputs) > 2 else attention_mask + token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids + position_ids = inputs[4] if len(inputs) > 4 else position_ids + head_mask = inputs[5] if len(inputs) > 5 else head_mask + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + past = inputs.get('past', past) + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = shape_list(past[0][0])[-2] + if position_ids is None: + position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] + + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
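+            # For example (assumed values): a padding mask of [1, 1, 0] becomes [0.0, 0.0, -10000.0]
+            # after the cast and rescaling below, so padded positions contribute almost nothing to the softmax.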
+ + attention_mask = tf.cast(attention_mask, tf.float32) + attention_mask = (1.0 - attention_mask) * -10000.0 + else: + attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if not head_mask is None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) + + inputs_embeds = self.wte(input_ids, mode='embedding') + position_embeds = self.wpe(position_ids) + if token_type_ids is not None: + token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) + token_type_embeds = self.wte(token_type_ids, mode='embedding') + else: + token_type_embeds = 0 + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states, training=training) + + output_shape = input_shape + [shape_list(hidden_states)[-1]] + + presents = () + all_attentions = [] + all_hidden_states = () + for i, (block, layer_past) in enumerate(zip(self.h, past)): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) + + outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training) + + hidden_states, present = outputs[:2] + presents = presents + (present,) + + if self.output_attentions: + all_attentions.append(outputs[2]) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = tf.reshape(hidden_states, output_shape) + # Add last hidden state + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states, presents) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + # let the number of heads free (-1) so we can extract attention even after head pruning + attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] + all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) + outputs = outputs + (all_attentions,) + return outputs # last hidden state, presents, (all hidden_states), (attentions) + + +class TFGPT2PreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = GPT2Config + pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP + load_pt_weights = load_gpt2_pt_weights_in_tf2 + base_model_prefix = "transformer" + + +GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in + `Language Models are Unsupervised Multitask Learners`_ + by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. + It's a causal (unidirectional) transformer pre-trained using language modeling on a very large + corpus of ~40 GB of text data. + + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. 
_`Language Models are Unsupervised Multitask Learners`:
+        https://openai.com/blog/better-language-models/
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+GPT2_INPUTS_DOCSTRING = r"""    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **past**:
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence token in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules. 
+ Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class TFGPT2Model(TFGPT2PreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **past**: + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import GPT2Tokenizer, TFGPT2Model + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = TFGPT2Model.from_pretrained('gpt2') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) + self.transformer = TFGPT2MainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **past**: + list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of `tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import GPT2Tokenizer, TFGPT2LMHeadModel + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = TFGPT2LMHeadModel.from_pretrained('gpt2') + + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFGPT2MainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + hidden_states = transformer_outputs[0] + + lm_logits = self.transformer.wte(hidden_states, mode="linear") + + outputs = (lm_logits,) + transformer_outputs[1:] + + return outputs # lm_logits, presents, (all hidden_states), (attentions) + + +@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. +The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). +""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): + r""" + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). + **past**: + list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of `tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') + + # Add a [CLS] to the vocabulary (we should train it also!) + # This option is currently not implemented in TF 2.0 + raise NotImplementedError + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary + + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + encoded_choices = [tokenizer.encode(s) for s in choices] + cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 + mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 + + outputs = model(input_ids, mc_token_ids=mc_token_ids) + lm_prediction_scores, mc_prediction_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFGPT2MainLayer(config, name='transformer') + self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + + def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + past = inputs[1] if len(inputs) > 1 else past + attention_mask = inputs[2] if len(inputs) > 2 else attention_mask + token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids + position_ids = inputs[4] if len(inputs) > 4 else position_ids + head_mask = inputs[5] if len(inputs) > 5 else head_mask + mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids + assert len(inputs) <= 7, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + past = inputs.get('past', past) + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + assert len(inputs) <= 7, "Too many inputs." 
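+        # Otherwise `inputs` is assumed to be the bare `input_ids` tensor; all remaining
+        # arguments then come from this method's keyword parameters (see the docstring above).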
+ else: + input_ids = inputs + + input_shapes = shape_list(input_ids) + + seq_length = input_shapes[-1] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + + transformer_outputs = self.transformer(flat_inputs, training=training) + hidden_states = transformer_outputs[0] + + hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) + + lm_logits = self.transformer.wte(hidden_states, mode="linear") + mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) + + mc_logits = tf.squeeze(mc_logits, axis=-1) + + outputs = (lm_logits, mc_logits) + transformer_outputs[1:] + + return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions) diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py new file mode 100644 index 00000000000..7521866c246 --- /dev/null +++ b/transformers/modeling_tf_openai.py @@ -0,0 +1,576 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 OpenAI GPT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, + TFSequenceSummary, shape_list, get_initializer) +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + +TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"} + + +def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +def gelu(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. 
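+    Note:
+        This is the tanh approximation of GELU from the paper above:
+        0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))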
+ """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def swish(x): + return x * tf.math.sigmoid(x) + + +ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish)} + + +class TFAttention(tf.keras.layers.Layer): + def __init__(self, nx, n_ctx, config, scale=False, **kwargs): + super(TFAttention, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.n_ctx = n_ctx + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) + self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) + self.pruned_heads = set() + + def prune_heads(self, heads): + pass + + @staticmethod + def causal_attention_mask(nd, ns, dtype): + """1's in the lower triangle, counting from the lower right corner. + Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. + """ + i = tf.range(nd)[:,None] + j = tf.range(ns) + m = i >= j - ns + nd + return tf.cast(m, dtype) + + def _attn(self, inputs, training=False): + q, k, v, attention_mask, head_mask = inputs + # q, k, v have shape [batch, heads, sequence, features] + w = tf.matmul(q, k, transpose_b=True) + if self.scale: + dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores + w = w / tf.math.sqrt(dk) + + # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
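+        # The next few lines apply the causal mask: `b` is a lower-triangular (nd, ns) matrix of ones,
+        # e.g. [[1, 0, 0], [1, 1, 0], [1, 1, 1]] for nd = ns = 3, and masked positions receive a large
+        # negative offset (-1e4) so that they effectively vanish after the softmax below.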
+ _, _, nd, ns = shape_list(w) + b = self.causal_attention_mask(nd, ns, dtype=w.dtype) + b = tf.reshape(b, [1, 1, nd, ns]) + w = w * b - 1e4 * (1 - b) + + if attention_mask is not None: + # Apply the attention mask + w = w + attention_mask + + w = tf.nn.softmax(w, axis=-1) + w = self.attn_dropout(w, training=training) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + outputs = [tf.matmul(w, v)] + if self.output_attentions: + outputs.append(w) + return outputs + + def merge_heads(self, x): + x = tf.transpose(x, [0, 2, 1, 3]) + x_shape = shape_list(x) + new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]] + return tf.reshape(x, new_x_shape) + + def split_heads(self, x): + x_shape = shape_list(x) + new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head] + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features) + + def call(self, inputs, training=False): + x, attention_mask, head_mask = inputs + + x = self.c_attn(x) + query, key, value = tf.split(x, 3, axis=2) + query = self.split_heads(query) + key = self.split_heads(key) + value = self.split_heads(value) + + attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training) + a = attn_outputs[0] + + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a, training=training) + + outputs = [a] + attn_outputs[1:] + return outputs # a, (attentions) + + +class TFMLP(tf.keras.layers.Layer): + def __init__(self, n_state, config, **kwargs): + super(TFMLP, self).__init__(**kwargs) + nx = config.n_embd + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.act = gelu + self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) + + def call(self, x, training=False): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + h2 = self.dropout(h2, training=training) + return h2 + + +class TFBlock(tf.keras.layers.Layer): + def __init__(self, n_ctx, config, scale=False, **kwargs): + super(TFBlock, self).__init__(**kwargs) + nx = config.n_embd + self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') + self.mlp = TFMLP(4 * nx, config, name='mlp') + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') + + def call(self, inputs, training=False): + x, attention_mask, head_mask = inputs + + output_attn = self.attn([x, attention_mask, head_mask], training=training) + a = output_attn[0] # output_attn: a, (attentions) + + n = self.ln_1(x + a) + m = self.mlp(n, training=training) + h = self.ln_2(n + m) + + outputs = [h] + output_attn[1:] + return outputs # x, (attentions) + + +class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): + def __init__(self, config, *inputs, **kwargs): + super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs) + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.num_hidden_layers = config.n_layer + self.vocab_size = config.vocab_size + self.n_embd = config.n_embd + + self.tokens_embed = TFSharedEmbeddings(config.vocab_size, + config.n_embd, + initializer_range=config.initializer_range, + name='tokens_embed') + self.positions_embed = tf.keras.layers.Embedding(config.n_positions, + config.n_embd, + 
embeddings_initializer=get_initializer(config.initializer_range), + name='positions_embed') + self.drop = tf.keras.layers.Dropout(config.embd_pdrop) + self.h = [TFBlock(config.n_ctx, + config, + scale=True, + name='h_._{}'.format(i)) for i in range(config.n_layer)] + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + raise NotImplementedError + + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + assert len(inputs) <= 5, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 5, "Too many inputs." + else: + input_ids = inputs + + if position_ids is None: + position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :] + + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
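+            # For example, a padding mask row of [1, 1, 0] becomes [0.0, 0.0, -10000.0]
+            # after the cast and rescaling below.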
+
+            attention_mask = tf.cast(attention_mask, tf.float32)
+            attention_mask = (1.0 - attention_mask) * -10000.0
+        else:
+            attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        input_shape = shape_list(input_ids)
+        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+        inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
+        position_embeds = self.positions_embed(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states, training=training)
+
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]
+
+        all_attentions = []
+        all_hidden_states = ()
+        for i, block in enumerate(self.h):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+
+            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
+            hidden_states = outputs[0]
+            if self.output_attentions:
+                all_attentions.append(outputs[1])
+
+        hidden_states = tf.reshape(hidden_states, output_shape)
+        # Add last hidden state
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+            outputs = outputs + (all_attentions,)
+        return outputs  # last hidden state, (all hidden_states), (attentions)
+
+
+class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = OpenAIGPTConfig
+    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
+    base_model_prefix = "transformer"
+
+
+OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
+    `Improving Language Understanding by Generative Pre-Training`_
+    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
+    corpus with long range dependencies, the Toronto Book Corpus.
+
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+
+    .. _`Improving Language Understanding by Generative Pre-Training`:
+        https://openai.com/blog/language-unsupervised/
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional argument.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            GPT is a model with absolute position embeddings, so it's usually advised to pad the inputs on
+            the right rather than the left.
+            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+""" + +@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = TFOpenAIGPTModel.from_pretrained('openai-gpt') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + hidden_states = transformer_outputs[0] + + lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") + + outputs = (lm_logits,) + transformer_outputs[1:] + + return outputs # lm_logits, (all hidden_states), (attentions) + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. +The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). +""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): + r""" + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') + + # Add a [CLS] to the vocabulary (we should train it also!) 
+ # This option is currently not implemented in TF 2.0 + raise NotImplementedError + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary + + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices + mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1 + outputs = model(input_ids, mc_token_ids=mc_token_ids) + lm_prediction_scores, mc_prediction_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + mc_token_ids = inputs[5] if len(inputs) > 5 else mc_token_ids + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + input_shapes = shape_list(input_ids) + + seq_length = input_shapes[-1] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + + transformer_outputs = self.transformer(flat_inputs, training=training) + hidden_states = transformer_outputs[0] + + hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:]) + + lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") + mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training) + + mc_logits = tf.squeeze(mc_logits, axis=-1) + + outputs = (lm_logits, mc_logits) + transformer_outputs[1:] + + return outputs # lm logits, mc logits, (all hidden_states), (attentions) diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py new file mode 100644 index 00000000000..66caa95ec78 --- /dev/null +++ b/transformers/modeling_tf_pytorch_utils.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch - TF 2.0 general utilities.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os +import re +import numpy + +logger = logging.getLogger(__name__) + +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + +def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''): + """ Convert a TF 2.0 model variable name in a pytorch model weight name. + + Conventions for TF2.0 scopes -> PyTorch attribute names conversions: + - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + + return tuple with: + - pytorch model weight name + - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other + """ + tf_name = tf_name.replace(':0', '') # device ids + tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + tf_name = tf_name.replace('_._', '/') # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + tf_name = re.sub(r'//+', '/', tf_name) # Remove empty levels at the end + tf_name = tf_name.split('/') # Convert from TF2.0 '/' separators to PyTorch '.' separators + tf_name = tf_name[1:] # Remove level zero + + # When should we transpose the weights + transpose = bool(tf_name[-1] == 'kernel' or 'emb_projs' in tf_name or 'out_projs' in tf_name) + + # Convert standard TF2.0 names in PyTorch names + if tf_name[-1] == 'kernel' or tf_name[-1] == 'embeddings' or tf_name[-1] == 'gamma': + tf_name[-1] = 'weight' + if tf_name[-1] == 'beta': + tf_name[-1] = 'bias' + + # Remove prefix if needed + tf_name = '.'.join(tf_name) + if start_prefix_to_remove: + tf_name = tf_name.replace(start_prefix_to_remove, '', 1) + + return tf_name, transpose + + +##################### +### PyTorch => TF 2.0 + +def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): + """ Load pytorch checkpoints in a TF 2.0 model + """ + try: + import tensorflow as tf + import torch + except ImportError as e: + logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. 
Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + raise e + + pt_path = os.path.abspath(pytorch_checkpoint_path) + logger.info("Loading PyTorch weights from {}".format(pt_path)) + + pt_state_dict = torch.load(pt_path, map_location='cpu') + + return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + + +def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): + """ Load pytorch checkpoints in a TF 2.0 model + """ + pt_state_dict = pt_model.state_dict() + + return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + + +def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): + """ Load pytorch state_dict in a TF 2.0 model. + """ + try: + import torch + import tensorflow as tf + from tensorflow.python.keras import backend as K + except ImportError as e: + logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + raise e + + if tf_inputs is None: + tf_inputs = tf.constant(DUMMY_INPUTS) + + if tf_inputs is not None: + tfo = tf_model(tf_inputs, training=False) # Make sure model is built + + # Adapt state dict - TODO remove this and update the AWS weights files instead + # Convert old format to new format if needed from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in pt_state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + pt_state_dict[new_key] = pt_state_dict.pop(old_key) + + # Make sure we are able to load PyTorch base models as well as derived models (with heads) + # TF models always have a prefix, some of PyTorch models (base ones) don't + start_prefix_to_remove = '' + if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()): + start_prefix_to_remove = tf_model.base_model_prefix + '.' 
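+    # Illustrative (hypothetical variable name): a TF 2.0 weight named
+    # 'tf_gpt2_model/transformer/h_._0/attn/c_attn/kernel:0' is mapped by
+    # convert_tf_weight_name_to_pt_weight_name above to the PyTorch key
+    # 'transformer.h.0.attn.c_attn.weight' with transpose=True; the loop below
+    # uses that name to look up, transpose/reshape and assign each PyTorch tensor.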
+ + symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights + + weight_value_tuples = [] + all_pytorch_weights = set(list(pt_state_dict.keys())) + for symbolic_weight in symbolic_weights: + sw_name = symbolic_weight.name + name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove) + + # Find associated numpy array in pytorch model state dict + assert name in pt_state_dict, "{} not found in PyTorch model".format(name) + array = pt_state_dict[name].numpy() + + if transpose: + array = numpy.transpose(array) + + if len(symbolic_weight.shape) < len(array.shape): + array = numpy.squeeze(array) + elif len(symbolic_weight.shape) > len(array.shape): + array = numpy.expand_dims(array, axis=0) + + try: + assert list(symbolic_weight.shape) == list(array.shape) + except AssertionError as e: + e.args += (symbolic_weight.shape, array.shape) + raise e + + logger.info("Initialize TF weight {}".format(symbolic_weight.name)) + + weight_value_tuples.append((symbolic_weight, array)) + all_pytorch_weights.discard(name) + + K.batch_set_value(weight_value_tuples) + + if tf_inputs is not None: + tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run + + logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights)) + + return tf_model + + +##################### +### TF 2.0 => PyTorch + +def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): + """ Load TF 2.0 HDF5 checkpoint in a PyTorch model + We use HDF5 to easily do transfer learning + (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). + """ + try: + import tensorflow as tf + import torch + except ImportError as e: + logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + raise e + + import transformers + + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path)) + + # Instantiate and load the associated TF 2.0 model + tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining + tf_model_class = getattr(transformers, tf_model_class_name) + tf_model = tf_model_class(pt_model.config) + + if tf_inputs is None: + tf_inputs = tf.constant(DUMMY_INPUTS) + + if tf_inputs is not None: + tfo = tf_model(tf_inputs, training=False) # Make sure model is built + + tf_model.load_weights(tf_checkpoint_path, by_name=True) + + return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys) + +def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): + """ Load TF 2.0 model in a pytorch model + """ + weights = tf_model.weights + + return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys) + + +def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False): + """ Load TF2.0 symbolic weights in a PyTorch model + """ + try: + import tensorflow as tf + import torch + except ImportError as e: + logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. 
Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + raise e + + new_pt_params_dict = {} + current_pt_params_dict = dict(pt_model.named_parameters()) + + # Make sure we are able to load PyTorch base models as well as derived models (with heads) + # TF models always have a prefix, some of PyTorch models (base ones) don't + start_prefix_to_remove = '' + if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): + start_prefix_to_remove = pt_model.base_model_prefix + '.' + + # Build a map from potential PyTorch weight names to TF 2.0 Variables + tf_weights_map = {} + for tf_weight in tf_weights: + pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove) + tf_weights_map[pt_name] = (tf_weight.numpy(), transpose) + + all_tf_weights = set(list(tf_weights_map.keys())) + loaded_pt_weights_data_ptr = {} + for pt_weight_name, pt_weight in current_pt_params_dict.items(): + # Handle PyTorch shared weight ()not duplicated in TF 2.0 + if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: + new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()] + continue + + # Find associated numpy array in pytorch model state dict + if pt_weight_name not in tf_weights_map: + raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name)) + + array, transpose = tf_weights_map[pt_weight_name] + + if transpose: + array = numpy.transpose(array) + + if len(pt_weight.shape) < len(array.shape): + array = numpy.squeeze(array) + elif len(pt_weight.shape) > len(array.shape): + array = numpy.expand_dims(array, axis=0) + + try: + assert list(pt_weight.shape) == list(array.shape) + except AssertionError as e: + e.args += (pt_weight.shape, array.shape) + raise e + + logger.info("Initialize PyTorch weight {}".format(pt_weight_name)) + + new_pt_params_dict[pt_weight_name] = torch.from_numpy(array) + loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array) + all_tf_weights.discard(pt_weight_name) + + missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False) + + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from TF 2.0 model: {}".format( + pt_model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from TF 2.0 model not used in {}: {}".format( + pt_model.__class__.__name__, unexpected_keys)) + + logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights)) + + return pt_model diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py new file mode 100644 index 00000000000..43747133ffe --- /dev/null +++ b/transformers/modeling_tf_roberta.py @@ -0,0 +1,382 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" TF 2.0 RoBERTa model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +import numpy as np +import tensorflow as tf + +from .configuration_roberta import RobertaConfig +from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new + +logger = logging.getLogger(__name__) + +TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5", +} + +def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +class TFRobertaEmbeddings(TFBertEmbeddings): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + def __init__(self, config, **kwargs): + super(TFRobertaEmbeddings, self).__init__(config, **kwargs) + self.padding_idx = 1 + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids = inputs + + seq_length = tf.shape(input_ids)[1] + if position_ids is None: + position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :] + + return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training) + + +class TFRobertaMainLayer(TFBertMainLayer): + """ + Same as TFBertMainLayer but uses TFRobertaEmbeddings. + """ + def __init__(self, config, **kwargs): + super(TFRobertaMainLayer, self).__init__(config, **kwargs) + self.embeddings = TFRobertaEmbeddings(config, name='embeddings') + + def call(self, inputs, **kwargs): + # Check that input_ids starts with control token + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + else: + input_ids = inputs + + if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0): + logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. " + "This model requires special tokens in order to work. " + "Please specify add_special_tokens=True in your encoding.") + + return super(TFRobertaMainLayer, self).call(inputs, **kwargs) + + +class TFRobertaPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = RobertaConfig + pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + load_pt_weights = load_roberta_pt_weights_in_tf2 + base_model_prefix = "roberta" + + +ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in + `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ + by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, + Veselin Stoyanov. It is based on Google's BERT model released in 2018. 
+ + It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining + objective and training with much larger mini-batches and learning rates. + + This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained + models. + + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: + https://arxiv.org/abs/1907.11692 + + .. _`tf.keras.Model`: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model + + Note on the model inputs: + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : + + - a single Tensor with input_ids only and nothing else: `model(inputs_ids) + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associaed to the input names given in the docstring: + `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, RoBERTa input sequence should be formatted with and tokens as follows: + + (a) For sequence pairs: + + ``tokens: Is this Jacksonville ? No it is not . `` + + (b) For single sequences: + + ``tokens: the dog is hairy . `` + + Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with + the ``add_special_tokens`` parameter set to ``True``. + + RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional` need to be trained) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Optional segment token indices to indicate first and second portions of the inputs. 
+ This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it + during finetuning. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1[``. + **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class TFRobertaModel(TFRobertaPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import RobertaTokenizer, TFRobertaModel + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = TFRobertaModel.from_pretrained('roberta-base') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFRobertaModel, self).__init__(config, *inputs, **kwargs) + self.roberta = TFRobertaMainLayer(config, name='roberta') + + def call(self, inputs, **kwargs): + outputs = self.roberta(inputs, **kwargs) + return outputs + + +class TFRobertaLMHead(tf.keras.layers.Layer): + """Roberta Head for masked language modeling.""" + def __init__(self, config, input_embeddings, **kwargs): + super(TFRobertaLMHead, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name='dense') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.act = tf.keras.layers.Activation(gelu) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), + initializer='zeros', + trainable=True, + name='bias') + super(TFRobertaLMHead, self).build(input_shape) + + def call(self, features): + x = self.dense(features) + x = self.act(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x, mode="linear") + self.bias + + return x + + +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import RobertaTokenizer, TFRobertaForMaskedLM + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = TFRobertaForMaskedLM.from_pretrained('roberta-base') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + prediction_scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs) + + self.roberta = TFRobertaMainLayer(config, name="roberta") + self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") + + def call(self, inputs, **kwargs): + outputs = self.roberta(inputs, **kwargs) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + return outputs # prediction_scores, (hidden_states), (attentions) + + +class TFRobertaClassificationHead(tf.keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super(TFRobertaClassificationHead, self).__init__(config, **kwargs) + self.dense = tf.keras.layers.Dense(config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation='tanh', + name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="out_proj") + + def call(self, features, training=False): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x + + +@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + on top of the pooled output) e.g. for GLUE tasks. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
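The ``logits`` above are unnormalized scores; a minimal sketch of turning them into class probabilities (assuming a checkpoint whose classification head has already been fine-tuned, since the 'roberta-base' weights alone ship with a freshly initialized head)::

    import tensorflow as tf
    from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
    logits = model(input_ids)[0]                    # (1, config.num_labels)
    probabilities = tf.nn.softmax(logits, axis=-1)  # per-class probabilities
    predicted_class = int(tf.argmax(probabilities, axis=-1)[0])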
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        labels = tf.constant([1])[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFRobertaMainLayer(config, name="roberta")
+        self.classifier = TFRobertaClassificationHead(config, name="classifier")
+
+    def call(self, inputs, **kwargs):
+        outputs = self.roberta(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output, training=kwargs.get('training', False))
+
+        outputs = (logits,) + outputs[2:]
+
+        return outputs  # logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py
new file mode 100644
index 00000000000..df8c7e7dc94
--- /dev/null
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -0,0 +1,763 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 Transformer XL model.
+""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import json +import math +import logging +import collections +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .configuration_transfo_xl import TransfoXLConfig +from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer +from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + +TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5", +} + +def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +class TFPositionalEmbedding(tf.keras.layers.Layer): + def __init__(self, demb, **kwargs): + super(TFPositionalEmbedding, self).__init__(**kwargs) + + self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) + + def call(self, pos_seq, bsz=None): + sinusoid_inp = tf.einsum('i,j->ij', pos_seq, self.inv_freq) + pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) + + if bsz is not None: + return tf.tile(pos_emb[:, None, :], [1, bsz, 1]) + else: + return pos_emb[:, None, :] + + +class TFPositionwiseFF(tf.keras.layers.Layer): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): + super(TFPositionwiseFF, self).__init__(**kwargs) + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.layer_1 = tf.keras.layers.Dense(d_inner, + kernel_initializer=get_initializer(init_std), + activation=tf.nn.relu, + name='CoreNet_._0') + self.drop_1 = tf.keras.layers.Dropout(dropout) + self.layer_2 = tf.keras.layers.Dense(d_model, + kernel_initializer=get_initializer(init_std), + name='CoreNet_._3') + self.drop_2 = tf.keras.layers.Dropout(dropout) + + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + + self.pre_lnorm = pre_lnorm + + def call(self, inp, training=False): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.layer_norm(inp) + core_out = self.layer_1(core_out) + core_out = self.drop_1(core_out, training=training) + core_out = self.layer_2(core_out) + core_out = self.drop_2(core_out, training=training) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.layer_1(inp) + core_out = self.drop_1(core_out, training=training) + core_out = self.layer_2(core_out) + core_out = self.drop_2(core_out, training=training) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, + r_r_bias=None, r_w_bias=None, output_attentions=False, + layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): + super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs) + + 
self.output_attentions = output_attentions + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='qkv_net') + + self.drop = tf.keras.layers.Dropout(dropout) + self.dropatt = tf.keras.layers.Dropout(dropatt) + self.o_net = tf.keras.layers.Dense(d_model, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='o_net') + + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is not None and r_w_bias is not None: # Biases are shared + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + else: + self.r_r_bias = None + self.r_w_bias = None + + self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, + kernel_initializer=get_initializer(init_std), + use_bias=False, + name='r_net') + + def build(self, input_shape): + if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared + self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, + name='r_r_bias') + self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, + name='r_w_bias') + super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape) + + def _rel_shift(self, x): + x_size = shape_list(x) + + x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]]) + x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]]) + x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) + x = tf.reshape(x, x_size) + + return x + + def call(self, inputs, training=False): + w, r, attn_mask, mems, head_mask = inputs + qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1] + + if mems is not None: + cat = tf.concat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) + + klen = shape_list(w_head_k)[0] + + w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + + r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score = attn_score * self.scale + + #### compute attention probability + if attn_mask is not None: + attn_mask_t = attn_mask[:, :, None, None] + attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t + + # [qlen x klen x bsz x n_head] + attn_prob = tf.nn.softmax(attn_score, axis=1) + attn_prob = self.dropatt(attn_prob, 
training=training) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + #### compute attention vector + attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v) + + # [qlen x bsz x n_head x d_head] + attn_vec_sizes = shape_list(attn_vec) + attn_vec = tf.reshape(attn_vec, + (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out, training=training) + + if self.pre_lnorm: + ##### residual connection + outputs = [w + attn_out] + else: + ##### residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if self.output_attentions: + outputs.append(attn_prob) + + return outputs + + +class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + tgt_len=None, ext_len=None, mem_len=None, + dropatt=0., pre_lnorm=False, + r_w_bias=None, + r_r_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + init_std=0.02, + **kwargs): + super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs) + + self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, + mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, + r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, + output_attentions=output_attentions, + layer_norm_epsilon=layer_norm_epsilon, name='dec_attn') + self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=pre_lnorm, init_std=init_std, + layer_norm_epsilon=layer_norm_epsilon, + name='pos_ff') + + def call(self, inputs, training=False): + dec_inp, r, dec_attn_mask, mems, head_mask = inputs + attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, + mems, head_mask], training=training) + ff_output = self.pos_ff(attn_outputs[0], training=training) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + + +class TFAdaptiveEmbedding(tf.keras.layers.Layer): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, + sample_softmax=False, **kwargs): + super(TFAdaptiveEmbedding, self).__init__(**kwargs) + + self.n_token = n_token + self.d_embed = d_embed + self.init_std = init_std + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = [] + self.emb_projs = [] + if div_val == 1: + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, + d_emb_i, + embeddings_initializer=get_initializer(init_std), + name='emb_layers_._{}'.format(i))) + + def build(self, input_shape): + for i in range(len(self.cutoffs)): + d_emb_i = self.d_embed // (self.div_val ** i) + self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj), + initializer=get_initializer(self.init_std), + trainable=True, + name='emb_projs_._{}'.format(i))) + super(TFAdaptiveEmbedding, self).build(input_shape) + + def call(self, inp): + if self.div_val == 1: + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + else: + inp_flat = tf.reshape(inp, (-1,)) + emb_flat = tf.zeros([shape_list(inp_flat)[0], 
self.d_proj]) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + + inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i]) + + mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) + emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64)) + + embed_shape = shape_list(inp) + [self.d_proj] + embed = tf.reshape(emb_flat, embed_shape) + + embed *= self.emb_scale + + return embed + + +class TFTransfoXLMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFTransfoXLMainLayer, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.n_token = config.n_token + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + self.untie_r = config.untie_r + + self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + div_val=config.div_val, init_std=config.init_std, name='word_emb') + + self.drop = tf.keras.layers.Dropout(config.dropout) + + self.n_layer = config.n_layer + + self.tgt_len = config.tgt_len + self.mem_len = config.mem_len + self.ext_len = config.ext_len + self.max_klen = config.tgt_len + config.ext_len + config.mem_len + + self.attn_type = config.attn_type + + self.layers = [] + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + TFRelPartialLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if self.untie_r else self.r_w_bias, + r_r_bias=None if self.untie_r else self.r_r_bias, + output_attentions=self.output_attentions, + layer_norm_epsilon=config.layer_norm_epsilon, + init_std=config.init_std, + name='layers_._{}'.format(i)) + ) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb') + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + def build(self, input_shape): + if not self.untie_r: + self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, + name='r_w_bias') + self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, + name='r_r_bias') + super(TFTransfoXLMainLayer, self).build(input_shape) + + def _resize_token_embeddings(self, new_num_tokens): + return self.word_emb + + def backward_compatible(self): + self.sample_softmax = -1 + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def _prune_heads(self, heads): + raise NotImplementedError + + def init_mems(self, data): + if self.mem_len > 0: + mems = [] + for i in range(self.n_layer): + empty = tf.zeros([self.mem_len, 
shape_list(data)[1], self.d_model]) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = tf.concat([mems[i], hids[i]], axis=0) + tf.stop_gradient(cat) + new_mems.append(cat[beg_idx:end_idx]) + + return new_mems + + def call(self, inputs, mems=None, head_mask=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + mems = inputs[1] if len(inputs) > 1 else mems + head_mask = inputs[2] if len(inputs) > 2 else head_mask + assert len(inputs) <= 3, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + mems = inputs.get('mems', mems) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 3, "Too many inputs." + else: + input_ids = inputs + + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = tf.transpose(input_ids, perm=(1, 0)) + + if mems is None: + mems = self.init_mems(input_ids) + + qlen, bsz = shape_list(input_ids) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if not head_mask is None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layer + + word_emb = self.word_emb(input_ids) + + mlen = shape_list(mems[0])[0] if mems is not None else 0 + klen = mlen + qlen + + attn_mask = tf.ones([qlen, qlen]) + mask_u = tf.linalg.band_part(attn_mask, 0, -1) + mask_dia = tf.linalg.band_part(attn_mask, 0, 0) + attn_mask_pad = tf.zeros([qlen, mlen]) + dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) + if self.same_length: + mask_l = tf.linalg.band_part(attn_mask, -1, 0) + dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, + dec_attn_mask[:, qlen:]], 1) + # ::: PyTorch masking code for reference ::: + # if self.same_length: + # all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) + # mask_len = klen - self.mem_len + # if mask_len > 0: + # mask_shift_len = qlen - mask_len + # else: + # mask_shift_len = qlen + # dec_attn_mask = (torch.triu(all_ones, 1+mlen) + # + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + # else: + # dec_attn_mask = torch.triu( + # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + + hids = [] + attentions = [] + if self.attn_type == 0: # default + pos_seq = tf.range(klen-1, -1, -1.0) + if self.clamp_len > 0: + pos_seq = tf.minimum(pos_seq, self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb, training=training) + pos_emb = self.drop(pos_emb, training=training) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + 
mems_i = None if mems is None else mems[i]
+                layer_outputs = layer([core_out, pos_emb, dec_attn_mask,
+                                       mems_i, head_mask[i]], training=training)
+                core_out = layer_outputs[0]
+                if self.output_attentions:
+                    attentions.append(layer_outputs[1])
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+        core_out = self.drop(core_out, training=training)
+
+        new_mems = self._update_mems(hids, mems, mlen, qlen)
+
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
+        if self.output_hidden_states:
+            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
+            hids.append(core_out)
+            hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
+            outputs.append(hids)
+        if self.output_attentions:
+            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+            attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
+            outputs.append(attentions)
+        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
+
+
+class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = TransfoXLConfig
+    pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_transfo_xl_pt_weights_in_tf2
+    base_model_prefix = "transformer"
+
+
+TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
+    `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
+    by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+    It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
+    previously computed hidden-states to attend to longer context (memory).
+    This model also uses adaptive softmax inputs and outputs (tied).
+
+    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
+
+    .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
+        https://arxiv.org/abs/1901.02860
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+        - having all inputs as keyword arguments (like PyTorch models), or
+        - having all inputs as a list, tuple or dict in the first positional argument.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+          `model([input_ids, mems])` or `model([input_ids, mems, head_mask])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+          `model({'input_ids': input_ids, 'mems': mems})`
+
+    Parameters:
+        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
+            the right or on the left.
+            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mems**: (`optional`)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer)
+            containing pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **mems**:
+            list of ``tf.Tensor`` (one for each layer)
+            containing pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
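Because ``mems`` can be fed back into the model, a minimal sketch of reusing them across two consecutive chunks (using the list input format described above; variable names are illustrative)::

    import tensorflow as tf
    from transformers import TransfoXLTokenizer, TFTransfoXLModel

    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')

    first_ids = tf.constant(tokenizer.encode("Hello, my dog"))[None, :]  # Batch size 1
    second_ids = tf.constant(tokenizer.encode("is cute"))[None, :]

    hidden_first, mems = model(first_ids)[:2]
    # Pass the cached hidden-states along with the next chunk: [input_ids, mems]
    hidden_second, mems = model([second_ids, mems])[:2]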
+ + Examples:: + + import tensorflow as tf + from transformers import TransfoXLTokenizer, TFTransfoXLModel + + tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') + model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states, mems = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFTransfoXLMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top + (adaptive softmax with weights tied to the adaptive input embeddings)""", + TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) +class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + We don't output them when the loss is computed to speedup adaptive softmax decoding. + **mems**: + list of ``tf.Tensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
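A minimal greedy next-token sketch built on ``prediction_scores`` (taking the argmax over the vocabulary dimension at the last position; variable names are illustrative)::

    import tensorflow as tf
    from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
    prediction_scores, mems = model(input_ids)[:2]

    next_token_id = int(tf.argmax(prediction_scores[0, -1, :]))
    print(tokenizer.decode([next_token_id]))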
+ + Examples:: + + import tensorflow as tf + from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel + + tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') + model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores, mems = outputs[:2] + + """ + def __init__(self, config): + super(TFTransfoXLLMHeadModel, self).__init__(config) + self.transformer = TFTransfoXLMainLayer(config, name='transformer') + self.sample_softmax = config.sample_softmax + # use sampled softmax + if config.sample_softmax > 0: + raise NotImplementedError + # use adaptive softmax (including standard softmax) + else: + self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, + config.cutoffs, div_val=config.div_val, name='crit') + + def reset_length(self, tgt_len, ext_len, mem_len): + self.transformer.reset_length(tgt_len, ext_len, mem_len) + + def init_mems(self, data): + return self.transformer.init_mems(data) + + def call(self, inputs, mems=None, head_mask=None, labels=None, training=False): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + mems = inputs[1] if len(inputs) > 1 else mems + head_mask = inputs[2] if len(inputs) > 2 else head_mask + labels = inputs[3] if len(inputs) > 3 else labels + assert len(inputs) <= 4, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + mems = inputs.get('mems', mems) + head_mask = inputs.get('head_mask', head_mask) + labels = inputs.get('labels', labels) + assert len(inputs) <= 4, "Too many inputs." + else: + input_ids = inputs + + bsz, tgt_len = shape_list(input_ids)[:2] + + transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training) + + last_hidden = transformer_outputs[0] + pred_hid = last_hidden[:, -tgt_len:] + outputs = transformer_outputs[1:] + if self.sample_softmax > 0 and training: + raise NotImplementedError + else: + # pred_hid = tf.reshape(pred_hid, (-1, shape_list(pred_hid)[-1])) + softmax_output = self.crit([pred_hid, labels], training=training) + # softmax_output = tf.reshape(softmax_output, (bsz, tgt_len, -1)) + outputs = [softmax_output] + outputs + + return outputs # logits, new_mems, (all hidden states), (all attentions) diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py new file mode 100644 index 00000000000..d7666a650e1 --- /dev/null +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -0,0 +1,175 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" A TF 2.0 Adaptive Softmax for Transformer XL model. 
+""" + +from collections import defaultdict + +import numpy as np + +import tensorflow as tf + +from .modeling_tf_utils import shape_list + +class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False, **kwargs): + super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + self.keep_order = keep_order + + self.out_layers = [] + self.out_projs = [] + + def build(self, input_shape): + if self.n_clusters > 0: + self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed), + initializer='zeros', + trainable=True, + name='cluster_weight') + self.cluster_bias = self.add_weight(shape=(self.n_clusters,), + initializer='zeros', + trainable=True, + name='cluster_bias') + + if self.div_val == 1: + for i in range(len(self.cutoffs)): + if self.d_proj != self.d_embed: + weight = self.add_weight(shape=(self.d_embed, self.d_proj), + initializer='zeros', + trainable=True, + name='out_projs_._{}'.format(i)) + self.out_projs.append(weight) + else: + self.out_projs.append(None) + weight = self.add_weight(shape=(self.n_token, self.d_embed,), + initializer='zeros', + trainable=True, + name='out_layers_._{}_._weight'.format(i)) + bias = self.add_weight(shape=(self.n_token,), + initializer='zeros', + trainable=True, + name='out_layers_._{}_._bias'.format(i)) + self.out_layers.append((weight, bias)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = self.d_embed // (self.div_val ** i) + + weight = self.add_weight(shape=(d_emb_i, self.d_proj), + initializer='zeros', + trainable=True, + name='out_projs_._{}'.format(i)) + self.out_projs.append(weight) + weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,), + initializer='zeros', + trainable=True, + name='out_layers_._{}_._weight'.format(i)) + bias = self.add_weight(shape=(r_idx-l_idx,), + initializer='zeros', + trainable=True, + name='out_layers_._{}_._bias'.format(i)) + self.out_layers.append((weight, bias)) + super(TFAdaptiveSoftmaxMask, self).build(input_shape) + + @staticmethod + def _logit(x, W, b, proj=None): + y = x + if proj is not None: + y = tf.einsum('ibd,ed->ibe', y, proj) + return tf.einsum('ibd,nd->ibn', y, W) + b + + @staticmethod + def _gather_logprob(logprob, target): + lp_size = tf.shape(logprob) + r = tf.range(lp_size[0]) + idx = tf.stack([r, target], 1) + return tf.gather_nd(logprob, idx) + + def call(self, inputs, return_mean=True, training=False): + hidden, target = inputs + head_logprob = 0 + if self.n_clusters == 0: + softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer()) + output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) + if target is not None: + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) + out = tf.nn.log_softmax(output, axis=-1) + else: + hidden_sizes = shape_list(hidden) + out = [] + loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + if target is not None: + mask = (target >= l_idx) & (target < r_idx) + mask_idx = tf.where(mask) + cur_target = 
tf.boolean_mask(target, mask) - l_idx + + if self.div_val == 1: + cur_W = self.out_layers[0][0][l_idx:r_idx] + cur_b = self.out_layers[0][1][l_idx:r_idx] + else: + cur_W = self.out_layers[i][0] + cur_b = self.out_layers[i][1] + + if i == 0: + cur_W = tf.concat([cur_W, self.cluster_weight], 0) + cur_b = tf.concat([cur_b, self.cluster_bias], 0) + + head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) + head_logprob = tf.nn.log_softmax(head_logit) + out.append(head_logprob[..., :self.cutoffs[0]]) + if target is not None: + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) + else: + tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) + tail_logprob = tf.nn.log_softmax(tail_logit) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob + out.append(logprob_i) + if target is not None: + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) + cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) + cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] + if target is not None: + loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64)) + out = tf.concat(out, axis=-1) + + if target is not None: + if return_mean: + loss = tf.reduce_mean(loss) + # Add the training-time loss value to the layer using `self.add_loss()`. + self.add_loss(loss) + + # Log the loss as a metric (we could log arbitrary metrics, + # including different metrics for training and inference. + self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '') + + return out diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py new file mode 100644 index 00000000000..06a333af37c --- /dev/null +++ b/transformers/modeling_tf_utils.py @@ -0,0 +1,485 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF general model utils.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os + +import tensorflow as tf + +from .configuration_utils import PretrainedConfig +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME + +logger = logging.getLogger(__name__) + + +class TFPreTrainedModel(tf.keras.Model): + r""" Base class for all TF models. + + :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. 
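A minimal save-and-reload sketch using the ``save_pretrained`` / ``from_pretrained`` methods defined below (the directory path is illustrative and must exist before saving)::

    import os
    from transformers import TFRobertaModel

    model = TFRobertaModel.from_pretrained('roberta-base')

    save_directory = './my_tf_model/'  # illustrative path
    if not os.path.isdir(save_directory):
        os.makedirs(save_directory)
    model.save_pretrained(save_directory)  # writes config.json and the TF 2.0 weights file

    reloaded = TFRobertaModel.from_pretrained(save_directory)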
+ + Class attributes (overridden by derived classes): + - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. + - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. + - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: + + - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, + - ``path``: a path (string) to the TensorFlow checkpoint. + + - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. + """ + config_class = None + pretrained_model_archive_map = {} + load_pt_weights = lambda model, config, path: None + base_model_prefix = "" + + def __init__(self, config, *inputs, **kwargs): + super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) + if not isinstance(config, PretrainedConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + # Save config in model + self.config = config + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Variable from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (`optional`) int + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. + Return: ``tf.Variable`` + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + # if new_num_tokens is None: + # return old_embeddings + + # old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + # if old_num_tokens == new_num_tokens: + # return old_embeddings + + # # Build new embeddings + # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + # new_embeddings.to(old_embeddings.weight.device) + + # # initialize all new embeddings (in particular added tokens) + # self._init_weights(new_embeddings) + + # # Copy word embeddings from the previous weights + # num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + # return new_embeddings + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. + Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + + new_num_tokens: (`optional`) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. 
+ + Return: ``tf.Variable`` + Pointer to the input tokens Embeddings Module of the model + """ + raise NotImplementedError + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the base model. + + Arguments: + + heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). + """ + raise NotImplementedError + + def save_pretrained(self, save_directory): + """ Save a model and its configuration file to a directory, so that it + can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. + """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + + # Save configuration file + self.config.save_pretrained(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME) + self.save_weights(output_model_file) + logger.info("Model weights saved in {}".format(output_model_file)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. + + The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with ``model.train()`` + + The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. + It is up to you to train those weights with a downstream fine-tuning task. + + The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + from_pt: (`optional`) boolean, default False: + Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). 
+ + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) + + """ + config = kwargs.pop('config', None) + cache_dir = kwargs.pop('cache_dir', None) + from_pt = kwargs.pop('from_pt', False) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + + # Load config + if config is None: + config, model_kwargs = cls.config_class.from_pretrained( + pretrained_model_name_or_path, *model_args, + cache_dir=cache_dir, return_unused_kwargs=True, + force_download=force_download, + **kwargs + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME], + 
pretrained_model_name_or_path)) + elif os.path.isfile(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + else: + raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path)) + + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_model_archive_map.keys()), + archive_file)) + raise e + if resolved_archive_file == archive_file: + logger.info("loading weights file {}".format(archive_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + else: + resolved_archive_file = None + + # Instantiate model. + model = cls(config, *model_args, **model_kwargs) + + if from_pt: + # Load from a PyTorch checkpoint + return cls.load_pt_weights(model, resolved_archive_file) + + inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + ret = model(inputs, training=False) # build the network with dummy inputs + + assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) + # 'by_name' allow us to do transfer learning by skipping/adding layers + # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 + model.load_weights(resolved_archive_file, by_name=True) + + ret = model(inputs, training=False) # Make sure restore ops are run + + return model + +class TFConv1D(tf.keras.layers.Layer): + def __init__(self, nf, nx, initializer_range=0.02, **kwargs): + """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) + Basically works like a Linear layer but the weights are transposed + """ + super(TFConv1D, self).__init__(**kwargs) + self.nf = nf + self.nx = nx + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + "weight", + shape=[self.nx, self.nf], + initializer=get_initializer(self.initializer_range)) + self.bias = self.add_weight( + "bias", + shape=[1, self.nf], + initializer=tf.zeros_initializer()) + + def call(self, x): + bz, sl = shape_list(x)[:2] + + x = tf.reshape(x, [-1, self.nx]) + x = tf.matmul(x, self.weight) + self.bias + + x = tf.reshape(x, [bz, sl, self.nf]) + + return x + + +class TFSharedEmbeddings(tf.keras.layers.Layer): + """Construct shared token embeddings. 
+ """ + def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): + super(TFSharedEmbeddings, self).__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + + def build(self, input_shape): + """Build shared word embedding layer + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + self.weight = self.add_weight( + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range)) + super(TFSharedEmbeddings, self).build(input_shape) + + def call(self, inputs, mode="embedding"): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, input_ids): + """Applies embedding based on inputs tensor.""" + return tf.gather(self.weight, input_ids) + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [..., hidden_size] + Returns: + float32 tensor with shape [..., vocab_size]. + """ + first_dims = shape_list(inputs)[:-1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.weight, transpose_b=True) + + return tf.reshape(logits, first_dims + [self.vocab_size]) + + +class TFSequenceSummary(tf.keras.layers.Layer): + r""" Compute a single vector summary of a sequence hidden states according to various possibilities: + Args of the config class: + summary_type: + - 'last' => [default] take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj: Add a projection after the vector extraction + summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. + summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default + summary_first_dropout: Add a dropout before the projection and activation + summary_last_dropout: Add a dropout after the projection and activation + """ + def __init__(self, config, initializer_range=0.02, **kwargs): + super(TFSequenceSummary, self).__init__(**kwargs) + + self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last' + if self.summary_type == 'attn': + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. 
https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = None
+        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = tf.keras.layers.Dense(num_classes,
+                                                 kernel_initializer=get_initializer(initializer_range),
+                                                 name='summary')
+
+        self.activation = None
+        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+            self.activation = tf.keras.activations.tanh
+
+        self.first_dropout = None
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = None
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
+
+    def call(self, inputs, training=False):
+        """ hidden_states: float Tensor of shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
+            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
+                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
+                If summary_type == 'cls_index' and cls_index is None,
+                we take the last token of the sequence as the classification token.
+        """
+        if not isinstance(inputs, (dict, tuple, list)):
+            hidden_states = inputs
+            cls_index = None
+        elif isinstance(inputs, (tuple, list)):
+            hidden_states = inputs[0]
+            cls_index = inputs[1] if len(inputs) > 1 else None
+            assert len(inputs) <= 2, "Too many inputs."
+        else:
+            hidden_states = inputs.get('hidden_states')
+            cls_index = inputs.get('cls_index', None)
+
+        if self.summary_type == 'last':
+            output = hidden_states[:, -1]
+        elif self.summary_type == 'first':
+            output = hidden_states[:, 0]
+        elif self.summary_type == 'mean':
+            output = tf.reduce_mean(hidden_states, axis=1)
+        elif self.summary_type == 'cls_index':
+            hidden_shape = shape_list(hidden_states)  # e.g.
[batch, num choices, seq length, hidden dims] + if cls_index is None: + cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_shape = shape_list(cls_index) + if len(cls_shape) <= len(hidden_shape) - 2: + cls_index = cls_index[..., tf.newaxis] + # else: + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) + output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == 'attn': + raise NotImplementedError + + if training and self.first_dropout is not None: + output = self.first_dropout(output) + + if self.summary is not None: + output = self.summary(output) + + if self.activation is not None: + output = self.activation(output) + + if training and self.last_dropout is not None: + output = self.last_dropout(output) + + return output + +def shape_list(x): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + +def get_initializer(initializer_range=0.02): + """Creates a `tf.initializers.truncated_normal` with the given range. + Args: + initializer_range: float, initializer range for stddev. + Returns: + TruncatedNormal initializer with stddev = `initializer_range`. + """ + return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py new file mode 100644 index 00000000000..f8f199bbe66 --- /dev/null +++ b/transformers/modeling_tf_xlm.py @@ -0,0 +1,746 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 XLM model. 
+""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import math +import os + +import itertools +import numpy as np +import tensorflow as tf + +from .configuration_xlm import XLMConfig +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + +logger = logging.getLogger(__name__) + +TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", +} + + +def load_xlm_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + # build the network + inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + if tf_model.config.use_lang_emb and tf_model.config.n_langs > 1: + langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + else: + langs_list = None + tf_inputs = [inputs_list, attns_list, langs_list] + tfo = tf_model(tf_inputs, training=False) + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) + + +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initialy created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + + +def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): + """ + Generate hidden states mask, and optionally an attention mask. 
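+    `mask` has shape (bs, slen) and marks the non-padded positions of each sequence; when
+    `causal` is True, `attn_mask` has shape (bs, slen, slen) and additionally restricts each
+    position to attend only to itself and earlier positions, otherwise `attn_mask` is `mask`.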
+ """ + bs = shape_list(lengths)[0] + if padding_mask is not None: + mask = padding_mask + else: + # assert lengths.max().item() <= slen + alen = tf.range(slen) + mask = tf.math.less(alen, lengths[:, tf.newaxis]) + + # attention mask is the same as mask, or triangular inferior attention (causal) + if causal: + attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), + alen[tf.newaxis, :, tf.newaxis]) + else: + attn_mask = mask + + # sanity check + assert shape_list(mask) == [bs, slen] + assert causal is False or shape_list(attn_mask) == [bs, slen, slen] + + mask = tf.cast(mask, dtype=dtype) + attn_mask = tf.cast(attn_mask, dtype=dtype) + + return mask, attn_mask + + +class TFMultiHeadAttention(tf.keras.layers.Layer): + + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config, **kwargs): + super(TFMultiHeadAttention, self).__init__(**kwargs) + self.layer_id = next(TFMultiHeadAttention.NEW_ID) + self.output_attentions = config.output_attentions + self.dim = dim + self.n_heads = n_heads + assert self.dim % self.n_heads == 0 + + self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin') + self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin') + self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin') + self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin') + self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.pruned_heads = set() + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). 
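+        `inputs` is the list (input, mask, kv, cache, head_mask): `kv` is None for
+        self-attention, `cache` is an optional dict keyed by layer_id holding previously
+        computed key/value tensors for incremental decoding, and `head_mask` can be used
+        to zero out individual attention heads.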
+ """ + input, mask, kv, cache, head_mask = inputs + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = shape_list(input) + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = shape_list(kv)[1] + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) + + def shape(x): + """ projection """ + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """ compute context """ + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + scores = scores - 1e30 * (1.0 - mask) + + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + outputs = (self.out_lin(context),) + if self.output_attentions: + outputs = outputs + (weights,) + return outputs + + +class TFTransformerFFN(tf.keras.layers.Layer): + + def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): + super(TFTransformerFFN, self).__init__(**kwargs) + self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1') + self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2') + self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def call(self, input, training=False): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + return x + + +class TFXLMMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLMMainLayer, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently XLM can only be used as an encoder") + # self.with_output = with_output + 
self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + + # embeddings + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) + + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name='position_embeddings') + if config.sinusoidal_embeddings: + raise NotImplementedError + # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name='lang_embeddings') + self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index) + self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb') + + # transformer layers + self.attentions = [] + self.layer_norm1 = [] + self.ffns = [] + self.layer_norm2 = [] + # if self.is_decoder: + # self.layer_norm15 = [] + # self.encoder_attn = [] + + for i in range(self.n_layers): + self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i))) + self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i))) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i))) + self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i))) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None, + position_ids=None, lengths=None, cache=None, head_mask=None, + training=False): # removed: src_enc=None, src_len=None + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + langs = inputs[2] if len(inputs) > 2 else langs + token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids + position_ids = inputs[4] if len(inputs) > 4 else position_ids + lengths = inputs[5] if len(inputs) > 5 else lengths + cache = inputs[6] if len(inputs) > 6 else cache + head_mask = inputs[7] if len(inputs) > 7 else head_mask + assert len(inputs) <= 8, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + langs = inputs.get('langs', langs) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + lengths = inputs.get('lengths', lengths) + cache = inputs.get('cache', cache) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 8, "Too many inputs." + else: + input_ids = inputs + + if lengths is None: + lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) + # mask = input_ids != self.pad_index + + # check inputs + bs, slen = shape_list(input_ids) + assert shape_list(lengths)[0] == bs + # assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # position_ids + if position_ids is None: + position_ids = tf.expand_dims(tf.range(slen), axis=0) + else: + assert shape_list(position_ids) == [bs, slen] # (slen, bs) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + assert shape_list(langs) == [bs, slen] # (slen, bs) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layers + + # do not recompute cached elements + if cache is not None: + _slen = slen - cache['slen'] + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + tensor = self.embeddings(input_ids) + tensor = tensor + self.position_embeddings(position_ids) + if langs is not None and self.use_lang_emb: + tensor = tensor + self.lang_embeddings(langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + tensor = self.layer_norm_emb(tensor) + tensor = self.dropout(tensor, training=training) + tensor = 
tensor * mask[..., tf.newaxis]
+
+        # transformer layers
+        hidden_states = ()
+        attentions = ()
+        for i in range(self.n_layers):
+            if self.output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
+            attn = attn_outputs[0]
+            if self.output_attentions:
+                attentions = attentions + (attn_outputs[1],)
+            attn = self.dropout(attn, training=training)
+            tensor = tensor + attn
+            tensor = self.layer_norm1[i](tensor)
+
+            # encoder attention (for decoder only)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            tensor = tensor + self.ffns[i](tensor)
+            tensor = self.layer_norm2[i](tensor)
+            tensor = tensor * mask[..., tf.newaxis]
+
+        # Add last hidden state
+        if self.output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        # update cache length
+        if cache is not None:
+            cache['slen'] += shape_list(tensor)[1]
+
+        # move back sequence length to dimension 0
+        # tensor = tensor.transpose(0, 1)
+
+        outputs = (tensor,)
+        if self.output_hidden_states:
+            outputs = outputs + (hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (attentions,)
+        return outputs  # outputs, (hidden_states), (attentions)
+
+
+class TFXLMPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = XLMConfig
+    pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_xlm_pt_weights_in_tf2
+    base_model_prefix = "transformer"
+
+
+XLM_START_DOCSTRING = r"""    The XLM model was proposed in
+    `Cross-lingual Language Model Pretraining`_
+    by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
+
+        - a causal language modeling (CLM) objective (next token prediction),
+        - a masked language modeling (MLM) objective (BERT-like), or
+        - a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs).
+
+    Original code can be found `here`_.
+
+    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
+
+    .. _`Cross-lingual Language Model Pretraining`:
+        https://arxiv.org/abs/1901.07291
+
+    .. _`here`:
+        https://github.com/facebookresearch/XLM
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional argument.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLM_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+
+            XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            Indices can be obtained using :class:`transformers.XLMTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **langs**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens to be used to indicate the language of each token in the input.
+            Indices are language ids which can be obtained from the language names by using two conversion mappings
+            provided in the configuration of the model (only provided for multilingual models).
+            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
+            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence token in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **lengths**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size,)``:
+            Length of each sentence that can be used to avoid performing attention on padding token indices.
+            You can also use `attention_mask` for the same result (see above), kept here for compatibility.
+            Indices selected in ``[0, ..., input_ids.size(-1)]``.
+        **cache**:
+            dictionary with ``Numpy array`` or ``tf.Tensor`` that contains pre-computed
+            hidden-states (key and values in the attention blocks) as computed by the model
+            (see `cache` output below).
Can be used to speed up sequential decoding. + The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. + **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMModel(TFXLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMModel + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMModel.from_pretrained('xlm-mlm-en-2048') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + + +class TFXLMPredLayer(tf.keras.layers.Layer): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + def __init__(self, config, input_embeddings, **kwargs): + super(TFXLMPredLayer, self).__init__(**kwargs) + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + if config.asm is False: + self.input_embeddings = input_embeddings + else: + raise NotImplementedError + # self.proj = nn.AdaptiveLogSoftmaxWithLoss( + # in_features=dim, + # n_classes=config.n_words, + # cutoffs=config.asm_cutoffs, + # div_value=config.asm_div_value, + # head_bias=True, # default is False + # ) + + def build(self, input_shape): + # The output weights are the same as the input embeddings, but there is an output-only bias for each token. 
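+        # The tied projection itself happens in `call` below, where the shared embedding
+        # layer is applied in "linear" mode; only this per-token bias is a new variable.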
+ self.bias = self.add_weight(shape=(self.n_words,), + initializer='zeros', + trainable=True, + name='bias') + super(TFXLMPredLayer, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings("""The XLM Model transformer with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMWithLMHeadModel + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj') + + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + + output = transformer_outputs[0] + outputs = self.pred_layer(output) + outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs + + +@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMForSequenceClassification(TFXLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMForSequenceClassification + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + labels = tf.constant([1])[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXLMMainLayer(config, name='transformer') + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + output = transformer_outputs[0] + + logits = self.sequence_summary(output) + + outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + return outputs + + +@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.init_std), + name='qa_outputs') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py new file mode 100644 index 00000000000..9370bd0915a --- /dev/null +++ b/transformers/modeling_tf_xlnet.py @@ -0,0 +1,1089 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 XLNet model. +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .configuration_xlnet import XLNetConfig +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer +from .file_utils import add_start_docstrings +from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + + +logger = logging.getLogger(__name__) + +TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5", +} + + +def load_xlnet_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path): + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) # build the network + return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs) + + +def gelu(x): + """ Implementation of the gelu activation function. 
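+        Computes the tanh approximation:
+            gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))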
+ XLNet is using OpenAI GPT's gelu + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def swish(x): + return x * tf.sigmoid(x) + + +ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish)} + + +class TFXLNetRelativeAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLNetRelativeAttention, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + + if config.d_model % config.n_head != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.d_model, config.n_head)) + + self.n_head = config.n_head + self.d_head = config.d_head + self.d_model = config.d_model + self.scale = 1 / (config.d_head ** 0.5) + self.initializer_range = config.initializer_range + + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def build(self, input_shape): + initializer = get_initializer(self.initializer_range) + self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='q') + self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='k') + self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='v') + self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='o') + self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='r') + self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, name='r_r_bias') + self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, name='r_s_bias') + self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), + initializer='zeros', + trainable=True, name='r_w_bias') + self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head), + initializer=initializer, + trainable=True, name='seg_embed') + super(TFXLNetRelativeAttention, self).build(input_shape) + + def prune_heads(self, heads): + raise NotImplementedError + + @staticmethod + def rel_shift(x, klen=-1): + """perform relative shift to form the relative attention score.""" + x_size = shape_list(x) + + x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3])) + x = x[1:, ...] 
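+        # reshaping with the first two dims swapped, dropping the first row and reshaping
+        # back shifts each query row by a different offset, so the position-based scores
+        # end up aligned with the correct relative distances (Transformer-XL style shift)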
+ x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3])) + x = x[:, 0:klen, :, :] + # x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) + + return x + + def rel_attn_core(self, inputs, training=False): + """Core relative positional attention operations.""" + + q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs + + # content based attention score + ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h) + + # position based attention score + bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r) + bd = self.rel_shift(bd, klen=ac.shape[1]) + + # segment based attention score + if seg_mat is None: + ef = 0 + else: + ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) + ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef) + + # merge attention scores and perform masking + attn_score = (ac + bd + ef) * self.scale + if attn_mask is not None: + # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask + if attn_mask.dtype == tf.float16: + attn_score = attn_score - 65500 * attn_mask + else: + attn_score = attn_score - 1e30 * attn_mask + + # attention probability + attn_prob = tf.nn.softmax(attn_score, axis=1) + + attn_prob = self.dropout(attn_prob, training=training) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + # attention output + attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) + + if self.output_attentions: + return attn_vec, attn_prob + + return attn_vec + + def post_attention(self, inputs, residual=True, training=False): + """Post-attention processing.""" + # post-attention projection (back to `d_model`) + h, attn_vec = inputs + + attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o) + + attn_out = self.dropout(attn_out, training=training) + + if residual: + attn_out = attn_out + h + output = self.layer_norm(attn_out) + + return output + + def call(self, inputs, training=False): + (h, g, attn_mask_h, attn_mask_g, + r, seg_mat, mems, target_mapping, head_mask) = inputs + + if g is not None: + ###### Two-stream attention with relative positional encoding. 
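+            # h is the content stream (sees the actual token embeddings) and g is the
+            # query stream (sees only position information for the tokens being predicted);
+            # both streams reuse the key/value heads computed from the content stream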
+ # content based attention score + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], axis=0) + else: + cat = h + + # content-based key head + k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) + + # content-based value head + v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + + # position-based key head + k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + + ##### h-stream + # content-stream query head + q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) + + # core attention ops + attn_vec_h = self.rel_attn_core( + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], + training=training) + + if self.output_attentions: + attn_vec_h, attn_prob_h = attn_vec_h + + # post processing + output_h = self.post_attention([h, attn_vec_h], training=training) + + ##### g-stream + # query-stream query head + q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q) + + # core attention ops + if target_mapping is not None: + q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + attn_vec_g = self.rel_attn_core( + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], + training=training) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + else: + attn_vec_g = self.rel_attn_core( + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], + training=training) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + # post processing + output_g = self.post_attention([g, attn_vec_g], training=training) + + if self.output_attentions: + attn_prob = attn_prob_h, attn_prob_g + + else: + ###### Multi-head attention with relative positional encoding + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], axis=0) + else: + cat = h + + # content heads + q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) + k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) + v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + + # positional heads + k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + + # core attention ops + attn_vec = self.rel_attn_core( + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], + training=training) + + if self.output_attentions: + attn_vec, attn_prob = attn_vec + + # post processing + output_h = self.post_attention([h, attn_vec], training=training) + output_g = None + + outputs = (output_h, output_g) + if self.output_attentions: + outputs = outputs + (attn_prob,) + return outputs + +class TFXLNetFeedForward(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLNetFeedForward, self).__init__(**kwargs) + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.layer_1 = tf.keras.layers.Dense(config.d_inner, + kernel_initializer=get_initializer(config.initializer_range), + name='layer_1') + self.layer_2 = tf.keras.layers.Dense(config.d_model, + kernel_initializer=get_initializer(config.initializer_range), + name='layer_2') + self.dropout = tf.keras.layers.Dropout(config.dropout) + if isinstance(config.ff_activation, str) or \ + (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + self.activation_function = ACT2FN[config.ff_activation] + else: + self.activation_function = config.ff_activation + + def call(self, inp, training=False): + output = inp + output = self.layer_1(output) + output = self.activation_function(output) + output = self.dropout(output, training=training) + 
output = self.layer_2(output) + output = self.dropout(output, training=training) + output = self.layer_norm(output + inp) + return output + +class TFXLNetLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLNetLayer, self).__init__(**kwargs) + self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn') + self.ff = TFXLNetFeedForward(config, name='ff') + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def call(self, inputs, training=False): + outputs = self.rel_attn(inputs, training=training) + output_h, output_g = outputs[:2] + + if output_g is not None: + output_g = self.ff(output_g, training=training) + output_h = self.ff(output_h, training=training) + + outputs = (output_h, output_g) + outputs[2:] # Add again attentions if there are there + return outputs + + +class TFXLNetLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super(TFXLNetLMHead, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), + initializer='zeros', + trainable=True, + name='bias') + super(TFXLNetLMHead, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +class TFXLNetMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLNetMainLayer, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.mem_len = config.mem_len + self.reuse_len = config.reuse_len + self.d_model = config.d_model + self.same_length = config.same_length + self.attn_type = config.attn_type + self.bi_data = config.bi_data + self.clamp_len = config.clamp_len + self.n_layer = config.n_layer + self.use_bfloat16 = config.use_bfloat16 + self.initializer_range = config.initializer_range + + self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding') + self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)] + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def build(self, input_shape): + initializer = get_initializer(self.initializer_range) + self.mask_emb = self.add_weight(shape=(1, 1, self.d_model), + initializer=initializer, + trainable=True, name='mask_emb') + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + def create_mask(self, qlen, mlen, dtype=tf.float32): + """ + Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. 
+
+        Args:
+            qlen: length of the current segment (number of query tokens).
+            mlen: length of the cached memory states prepended to the current segment.
+
+        ::
+
+                  same_length=False:      same_length=True:
+                  <      qlen       >     <      qlen       >
+               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
+                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
+            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
+                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
+               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
+
+        """
+        attn_mask = tf.ones([qlen, qlen], dtype=dtype)
+        mask_u = tf.linalg.band_part(attn_mask, 0, -1)
+        mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
+        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
+        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
+        if self.same_length:
+            mask_l = tf.linalg.band_part(attn_mask, -1, 0)
+            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
+        return ret
+
+    def cache_mem(self, curr_out, prev_mem):
+        """cache hidden states into memory."""
+        if self.mem_len is None or self.mem_len == 0:
+            return None
+        else:
+            if self.reuse_len is not None and self.reuse_len > 0:
+                curr_out = curr_out[:self.reuse_len]
+
+            if prev_mem is None:
+                new_mem = curr_out[-self.mem_len:]
+            else:
+                new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:]
+
+        return tf.stop_gradient(new_mem)
+
+    @staticmethod
+    def positional_embedding(pos_seq, inv_freq, bsz=None):
+        sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq)
+        pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
+        pos_emb = pos_emb[:, None, :]
+
+        if bsz is not None:
+            pos_emb = tf.tile(pos_emb, [1, bsz, 1])
+
+        return pos_emb
+
+    def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None):
+        """create relative positional encoding."""
+        freq_seq = tf.range(0, self.d_model, 2.0)
+        if dtype is not None and dtype != tf.float32:
+            freq_seq = tf.cast(freq_seq, dtype=dtype)
+        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
+
+        if self.attn_type == 'bi':
+            # beg, end = klen - 1, -qlen
+            beg, end = klen, -qlen
+        elif self.attn_type == 'uni':
+            # beg, end = klen - 1, -1
+            beg, end = klen, -1
+        else:
+            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+
+        if self.bi_data:
+            fwd_pos_seq = tf.range(beg, end, -1.0)
+            bwd_pos_seq = tf.range(-beg, -end, 1.0)
+
+            if dtype is not None and dtype != tf.float32:
+                fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
+                bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype)
+
+            if self.clamp_len > 0:
+                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
+                bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
+
+            if bsz is not None:
+                # With bi_data, the batch size should be divisible by 2.
+                assert bsz % 2 == 0
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+            else:
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
+
+            pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
+        else:
+            fwd_pos_seq = tf.range(beg, end, -1.0)
+            if dtype is not None and dtype != tf.float32:
+                fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
+            if self.clamp_len > 0:
+                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
+            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
+
+        return pos_emb
+
+    def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+             token_type_ids=None, input_mask=None, head_mask=None, training=False):
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            mems = inputs[2] if len(inputs) > 2 else mems
+            perm_mask = inputs[3] if len(inputs) > 3 else perm_mask
+            target_mapping = inputs[4] if len(inputs) > 4 else target_mapping
+            token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
+            input_mask = inputs[6] if len(inputs) > 6 else input_mask
+            head_mask = inputs[7] if len(inputs) > 7 else head_mask
+            assert len(inputs) <= 8, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            mems = inputs.get('mems', mems)
+            perm_mask = inputs.get('perm_mask', perm_mask)
+            target_mapping = inputs.get('target_mapping', target_mapping)
+            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            input_mask = inputs.get('input_mask', input_mask)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 8, "Too many inputs."
+        else:
+            input_ids = inputs
+
+        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
+        # but we want a unified interface in the library with the batch size on the first dimension
+        # so we move here the first dimension (batch) to the end
+
+        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
+        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
+        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
+        perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
+        target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
+
+        qlen, bsz = shape_list(input_ids)[:2]
+        mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
+        klen = mlen + qlen
+
+        dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32
+
+        ##### Attention mask
+        # causal attention mask
+        if self.attn_type == 'uni':
+            attn_mask = self.create_mask(qlen, mlen)
+            attn_mask = attn_mask[:, :, None, None]
+        elif self.attn_type == 'bi':
+            attn_mask = None
+        else:
+            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+
+        # data mask: input mask & perm mask
+        assert input_mask is None or attention_mask is None, (
+            "You can only use one of input_mask (uses 1 for padding) "
+            "or attention_mask (uses 0 for padding, added for compatibility with BERT). "
+            "Please choose one.")
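+        # `input_mask` follows the original XLNet convention (1 = padding) while
+        # `attention_mask` follows the BERT convention (1 = real token), hence the
+        # inversion below before it is merged with `perm_mask`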
+ if input_mask is None and attention_mask is not None: + input_mask = 1.0 - attention_mask + if input_mask is not None and perm_mask is not None: + data_mask = input_mask[None] + perm_mask + elif input_mask is not None and perm_mask is None: + data_mask = input_mask[None] + elif input_mask is None and perm_mask is not None: + data_mask = perm_mask + else: + data_mask = None + + if data_mask is not None: + # all mems can be attended to + mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz], + dtype=dtype_float) + data_mask = tf.concat([mems_mask, data_mask], axis=1) + if attn_mask is None: + attn_mask = data_mask[:, :, :, None] + else: + attn_mask += data_mask[:, :, :, None] + + if attn_mask is not None: + attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float) + + if attn_mask is not None: + non_tgt_mask = -tf.eye(qlen, dtype=dtype_float) + non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1) + non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float) + else: + non_tgt_mask = None + + ##### Word embeddings and prepare h & g hidden states + word_emb_k = self.word_embedding(input_ids) + output_h = self.dropout(word_emb_k, training=training) + if target_mapping is not None: + word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + output_g = self.dropout(word_emb_q, training=training) + else: + output_g = None + + ##### Segment embedding + if token_type_ids is not None: + # Convert `token_type_ids` to one-hot `seg_mat` + mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) + cat_ids = tf.concat([mem_pad, token_type_ids], 0) + + # `1` indicates not in the same segment [qlen x klen x bsz] + seg_mat = tf.cast( + tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), + tf.int32) + seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float) + else: + seg_mat = None + + ##### Positional encoding + pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float) + pos_emb = self.dropout(pos_emb, training=training) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.n_layer + + new_mems = () + if mems is None: + mems = [None] * len(self.layer) + + attentions = [] + hidden_states = [] + for i, layer_module in enumerate(self.layer): + # cache new mems + new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) + if self.output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask, + pos_emb, seg_mat, mems[i], target_mapping, + head_mask[i]], training=training) + output_h, output_g = outputs[:2] + if 
self.output_attentions: + attentions.append(outputs[2]) + + # Add last hidden state + if self.output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + output = self.dropout(output_g if output_g is not None else output_h, training=training) + + # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) + outputs = (tf.transpose(output, perm=(1, 0, 2)), new_mems) + if self.output_hidden_states: + if output_g is not None: + hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs) + else: + hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states) + outputs = outputs + (hidden_states,) + if self.output_attentions: + attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) + outputs = outputs + (attentions,) + + return outputs # outputs, new_mems, (hidden_states), (attentions) + + +class TFXLNetPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = XLNetConfig + pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP + load_pt_weights = load_xlnet_pt_weights_in_tf2 + base_model_prefix = "transformer" + + +XLNET_START_DOCSTRING = r""" The XLNet model was proposed in + `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_ + by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. + XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method + to learn bidirectional contexts by maximizing the expected likelihood over all permutations + of the input sequence factorization order. + + The specific attention pattern can be controlled at training and test time using the `perm_mask` input. + + Do to the difficulty of training a fully auto-regressive model over various factorization order, + XLNet is pretrained using only a sub-set of the output tokens as target which are selected + with the `target_mapping` input. + + To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and + `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`) + + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`: + http://arxiv.org/abs/1906.08237 + + .. _`tf.keras.Model`: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model + + Note on the model inputs: + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. 
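As a concrete illustration of this "single positional argument" format (a hedged sketch using the example sentence from the docstrings below and the `TFXLNetModel` class defined in this file; the three accepted layouts are enumerated just after this block):

```python
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetModel.from_pretrained('xlnet-base-cased')

input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1
attention_mask = tf.ones_like(input_ids, dtype=tf.float32)

# Every input tensor packed into the first positional argument,
# the layout tf.keras.Model.fit() currently expects.
outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})
last_hidden_state = outputs[0]
```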
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : + + - a single Tensor with input_ids only and nothing else: `model(inputs_ids) + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associaed to the input names given in the docstring: + `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XLNET_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + XLNet is a model with relative position embeddings so you can either pad the inputs on + the right or on the left. + Indices can be obtained using :class:`transformers.XLNetTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **mems**: (`optional`) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model + (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context. + To activate mems you need to set up config.mem_len to a positive value which will be the max number of tokens in + the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-case, mem_len=1024)` will + instantiate a model which can use up to 1024 tokens of memory (in addition to the input it self). + **perm_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, sequence_length)``: + Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``: + If ``perm_mask[k, i, j] = 0``, i attend to j in batch k; + if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. + If None, each token attends to all the others (full bidirectional attention). + Only used during pretraining (to define factorization order) or for sequential decoding (generation). + **target_mapping**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_predict, sequence_length)``: + Mask to indicate the output tokens to use. + If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. + Only used during pretraining for partial prediction or for sequential decoding (generation). + **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). 
+ The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and + the important thing is that they should be different for tokens which belong to different segments. + The model will compute relative segment differences from the given type indices: + 0 if the segment id of two tokens are the same, 1 if not. + **input_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. + Kept for compatibility with the original code base. + You can only uses one of `input_mask` and `attention_mask` + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED. + **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class TFXLNetModel(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **mems**: + list of ``tf.Tensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLNetTokenizer, TFXLNetModel + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = TFXLNetModel.from_pretrained('xlnet-large-cased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLNetMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""XLNet Model with a language modeling head on top + (linear layer with weights tied to the input embeddings). 
""", + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mems**: + list of ``tf.Tensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLNetTokenizer, TFXLNetLMHeadModel + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') + + # We show how to setup inputs to predict a next token using a bi-directional context. + input_ids = tf.constant(tokenizer.encode("Hello, my dog is very "))[None, :] # We will predict the masked token + perm_mask = tf.zeros((1, input_ids.shape[1], input_ids.shape[1])) + perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = tf.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token + target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) + outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) + + next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + hidden_state = transformer_outputs[0] + logits = self.lm_loss(hidden_state) + + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # return logits, mems, (hidden states), (attentions) + + +@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. 
""", + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **mems**: + list of ``tf.Tensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLNetTokenizer, TFXLNetForSequenceClassification + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary') + self.logits_proj = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='logits_proj') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + output = transformer_outputs[0] + + output = self.sequence_summary(output) + logits = self.logits_proj(output) + + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # return logits, mems, (hidden states), (attentions) + + +# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +# the hidden-states output to compute `span start logits` and `span end logits`). """, +# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): +class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='qa_outputs') + + def call(self, inputs, **kwargs): + transformer_outputs = self.transformer(inputs, **kwargs) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # start_logits, end_logits, (hidden_states), (attentions) + +# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +# the hidden-states output to compute `span start logits` and `span end logits`). """, +# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): +# r""" +# Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: +# **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) +# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)`` +# Log probabilities for the top config.start_n_top start token possibilities (beam-search). +# **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) +# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` +# Indices for the top config.start_n_top start token possibilities (beam-search). +# **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) +# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` +# Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 
+# **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) +# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` +# Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). +# **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) +# ``tf.Tensor`` of shape ``(batch_size,)`` +# Log probabilities for the ``is_impossible`` label of the answers. +# **mems**: +# list of ``tf.Tensor`` (one for each layer): +# that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model +# if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. +# See details in the docstring of the `mems` input above. +# **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) +# list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) +# of shape ``(batch_size, sequence_length, hidden_size)``: +# Hidden-states of the model at the output of each layer plus the initial embedding outputs. +# **attentions**: (`optional`, returned when ``config.output_attentions=True``) +# list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: +# Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + +# Examples:: + +# tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') +# model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') +# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 +# start_positions = tf.constant([1]) +# end_positions = tf.constant([3]) +# outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) +# loss, start_scores, end_scores = outputs[:2] + +# """ +# def __init__(self, config, *inputs, **kwargs): +# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs) +# self.start_n_top = config.start_n_top +# self.end_n_top = config.end_n_top + +# self.transformer = TFXLNetMainLayer(config, name='transformer') +# self.start_logits = TFPoolerStartLogits(config, name='start_logits') +# self.end_logits = TFPoolerEndLogits(config, name='end_logits') +# self.answer_class = TFPoolerAnswerClass(config, name='answer_class') + +# def call(self, inputs, training=False): +# transformer_outputs = self.transformer(inputs, training=training) +# hidden_states = transformer_outputs[0] +# start_logits = self.start_logits(hidden_states, p_mask=p_mask) + +# outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + +# if start_positions is not None and end_positions is not None: +# # If we are on multi-GPU, let's remove the dimension added by batch splitting +# for x in (start_positions, end_positions, cls_index, is_impossible): +# if x is not None and x.dim() > 1: +# x.squeeze_(-1) + +# # during training, compute the end logits based on the ground truth of the start position +# end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + +# loss_fct = CrossEntropyLoss() +# start_loss = loss_fct(start_logits, start_positions) +# end_loss = loss_fct(end_logits, end_positions) +# total_loss = (start_loss + end_loss) / 2 + +# if cls_index is not None and is_impossible is not None: +# # Predict answerability from the representation of CLS 
and START +# cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) +# loss_fct_cls = nn.BCEWithLogitsLoss() +# cls_loss = loss_fct_cls(cls_logits, is_impossible) + +# # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss +# total_loss += cls_loss * 0.5 + +# outputs = (total_loss,) + outputs + +# else: +# # during inference, compute the end logits based on beam search +# bsz, slen, hsz = hidden_states.size() +# start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + +# start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) +# start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) +# start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) +# start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + +# hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) +# p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None +# end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) +# end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + +# end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) +# end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) +# end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + +# start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states +# cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample + +# outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + +# # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits +# # or (if labels are provided) (total_loss,) +# return outputs diff --git a/pytorch_transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py similarity index 70% rename from pytorch_transformers/modeling_transfo_xl.py rename to transformers/modeling_transfo_xl.py index 73b04eee605..6d430e18044 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -194,7 +194,7 @@ class PositionalEmbedding(nn.Module): class PositionwiseFF(nn.Module): - def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): super(PositionwiseFF, self).__init__() self.d_model = d_model @@ -208,7 +208,7 @@ class PositionwiseFF(nn.Module): nn.Dropout(dropout), ) - self.layer_norm = nn.LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.pre_lnorm = pre_lnorm @@ -229,102 +229,12 @@ class PositionwiseFF(nn.Module): return output - -class MultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, - pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False): - super(MultiHeadAttn, self).__init__() - - self.output_attentions = output_attentions - self.n_head = n_head - self.d_model = d_model - self.d_head = d_head - self.dropout = 
dropout - - self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) - self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) - - self.drop = nn.Dropout(dropout) - self.dropatt = nn.Dropout(dropatt) - self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - - self.layer_norm = nn.LayerNorm(d_model) - - self.scale = 1 / (d_head ** 0.5) - - self.pre_lnorm = pre_lnorm - - if r_r_bias is None or r_w_bias is None: # Biases are not shared - self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) - self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) - else: - self.r_r_bias = r_r_bias - self.r_w_bias = r_w_bias - - def forward(self, h, attn_mask=None, mems=None, head_mask=None): - ##### multihead attention - # [hlen x bsz x n_head x d_head] - - if mems is not None: - c = torch.cat([mems, h], 0) - else: - c = h - - if self.pre_lnorm: - ##### layer normalization - c = self.layer_norm(c) - - head_q = self.q_net(h) - head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) - - head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) - head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) - head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) - - # [qlen x klen x bsz x n_head] - attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) - attn_score.mul_(self.scale) - if attn_mask is not None and torch.sum(attn_mask).item(): - attn_mask = (attn_mask == 1) # Switch to bool - if attn_mask.dim() == 2: - attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) - elif attn_mask.dim() == 3: - attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) - - # [qlen x klen x bsz x n_head] - attn_prob = F.softmax(attn_score, dim=1) - attn_prob = self.dropatt(attn_prob) - - # Mask heads if we want to - if head_mask is not None: - attn_prob = attn_prob * head_mask - - # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] - attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) - attn_vec = attn_vec.contiguous().view( - attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) - - ##### linear projection - attn_out = self.o_net(attn_vec) - attn_out = self.drop(attn_out) - - if self.pre_lnorm: - ##### residual connection - outputs = [h + attn_out] - else: - ##### residual connection + layer normalization - outputs = [self.layer_norm(h + attn_out)] - - if self.output_attentions: - outputs.append(attn_prob) - - return outputs - -class RelMultiHeadAttn(nn.Module): +class RelPartialLearnableMultiHeadAttn(nn.Module): def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, - r_r_bias=None, r_w_bias=None, output_attentions=False): - super(RelMultiHeadAttn, self).__init__() + r_r_bias=None, r_w_bias=None, output_attentions=False, + layer_norm_epsilon=1e-5): + super(RelPartialLearnableMultiHeadAttn, self).__init__() self.output_attentions = output_attentions self.n_head = n_head @@ -338,7 +248,7 @@ class RelMultiHeadAttn(nn.Module): self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = nn.LayerNorm(d_model) + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) self.scale = 1 / (d_head ** 0.5) @@ -351,36 +261,9 @@ class RelMultiHeadAttn(nn.Module): self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias - def _parallelogram_mask(self, h, w, left=False): - mask = torch.ones((h, w)).byte() - m = min(h, w) - mask[:m,:m] = 
torch.triu(mask[:m,:m]) - mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) - if left: - return mask - else: - return mask.flip(0) - - def _shift(self, x, qlen, klen, mask, left=False): - if qlen > 1: - zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), - device=x.device, dtype=x.dtype) - else: - zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) - - if left: - mask = mask.flip(1) - x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) - else: - x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) - - x = x_padded.masked_select(mask[:,:,None,None]) \ - .view(qlen, klen, x.size(2), x.size(3)) - - return x - - def _rel_shift(self, x, zero_triu=False): + def _rel_shift(self, x): zero_pad_shape = (x.size(0), 1) + x.size()[2:] zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) x_padded = torch.cat([zero_pad, x], dim=1) @@ -390,21 +273,8 @@ class RelMultiHeadAttn(nn.Module): x = x_padded[1:].view_as(x) - if zero_triu: - ones = torch.ones((x.size(0), x.size(1))) - x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] - return x - def forward(self, w, r, attn_mask=None, mems=None): - raise NotImplementedError - -class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): - def __init__(self, *args, **kwargs): - super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) - - self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) - def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) @@ -496,148 +366,17 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): return outputs -class RelLearnableMultiHeadAttn(RelMultiHeadAttn): - def __init__(self, *args, **kwargs): - super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) - - def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_mask=None): - # r_emb: [klen, n_head, d_head], used for term B - # r_w_bias: [n_head, d_head], used for term C - # r_bias: [klen, n_head], used for term D - - qlen, bsz = w.size(0), w.size(1) - - if mems is not None: - cat = torch.cat([mems, w], 0) - if self.pre_lnorm: - w_heads = self.qkv_net(self.layer_norm(cat)) - else: - w_heads = self.qkv_net(cat) - w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) - - w_head_q = w_head_q[-qlen:] - else: - if self.pre_lnorm: - w_heads = self.qkv_net(self.layer_norm(w)) - else: - w_heads = self.qkv_net(w) - w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) - - klen = w_head_k.size(0) - - w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) - w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) - w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) - - if klen > r_emb.size(0): - r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) - r_emb = torch.cat([r_emb_pad, r_emb], 0) - r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) - r_bias = torch.cat([r_bias_pad, r_bias], 0) - else: - r_emb = r_emb[-klen:] - r_bias = r_bias[-klen:] - - #### compute attention score - rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head - - AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head - B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head - D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head - BD = self._rel_shift(B_ + D_) - - # [qlen x klen x bsz x n_head] - attn_score = AC + BD - 
attn_score.mul_(self.scale) - - #### compute attention probability - if attn_mask is not None and torch.sum(attn_mask).item(): - attn_mask = (attn_mask == 1) # Switch to bool - if attn_mask.dim() == 2: - attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) - elif attn_mask.dim() == 3: - attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) - - # [qlen x klen x bsz x n_head] - attn_prob = F.softmax(attn_score, dim=1) - attn_prob = self.dropatt(attn_prob) - - if head_mask is not None: - attn_prob = attn_prob * head_mask - - #### compute attention vector - attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) - - # [qlen x bsz x n_head x d_head] - attn_vec = attn_vec.contiguous().view( - attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) - - ##### linear projection - attn_out = self.o_net(attn_vec) - attn_out = self.drop(attn_out) - - if self.pre_lnorm: - ##### residual connection - outputs = [w + attn_out] - else: - ##### residual connection + layer normalization - outputs = [self.layer_norm(w + attn_out)] - - if self.output_attentions: - outputs.append(attn_prob) - - return outputs - - - -class DecoderLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): - super(DecoderLayer, self).__init__() - - self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=kwargs.get('pre_lnorm')) - - def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None): - - attn_outputs = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, - mems=mems, head_mask=head_mask) - ff_output = self.pos_ff(attn_outputs[0]) - - outputs = [ff_output] + attn_outputs[1:] - - return outputs - -class RelLearnableDecoderLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, - **kwargs): - super(RelLearnableDecoderLayer, self).__init__() - - self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, - **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=kwargs.get('pre_lnorm')) - - def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None): - - attn_outputs = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, - attn_mask=dec_attn_mask, - mems=mems, head_mask=head_mask) - ff_output = self.pos_ff(attn_outputs[0]) - - outputs = [ff_output] + attn_outputs[1:] - - return outputs class RelPartialLearnableDecoderLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, + def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): super(RelPartialLearnableDecoderLayer, self).__init__() self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, - d_head, dropout, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=kwargs.get('pre_lnorm')) + d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm'), + layer_norm_epsilon=layer_norm_epsilon) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): @@ -651,7 +390,6 @@ class RelPartialLearnableDecoderLayer(nn.Module): return outputs - class AdaptiveEmbedding(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): @@ -775,9 +513,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel): if hasattr(m, 'r_bias'): self._init_bias(m.r_bias) - def 
set_num_special_tokens(self, num_special_tokens): - pass - TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_ @@ -796,9 +531,9 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" @@ -807,9 +542,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" Indices of input sequence tokens in the vocabulary. Transformer-XL is a model with relative position embeddings so you can either pad the inputs on the right or on the left. - Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **mems**: (`optional`) list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model @@ -888,45 +623,19 @@ class TransfoXLModel(TransfoXLPreTrainedModel): dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, r_w_bias=None if config.untie_r else self.r_w_bias, r_r_bias=None if config.untie_r else self.r_r_bias, - output_attentions=self.output_attentions) - ) - elif config.attn_type == 1: # learnable embeddings - for i in range(config.n_layer): - self.layers.append( - RelLearnableDecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, - r_w_bias=None if config.untie_r else self.r_w_bias, - r_r_bias=None if config.untie_r else self.r_r_bias, - output_attentions=self.output_attentions) - ) - elif config.attn_type in [2, 3]: # absolute embeddings - for i in range(config.n_layer): - self.layers.append( - DecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, - r_w_bias=None if config.untie_r else self.r_w_bias, - r_r_bias=None if config.untie_r else self.r_r_bias, - output_attentions=self.output_attentions) + output_attentions=self.output_attentions, + layer_norm_epsilon=config.layer_norm_epsilon) ) + else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints + raise NotImplementedError # Removed them to avoid maintaining dead code self.same_length = config.same_length self.clamp_len = config.clamp_len if self.attn_type == 0: # default attention self.pos_emb = PositionalEmbedding(self.d_model) - elif self.attn_type == 1: # learnable - self.r_emb = nn.Parameter(torch.FloatTensor( - self.n_layer, 
self.max_klen, self.n_head, self.d_head)) - self.r_bias = nn.Parameter(torch.FloatTensor( - self.n_layer, self.max_klen, self.n_head)) - elif self.attn_type == 2: # absolute standard - self.pos_emb = PositionalEmbedding(self.d_model) - elif self.attn_type == 3: # absolute deeper SA - self.r_emb = nn.Parameter(torch.FloatTensor( - self.n_layer, self.max_klen, self.n_head, self.d_head)) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.init_weights() @@ -981,8 +690,15 @@ class TransfoXLModel(TransfoXLPreTrainedModel): return new_mems - def _forward(self, dec_inp, mems=None, head_mask=None): - qlen, bsz = dec_inp.size() + def forward(self, input_ids, mems=None, head_mask=None): + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = input_ids.transpose(0, 1).contiguous() + + if mems is None: + mems = self.init_mems(input_ids) + + qlen, bsz = input_ids.size() # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -999,7 +715,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): else: head_mask = [None] * self.n_layer - word_emb = self.word_emb(dec_inp) + word_emb = self.word_emb(input_ids) mlen = mems[0].size(0) if mems is not None else 0 klen = mlen + qlen @@ -1036,64 +752,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel): core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) - elif self.attn_type == 1: # learnable - core_out = self.drop(word_emb) - for i, layer in enumerate(self.layers): - hids.append(core_out) - if self.clamp_len > 0: - r_emb = self.r_emb[i][-self.clamp_len :] - r_bias = self.r_bias[i][-self.clamp_len :] - else: - r_emb, r_bias = self.r_emb[i], self.r_bias[i] - - mems_i = None if mems is None else mems[i] - layer_outputs = layer(core_out, r_emb, self.r_w_bias[i], - r_bias, dec_attn_mask=dec_attn_mask, - mems=mems_i, head_mask=head_mask[i]) - core_out = layer_outputs[0] - if self.output_attentions: - attentions.append(layer_outputs[1]) - elif self.attn_type == 2: # absolute - pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, - dtype=word_emb.dtype) - if self.clamp_len > 0: - pos_seq.clamp_(max=self.clamp_len) - pos_emb = self.pos_emb(pos_seq) - - core_out = self.drop(word_emb + pos_emb[-qlen:]) - - for i, layer in enumerate(self.layers): - hids.append(core_out) - mems_i = None if mems is None else mems[i] - if mems_i is not None and i == 0: - mems_i += pos_emb[:mlen] - layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask, - mems=mems_i, head_mask=head_mask[i]) - core_out = layer_outputs[0] - if self.output_attentions: - attentions.append(layer_outputs[1]) - elif self.attn_type == 3: - core_out = self.drop(word_emb) - - for i, layer in enumerate(self.layers): - hids.append(core_out) - mems_i = None if mems is None else mems[i] - if mems_i is not None and mlen > 0: - cur_emb = self.r_emb[i][:-qlen] - cur_size = cur_emb.size(0) - if cur_size < mlen: - cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) - cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) - else: - cur_emb = cur_emb[-mlen:] - mems_i += cur_emb.view(mlen, 1, -1) - core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) - - layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask, - mems=mems_i, head_mask=head_mask[i]) - core_out = layer_outputs[0] - if 
self.output_attentions: - attentions.append(layer_outputs[1]) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out) @@ -1110,16 +770,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel): # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs.append(attentions) - return outputs # last hidden state, new_mems, (all hidden states), (all attentions) - - def forward(self, input_ids, mems=None, head_mask=None): - # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library - # so we transpose here from shape [bsz, len] to shape [len, bsz] - input_ids = input_ids.transpose(0, 1).contiguous() - - if mems is None: - mems = self.init_mems(input_ids) - outputs = self._forward(input_ids, mems=mems, head_mask=head_mask) return outputs # last hidden state, new_mems, (all hidden states), (all attentions) diff --git a/pytorch_transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py similarity index 100% rename from pytorch_transformers/modeling_transfo_xl_utilities.py rename to transformers/modeling_transfo_xl_utilities.py diff --git a/pytorch_transformers/modeling_utils.py b/transformers/modeling_utils.py similarity index 81% rename from pytorch_transformers/modeling_utils.py rename to transformers/modeling_utils.py index 25aeefe10f1..ae2df9514af 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME logger = logging.getLogger(__name__) @@ -52,16 +52,16 @@ except ImportError: class PreTrainedModel(nn.Module): r""" Base class for all models. - :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. Class attributes (overridden by derived classes): - - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture. + - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. 
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`, - - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`, + - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, - ``path``: a path (string) to the TensorFlow checkpoint. - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. @@ -189,7 +189,7 @@ class PreTrainedModel(nn.Module): def save_pretrained(self, save_directory): """ Save a model and its configuration file to a directory, so that it - can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method. + can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. """ assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" @@ -201,8 +201,8 @@ class PreTrainedModel(nn.Module): # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, WEIGHTS_NAME) - torch.save(model_to_save.state_dict(), output_model_file) + logger.info("Model weights saved in {}".format(output_model_file)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -220,23 +220,24 @@ class PreTrainedModel(nn.Module): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 
+ - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -256,7 +257,7 @@ class PreTrainedModel(nn.Module): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 
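To make the loading paths described above concrete, here is a short hedged sketch (assuming the `BertModel`/`BertConfig` classes and the `bert-base-uncased` shortcut mentioned earlier; any architecture in the library follows the same pattern):

```python
import os
from transformers import BertConfig, BertModel

# 1) Shortcut name, overriding a configuration attribute through kwargs.
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

# 2) Round-trip through a local directory via save_pretrained()
#    (the target directory must already exist).
os.makedirs('./my_model_directory/', exist_ok=True)
model.save_pretrained('./my_model_directory/')
model = BertModel.from_pretrained('./my_model_directory/')

# 3) No name or path at all: provide both `config` and `state_dict` explicitly.
config = BertConfig.from_pretrained('bert-base-uncased')
state_dict = model.state_dict()
model = BertModel.from_pretrained(None, config=config, state_dict=state_dict)
```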
Examples:: @@ -289,103 +290,125 @@ class PreTrainedModel(nn.Module): model_kwargs = kwargs # Load model - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - if from_tf: - # Directly load from a TensorFlow checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") - else: - archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) - else: - if from_tf: - # Directly load from a TensorFlow checkpoint - archive_file = pretrained_model_name_or_path + ".index" - else: - archive_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) - except EnvironmentError as e: + if pretrained_model_name_or_path is not None: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained weights.".format( - archive_file)) + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): + # Load from a TF 1.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") + elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], + pretrained_model_name_or_path)) + elif os.path.isfile(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path else: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), - archive_file)) - raise e - if resolved_archive_file == archive_file: - logger.info("loading weights file {}".format(archive_file)) + assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path) + archive_file = pretrained_model_name_or_path + ".index" + + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_model_archive_map.keys()), + archive_file)) + raise e + if resolved_archive_file == archive_file: + logger.info("loading weights file {}".format(archive_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) + resolved_archive_file = None # Instantiate model. model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: state_dict = torch.load(resolved_archive_file, map_location='cpu') - if from_tf: - # Directly load from a TensorFlow checkpoint - return cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' - # Convert old format to new format if needed from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - # Load from a PyTorch state_dict missing_keys = [] unexpected_keys = [] error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - def load(module, prefix=''): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') + if from_tf: + if resolved_archive_file.endswith('.index'): + # Load from a TensorFlow 1.X checkpoint - provided by original authors + model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' + else: + # Load from our TensorFlow 2.0 checkpoints + try: + from transformers import load_tf2_checkpoint_in_pytorch_model + model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) + except ImportError as e: + logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + raise e + else: + # Convert old format to new format if needed from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = '' - model_to_load = model - if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): - start_prefix = cls.base_model_prefix + '.' 
- if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): - model_to_load = getattr(model, cls.base_model_prefix) + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata - load(model_to_load, prefix=start_prefix) - if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + # Make sure we are able to load base models as well as derived models (with heads) + start_prefix = '' + model_to_load = model + if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + start_prefix = cls.base_model_prefix + '.' + if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + model_to_load = getattr(model, cls.base_model_prefix) + + load(model_to_load, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) if hasattr(model, 'tie_weights'): model.tie_weights() # make sure word embedding weights are still tied @@ -531,7 +554,7 @@ class SQuADHead(nn.Module): r""" A SQuAD head inspired by XLNet. Parameters: - config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. Inputs: **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` @@ -682,7 +705,7 @@ class SequenceSummary(nn.Module): self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward(self, hidden_states, cls_index=None): - """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. + """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer. cls_index: [optional] position of the classification token if summary_type == 'cls_index', shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. 
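            # Hedged sketch (added for illustration, not part of the patch): for the simple (bsz,)
            # case, ``summary_type == 'cls_index'`` amounts to gathering one position per example
            # along the sequence dimension of ``hidden_states``:
            #
            #     idx = cls_index[:, None, None].expand(-1, 1, hidden_states.size(-1))  # (bsz, 1, hidden_size)
            #     summary = hidden_states.gather(-2, idx).squeeze(-2)                   # (bsz, hidden_size)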
if summary_type == 'cls_index' and cls_index is None: diff --git a/pytorch_transformers/modeling_xlm.py b/transformers/modeling_xlm.py similarity index 85% rename from pytorch_transformers/modeling_xlm.py rename to transformers/modeling_xlm.py index 67866a30ddc..b29e7215564 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -63,7 +63,7 @@ def gelu(x): GELU activation https://arxiv.org/abs/1606.08415 https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14 - https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py + https://github.com/huggingface/transformers/blob/master/modeling.py """ # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) @@ -265,9 +265,9 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in https://github.com/facebookresearch/XLM Parameters: - config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" @@ -278,9 +278,9 @@ XLM_INPUTS_DOCSTRING = r""" XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. - Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.XLMTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. 
Mask values selected in ``[0, 1]``: @@ -337,11 +337,6 @@ class XLMModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output', - 'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads', - 'hidden_dim', 'dropout', 'attention_dropout', 'asm', - 'asm_cutoffs', 'asm_div_value'] - def __init__(self, config): #, dico, is_encoder, with_output): super(XLMModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -568,10 +563,10 @@ class XLMPredLayer(nn.Module): """ outputs = () if self.asm is False: - scores = self.proj(x).view(-1, self.n_words) + scores = self.proj(x) outputs = (scores,) + outputs if y is not None: - loss = F.cross_entropy(scores, y, reduction='elementwise_mean') + loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean') outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) @@ -723,6 +718,101 @@ class XLMForSequenceClassification(XLMPreTrainedModel): @add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels whether a question has an answer or no answer (SQuAD 2.0) + **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the classification token to use as input for computing plausibility of the answer. + **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config): + super(XLMForQuestionAnsweringSimple, self).__init__(config) + + self.transformer = XLMModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs + + +@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). 
""", + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) class XLMForQuestionAnswering(XLMPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: diff --git a/pytorch_transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py similarity index 90% rename from pytorch_transformers/modeling_xlnet.py rename to transformers/modeling_xlnet.py index a4a300e0706..d6bb2ebd38c 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -488,9 +488,9 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ XLNET_INPUTS_DOCSTRING = r""" @@ -499,9 +499,9 @@ XLNET_INPUTS_DOCSTRING = r""" Indices of input sequence tokens in the vocabulary. XLNet is a model with relative position embeddings so you can either pad the inputs on the right or on the left. - Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + Indices can be obtained using :class:`transformers.XLNetTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and @@ -531,8 +531,10 @@ XLNET_INPUTS_DOCSTRING = r""" Only used during pretraining for partial prediction or for sequential decoding (generation). **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and + the important thing is that they should be different for tokens which belong to different segments. + The model will compute relative segment differences from the given type indices: + 0 if the segment id of two tokens are the same, 1 if not. **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. 
@@ -1103,6 +1105,101 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): return outputs # return (loss), logits, mems, (hidden states), (attentions) +@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config): + super(XLNetForQuestionAnsweringSimple, self).__init__(config) + self.num_labels = config.num_labels + + self.transformer = XLNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.transformer(input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + + @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) diff --git a/pytorch_transformers/optimization.py b/transformers/optimization.py similarity index 100% rename from pytorch_transformers/optimization.py rename to transformers/optimization.py diff --git a/pytorch_transformers/tests/__init__.py b/transformers/tests/__init__.py similarity index 100% rename from pytorch_transformers/tests/__init__.py rename to transformers/tests/__init__.py diff --git a/pytorch_transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py similarity index 100% rename from pytorch_transformers/tests/configuration_common_test.py rename to transformers/tests/configuration_common_test.py diff --git a/pytorch_transformers/tests/conftest.py b/transformers/tests/conftest.py similarity index 100% rename from pytorch_transformers/tests/conftest.py rename to transformers/tests/conftest.py diff --git a/pytorch_transformers/tests/fixtures/input.txt b/transformers/tests/fixtures/input.txt similarity index 100% rename from pytorch_transformers/tests/fixtures/input.txt rename to transformers/tests/fixtures/input.txt diff --git a/pytorch_transformers/tests/fixtures/sample_text.txt b/transformers/tests/fixtures/sample_text.txt similarity index 100% rename from pytorch_transformers/tests/fixtures/sample_text.txt rename to transformers/tests/fixtures/sample_text.txt diff --git a/pytorch_transformers/tests/fixtures/test_sentencepiece.model b/transformers/tests/fixtures/test_sentencepiece.model similarity index 100% rename from pytorch_transformers/tests/fixtures/test_sentencepiece.model rename to transformers/tests/fixtures/test_sentencepiece.model diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py similarity index 82% rename from pytorch_transformers/tests/modeling_auto_test.py rename to transformers/tests/modeling_auto_test.py index dfdedbbe612..af1de29cce8 100644 --- a/pytorch_transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -21,15 +21,20 @@ import shutil import pytest import logging -from pytorch_transformers import (AutoConfig, BertConfig, - AutoModel, BertModel, - AutoModelWithLMHead, BertForMaskedLM, - AutoModelForSequenceClassification, BertForSequenceClassification, - AutoModelForQuestionAnswering, BertForQuestionAnswering) -from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor) -from .configuration_common_test import ConfigTester +if is_torch_available(): + from transformers import (AutoConfig, BertConfig, + AutoModel, BertModel, + AutoModelWithLMHead, BertForMaskedLM, + AutoModelForSequenceClassification, BertForSequenceClassification, + AutoModelForQuestionAnswering, BertForQuestionAnswering) + from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP + + from .modeling_common_test import (CommonTestCases, ids_tensor) + from .configuration_common_test import ConfigTester +else: + pytestmark = pytest.mark.skip("Require Torch") class AutoModelTest(unittest.TestCase): diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py similarity index 95% rename from pytorch_transformers/tests/modeling_bert_test.py rename to transformers/tests/modeling_bert_test.py index 2919cc03368..633c97e263d 100644 --- a/pytorch_transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ 
-20,21 +20,26 @@ import unittest import shutil import pytest -from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM, - BertForNextSentencePrediction, BertForPreTraining, - BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification, BertForMultipleChoice) -from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from transformers import is_torch_available from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester +if is_torch_available(): + from transformers import (BertConfig, BertModel, BertForMaskedLM, + BertForNextSentencePrediction, BertForPreTraining, + BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification, BertForMultipleChoice) + from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") + class BertModelTest(CommonTestCases.CommonModelTester): all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification) + BertForTokenClassification) if is_torch_available() else () class BertModelTester(object): @@ -305,7 +310,7 @@ class BertModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = BertModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py similarity index 96% rename from pytorch_transformers/tests/modeling_common_test.py rename to transformers/tests/modeling_common_test.py index c6194fefcce..2b66757c285 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -25,12 +25,18 @@ import uuid import unittest import logging +import pytest -import torch +from transformers import is_torch_available -from pytorch_transformers import (PretrainedConfig, PreTrainedModel, - BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +if is_torch_available(): + import torch + + from transformers import (PretrainedConfig, PreTrainedModel, + BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require Torch") def _config_zero_init(config): @@ -62,6 +68,16 @@ class CommonTestCases: self.assertIn(param.data.mean().item(), [0.0, 1.0], msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.eval() + first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0] + self.assertEqual(first.ne(second).sum().item(), 0) + + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -195,6 +211,9 @@ class CommonTestCases: hidden_states = outputs[-2] # Remove Nan + for t in attentions: + self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4) # Check we don't have more than 25% nans 
(arbitrary) + attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions] # remove them (the test is less complete) self.assertIsNotNone(multihead_outputs) self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) @@ -602,7 +621,7 @@ class CommonTestCases: [[], []]) def create_and_check_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py similarity index 95% rename from pytorch_transformers/tests/modeling_distilbert_test.py rename to transformers/tests/modeling_distilbert_test.py index 0d9f2311777..937d03396d5 100644 --- a/pytorch_transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -17,9 +17,15 @@ from __future__ import division from __future__ import print_function import unittest +import pytest -from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, - DistilBertForQuestionAnswering, DistilBertForSequenceClassification) +from transformers import is_torch_available + +if is_torch_available(): + from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, + DistilBertForQuestionAnswering, DistilBertForSequenceClassification) +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester @@ -28,7 +34,7 @@ from .configuration_common_test import ConfigTester class DistilBertModelTest(CommonTestCases.CommonModelTester): all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, - DistilBertForSequenceClassification) + DistilBertForSequenceClassification) if is_torch_available() else None test_pruning = True test_torchscript = True test_resize_embeddings = True @@ -205,7 +211,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): # @pytest.mark.slow # def test_model_from_pretrained(self): - # cache_dir = "/tmp/pytorch_transformers_test/" + # cache_dir = "/tmp/transformers_test/" # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) # shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py similarity index 76% rename from pytorch_transformers/tests/modeling_gpt2_test.py rename to transformers/tests/modeling_gpt2_test.py index 2717805120e..4263e51bc97 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -20,9 +20,13 @@ import unittest import pytest import shutil +from transformers import is_torch_available -from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2DoubleHeadsModel) +if is_torch_available(): + from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2DoubleHeadsModel) +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester @@ -30,7 +34,7 @@ from .configuration_common_test import 
ConfigTester class GPT2ModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) + all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () class GPT2ModelTester(object): @@ -40,7 +44,9 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): seq_length=7, is_training=True, use_token_type_ids=True, + use_input_mask=True, use_labels=True, + use_mc_token_ids=True, vocab_size=99, hidden_size=32, num_hidden_layers=5, @@ -62,7 +68,9 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): self.seq_length = seq_length self.is_training = is_training self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -82,10 +90,18 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + token_type_ids = None if self.use_token_type_ids: token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + sequence_labels = None token_labels = None choice_labels = None @@ -111,14 +127,14 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): self.parent.assertListEqual( list(result["loss"].size()), []) - def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args): + def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = GPT2Model(config=config) model.eval() @@ -135,7 +151,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertEqual(len(result["presents"]), config.n_layer) - def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = GPT2LMHeadModel(config) model.eval() @@ -153,15 +169,27 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]) - def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): model = GPT2DoubleHeadsModel(config) model.eval() - loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = 
token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + + inputs = {'input_ids': multiple_choice_inputs_ids, + 'mc_token_ids': mc_token_ids, + 'attention_mask': multiple_choice_input_mask, + 'token_type_ids': multiple_choice_token_type_ids, + 'lm_labels': multiple_choice_inputs_ids} + + loss, lm_logits, mc_logits, _ = model(**inputs) result = { "loss": loss, - "lm_logits": lm_logits + "lm_logits": lm_logits, + "mc_logits": mc_logits } self.parent.assertListEqual( @@ -169,11 +197,17 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): []) self.parent.assertListEqual( list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(result["mc_logits"].size()), + [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + + (config, input_ids, input_mask, head_mask, token_type_ids, + mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { 'input_ids': input_ids, 'token_type_ids': token_type_ids, @@ -203,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py similarity index 95% rename from pytorch_transformers/tests/modeling_openai_test.py rename to transformers/tests/modeling_openai_test.py index dbef6c52eb8..33218288a02 100644 --- a/pytorch_transformers/tests/modeling_openai_test.py +++ b/transformers/tests/modeling_openai_test.py @@ -20,9 +20,13 @@ import unittest import pytest import shutil +from transformers import is_torch_available -from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) +if is_torch_available(): + from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester @@ -30,7 +34,7 @@ from .configuration_common_test import ConfigTester class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () class OpenAIGPTModelTester(object): @@ -201,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_roberta_test.py 
b/transformers/tests/modeling_roberta_test.py similarity index 95% rename from pytorch_transformers/tests/modeling_roberta_test.py rename to transformers/tests/modeling_roberta_test.py index 69981af2227..82e10da915d 100644 --- a/pytorch_transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -19,10 +19,15 @@ from __future__ import print_function import unittest import shutil import pytest -import torch -from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) -from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from transformers import is_torch_available + +if is_torch_available(): + import torch + from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) + from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester @@ -30,7 +35,7 @@ from .configuration_common_test import ConfigTester class RobertaModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (RobertaForMaskedLM, RobertaModel) + all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else () class RobertaModelTester(object): @@ -175,7 +180,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py new file mode 100644 index 00000000000..2cda3abc1cd --- /dev/null +++ b/transformers/tests/modeling_tf_auto_test.py @@ -0,0 +1,94 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import logging + +from transformers import is_tf_available + +if is_tf_available(): + from transformers import (AutoConfig, BertConfig, + TFAutoModel, TFBertModel, + TFAutoModelWithLMHead, TFBertForMaskedLM, + TFAutoModelForSequenceClassification, TFBertForSequenceClassification, + TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering) + from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP + + from .modeling_common_test import (CommonTestCases, ids_tensor) + from .configuration_common_test import ConfigTester +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFAutoModelTest(unittest.TestCase): + def test_model_from_pretrained(self): + import h5py + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) + + logging.basicConfig(level=logging.INFO) + # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ['bert-base-uncased']: + config = AutoConfig.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModel.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertModel) + + def test_lmhead_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ['bert-base-uncased']: + config = AutoConfig.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + def test_sequence_classification_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ['bert-base-uncased']: + config = AutoConfig.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForSequenceClassification) + + def test_question_answering_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ['bert-base-uncased']: + config = AutoConfig.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForQuestionAnswering) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py new file mode 100644 index 00000000000..a1715d25684 --- /dev/null +++ b/transformers/tests/modeling_tf_bert_test.py @@ -0,0 +1,327 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import sys + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import BertConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForSequenceClassification, + TFBertForMultipleChoice, + TFBertForTokenClassification, + TFBertForQuestionAnswering, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction, + TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, + TFBertForTokenClassification) if is_tf_available() else () + + class TFBertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + 
choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFBertModel(config=config) + # inputs = {'input_ids': input_ids, + # 'attention_mask': input_mask, + # 'token_type_ids': token_type_ids} + # sequence_output, pooled_output = model(**inputs) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + sequence_output, pooled_output = model(inputs) + + inputs = [input_ids, input_mask] + sequence_output, pooled_output = model(inputs) + + sequence_output, pooled_output = model(input_ids) + + result = { + "sequence_output": sequence_output.numpy(), + "pooled_output": pooled_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + + + def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFBertForMaskedLM(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + prediction_scores, = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFBertForNextSentencePrediction(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + seq_relationship_score, = model(inputs) + result = { + "seq_relationship_score": seq_relationship_score.numpy(), + } + self.parent.assertListEqual( + list(result["seq_relationship_score"].shape), + [self.batch_size, 2]) + + + def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFBertForPreTraining(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + prediction_scores, seq_relationship_score = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + "seq_relationship_score": seq_relationship_score.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(result["seq_relationship_score"].shape), + [self.batch_size, 2]) + + + def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, 
input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = TFBertForSequenceClassification(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + logits, = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.num_labels]) + + + def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_choices = self.num_choices + model = TFBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = {'input_ids': multiple_choice_inputs_ids, + 'attention_mask': multiple_choice_input_mask, + 'token_type_ids': multiple_choice_token_type_ids} + logits, = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.num_choices]) + + + def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = TFBertForTokenClassification(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + logits, = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.seq_length, self.num_labels]) + + + def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFBertForQuestionAnswering(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + start_logits, end_logits = model(inputs) + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + } + self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFBertModelTest.TFBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ['bert-base-uncased']: + model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() + diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py new file mode 100644 index 00000000000..483f031b16e --- /dev/null +++ b/transformers/tests/modeling_tf_common_test.py @@ -0,0 +1,355 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
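The file that follows, modeling_tf_common_test.py, is the shared harness for every TF test module in this diff: each model file nests a small *ModelTester that fabricates a config plus dummy inputs, and subclasses TFCommonTestCases.TFCommonModelTester so the generic checks run against it. A minimal sketch of that contract; TFMyModel and MyConfig are hypothetical placeholders, not classes added by this diff:

    from transformers import is_tf_available
    from .modeling_tf_common_test import TFCommonTestCases, ids_tensor
    from .configuration_common_test import ConfigTester

    # TFMyModel / MyConfig are hypothetical stand-ins for whatever model is being added.
    class TFMyModelTest(TFCommonTestCases.TFCommonModelTester):

        all_model_classes = (TFMyModel,) if is_tf_available() else ()

        class TFMyModelTester(object):
            def __init__(self, parent, batch_size=13, seq_length=7, vocab_size=99,
                         hidden_size=32, num_hidden_layers=5, num_attention_heads=4):
                # The common tester reads these attributes when it checks output shapes.
                self.parent = parent
                self.batch_size = batch_size
                self.seq_length = seq_length
                self.vocab_size = vocab_size
                self.hidden_size = hidden_size
                self.num_hidden_layers = num_hidden_layers
                self.num_attention_heads = num_attention_heads

            def prepare_config_and_inputs_for_common(self):
                # All the generic tests need is a config and a dict of dummy inputs.
                config = MyConfig(vocab_size_or_config_json_file=self.vocab_size,
                                  hidden_size=self.hidden_size)
                inputs_dict = {'input_ids': ids_tensor([self.batch_size, self.seq_length],
                                                       self.vocab_size)}
                return config, inputs_dict

        def setUp(self):
            self.model_tester = TFMyModelTest.TFMyModelTester(self)
            self.config_tester = ConfigTester(self, config_class=MyConfig, hidden_size=37)

        def test_config(self):
            self.config_tester.run_common_tests()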
+from __future__ import absolute_import, division, print_function + +import copy +import json +import logging +import importlib +import random +import shutil +import unittest +import uuid + +import pytest +import sys + +from transformers import is_tf_available, is_torch_available + +if is_tf_available(): + import tensorflow as tf + import numpy as np + from transformers import TFPreTrainedModel + # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if '_range' in key or '_std' in key: + setattr(configs_no_init, key, 0.0) + return configs_no_init + +class TFCommonTestCases: + + class TFCommonModelTester(unittest.TestCase): + + model_tester = None + all_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True + + def test_initialization(self): + pass + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # configs_no_init = _config_zero_init(config) + # for model_class in self.all_model_classes: + # model = model_class(config=configs_no_init) + # for name, param in model.named_parameters(): + # if param.requires_grad: + # self.assertIn(param.data.mean().item(), [0.0, 1.0], + # msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + + + def test_pt_tf_model_equivalence(self): + if not is_torch_available(): + return + + import transformers + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + tf_model = model_class(config) + pt_model = pt_model_class(config) + + tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict) + pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) + + + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + outputs_dict = model(inputs_dict) + + inputs_keywords = copy.deepcopy(inputs_dict) + input_ids = inputs_keywords.pop('input_ids') + outputs_keywords = model(input_ids, **inputs_keywords) + + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + outputs = model(inputs_dict) + attentions = [t.numpy() for t in outputs[-1]] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + out_len = len(outputs) + + # Check attention is always last and order is fine + 
config.output_attentions = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(inputs_dict) + self.assertEqual(out_len+1, len(outputs)) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) + + attentions = [t.numpy() for t in outputs[-1]] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + + def test_headmasking(self): + pass + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # config.output_attentions = True + # config.output_hidden_states = True + # configs_no_init = _config_zero_init(config) # To be sure we have no Nan + # for model_class in self.all_model_classes: + # model = model_class(config=configs_no_init) + # model.eval() + + # # Prepare head_mask + # # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + # head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads) + # head_mask[0, 0] = 0 + # head_mask[-1, :-1] = 0 + # head_mask.requires_grad_(requires_grad=True) + # inputs = inputs_dict.copy() + # inputs['head_mask'] = head_mask + + # outputs = model(**inputs) + + # # Test that we can get a gradient back for importance score computation + # output = sum(t.sum() for t in outputs[0]) + # output = output.sum() + # output.backward() + # multihead_outputs = head_mask.grad + + # attentions = outputs[-1] + # hidden_states = outputs[-2] + + # # Remove Nan + + # self.assertIsNotNone(multihead_outputs) + # self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + # self.assertAlmostEqual( + # attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + # self.assertNotEqual( + # attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + # self.assertNotEqual( + # attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + # self.assertAlmostEqual( + # attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + # self.assertNotEqual( + # attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + + + def test_head_pruning(self): + pass + # if not self.test_pruning: + # return + + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # for model_class in self.all_model_classes: + # config.output_attentions = True + # config.output_hidden_states = False + # model = model_class(config=config) + # model.eval() + # heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + # -1: [0]} + # model.prune_heads(heads_to_prune) + # outputs = model(**inputs_dict) + + # attentions = outputs[-1] + + # self.assertEqual( + # attentions[0].shape[-3], 1) + # self.assertEqual( + # attentions[1].shape[-3], self.model_tester.num_attention_heads) + # self.assertEqual( + # attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_hidden_states = True + config.output_attentions = False + model = model_class(config) + outputs = model(inputs_dict) + hidden_states = [t.numpy() for t in outputs[-1]] + self.assertEqual(model.config.output_attentions, False) 
+ self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size]) + + + def test_resize_tokens_embeddings(self): + pass + # original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # if not self.test_resize_embeddings: + # return + + # for model_class in self.all_model_classes: + # config = copy.deepcopy(original_config) + # model = model_class(config) + + # model_vocab_size = config.vocab_size + # # Retrieve the embeddings and clone theme + # model_embed = model.resize_token_embeddings(model_vocab_size) + # cloned_embeddings = model_embed.weight.clone() + + # # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + # model_embed = model.resize_token_embeddings(model_vocab_size + 10) + # self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # # Check that it actually resizes the embeddings matrix + # self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + + # # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + # model_embed = model.resize_token_embeddings(model_vocab_size - 15) + # self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # # Check that it actually resizes the embeddings matrix + # self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # # Check that adding and removing tokens has not modified the first part of the embedding matrix. + # models_equal = True + # for p1, p2 in zip(cloned_embeddings, model_embed.weight): + # if p1.data.ne(p2.data).sum() > 0: + # models_equal = False + + # self.assertTrue(models_equal) + + + def test_tie_model_weights(self): + pass + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # def check_same_values(layer_1, layer_2): + # equal = True + # for p1, p2 in zip(layer_1.weight, layer_2.weight): + # if p1.data.ne(p2.data).sum() > 0: + # equal = False + # return equal + + # for model_class in self.all_model_classes: + # if not hasattr(model_class, 'tie_weights'): + # continue + + # config.torchscript = True + # model_not_tied = model_class(config) + # params_not_tied = list(model_not_tied.parameters()) + + # config_tied = copy.deepcopy(config) + # config_tied.torchscript = False + # model_tied = model_class(config_tied) + # params_tied = list(model_tied.parameters()) + + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertGreater(len(params_not_tied), len(params_tied)) + + # # Check that after resize they remain tied. 
+ # model_tied.resize_token_embeddings(config.vocab_size + 10) + # params_tied_2 = list(model_tied.parameters()) + # self.assertGreater(len(params_not_tied), len(params_tied)) + # self.assertEqual(len(params_tied_2), len(params_tied)) + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] + self.assertTrue(tf.math.equal(first, second).numpy().all()) + + +def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + output = tf.constant(values, + shape=shape, + dtype=dtype if dtype is not None else tf.int32) + + return output + + +class TFModelUtilsTest(unittest.TestCase): + @pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow") + def test_model_from_pretrained(self): + pass + # logging.basicConfig(level=logging.INFO) + # for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + # config = BertConfig.from_pretrained(model_name) + # self.assertIsNotNone(config) + # self.assertIsInstance(config, PretrainedConfig) + + # model = BertModel.from_pretrained(model_name) + # model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) + # self.assertIsNotNone(model) + # self.assertIsInstance(model, PreTrainedModel) + # for value in loading_info.values(): + # self.assertEqual(len(value), 0) + + # config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + # model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + # self.assertEqual(model.config.output_attentions, True) + # self.assertEqual(model.config.output_hidden_states, True) + # self.assertEqual(model.config, config) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py new file mode 100644 index 00000000000..e6d37959144 --- /dev/null +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -0,0 +1,222 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
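The ids_tensor helper defined at the end of modeling_tf_common_test.py above is how every tester in these files fabricates inputs: token ids are drawn up to the dummy vocab size, and attention masks are faked by sampling with vocab_size=2. A short usage sketch, assuming it is imported from that module (shapes mirror the testers' defaults, values are random):

    import tensorflow as tf
    from .modeling_tf_common_test import ids_tensor

    # ids_tensor(shape, vocab_size) returns a random int32 tensor with entries in [0, vocab_size - 1].
    input_ids = ids_tensor([13, 7], 99)                  # fake (batch_size, seq_length) token ids
    attention_mask = ids_tensor([13, 7], vocab_size=2)   # the testers fake 0/1 masks the same way

    assert input_ids.shape == (13, 7)
    assert input_ids.dtype == tf.int32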
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import pytest + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import DistilBertConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_distilbert import (TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification) if is_tf_available() else None + test_pruning = True + test_torchscript = True + test_resize_embeddings = True + test_head_masking = True + + class TFDistilBertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DistilBertConfig( + vocab_size_or_config_json_file=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_distilbert_model(self, config, input_ids, 
input_mask, sequence_labels, token_labels, choice_labels): + model = TFDistilBertModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask} + + outputs = model(inputs) + sequence_output = outputs[0] + + inputs = [input_ids, input_mask] + + (sequence_output,) = model(inputs) + + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFDistilBertForMaskedLM(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask} + (prediction_scores,) = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFDistilBertForQuestionAnswering(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask} + start_logits, end_logits = model(inputs) + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + } + self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + + def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = TFDistilBertForSequenceClassification(config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask} + (logits,) = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.num_labels]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFDistilBertModelTest.TFDistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + # @pytest.mark.slow + # def test_model_from_pretrained(self): + # cache_dir = "/tmp/transformers_test/" + # for model_name in 
list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) + # shutil.rmtree(cache_dir) + # self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py new file mode 100644 index 00000000000..658456d15bf --- /dev/null +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import sys + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import GPT2Config, is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel) if is_tf_available() else () + # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else () + + class TFGPT2ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, 
self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + + def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPT2Model(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + sequence_output = model(inputs)[0] + + inputs = [input_ids, None, input_mask] # None is the input for 'past' + sequence_output = model(inputs)[0] + + sequence_output = model(input_ids)[0] + + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPT2LMHeadModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + prediction_scores = model(inputs)[0] + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + model = TFGPT2DoubleHeadsModel(config=config) + + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + + inputs = {'input_ids': multiple_choice_inputs_ids, + 'mc_token_ids': mc_token_ids, + 'attention_mask': multiple_choice_input_mask, + 'token_type_ids': multiple_choice_token_type_ids} + lm_logits, mc_logits = model(inputs)[:2] + result = { + "lm_logits": lm_logits.numpy(), + "mc_logits": mc_logits.numpy() + } + self.parent.assertListEqual( + list(result["lm_logits"].shape), + [self.batch_size, self.num_choices, self.seq_length, 
self.vocab_size]) + self.parent.assertListEqual( + list(result["mc_logits"].shape), + [self.batch_size, self.num_choices]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + (config, input_ids, input_mask, head_mask, token_type_ids, + mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs) + + def test_gpt2_double_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() + diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py new file mode 100644 index 00000000000..d470c8862da --- /dev/null +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
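The create_and_check_* methods above call each TF 2.0 model in three interchangeable ways: a bare tensor, a positional list (with None standing in for the optional 'past' input in the GPT-2 case), and a keyword dict, because a Keras layer takes a single positional input. A rough standalone illustration of the same three call styles with a small randomly initialised TFGPT2Model; the config values are arbitrary and no real checkpoint is involved:

    import tensorflow as tf
    from transformers import GPT2Config, is_tf_available

    if is_tf_available():
        from transformers.modeling_tf_gpt2 import TFGPT2Model

        config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=32, n_layer=2,
                            n_head=4, n_positions=64, n_ctx=64)
        model = TFGPT2Model(config)

        input_ids = tf.constant([[10, 11, 12, 13]])
        attention_mask = tf.constant([[1, 1, 1, 1]])

        out_tensor = model(input_ids)[0]                         # single tensor
        out_list = model([input_ids, None, attention_mask])[0]   # positional list, None = 'past'
        out_dict = model({'input_ids': input_ids,
                          'attention_mask': attention_mask})[0]  # keyword dict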
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import sys + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import OpenAIGPTConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () + + class TFOpenAIGPTModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + 
n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + + def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFOpenAIGPTModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + sequence_output = model(inputs)[0] + + inputs = [input_ids, input_mask] + sequence_output = model(inputs)[0] + + sequence_output = model(input_ids)[0] + + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFOpenAIGPTLMHeadModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + prediction_scores = model(inputs)[0] + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + model = TFOpenAIGPTDoubleHeadsModel(config=config) + + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + + inputs = {'input_ids': multiple_choice_inputs_ids, + 'mc_token_ids': mc_token_ids, + 'attention_mask': multiple_choice_input_mask, + 'token_type_ids': multiple_choice_token_type_ids} + lm_logits, mc_logits = model(inputs)[:2] + result = { + "lm_logits": lm_logits.numpy(), + "mc_logits": mc_logits.numpy() + } + self.parent.assertListEqual( + list(result["lm_logits"].shape), + [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(result["mc_logits"].shape), + [self.batch_size, self.num_choices]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + (config, input_ids, input_mask, head_mask, token_type_ids, + mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs) + + def 
test_openai_gpt_double_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() + diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py new file mode 100644 index 00000000000..735c9aae27a --- /dev/null +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import RobertaConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + import numpy + from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM, + TFRobertaForSequenceClassification) if is_tf_available() else () + + class TFRobertaModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + 
self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RobertaConfig( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = TFRobertaModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + sequence_output = model(inputs)[0] + + inputs = [input_ids, input_mask] + sequence_output = model(inputs)[0] + + sequence_output = model(input_ids)[0] + + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = TFRobertaForMaskedLM(config=config) + prediction_scores = model([input_ids, input_mask, token_type_ids])[0] + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFRobertaModelTest.TFRobertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_roberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs) + + @pytest.mark.slow + def 
test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + + +class TFRobertaModelIntegrationTest(unittest.TestCase): + + @pytest.mark.slow + def test_inference_masked_lm(self): + model = TFRobertaForMaskedLM.from_pretrained('roberta-base') + + input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = [1, 11, 50265] + self.assertEqual( + list(output.numpy().shape), + expected_shape + ) + # compare the actual values for a slice. + expected_slice = tf.constant( + [[[33.8843, -4.3107, 22.7779], + [ 4.6533, -2.8099, 13.6252], + [ 1.8222, -3.6898, 8.8600]]] + ) + self.assertTrue( + numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + ) + + @pytest.mark.slow + def test_inference_no_head(self): + model = TFRobertaModel.from_pretrained('roberta-base') + + input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = tf.constant( + [[[-0.0231, 0.0782, 0.0074], + [-0.1854, 0.0539, -0.0174], + [ 0.0548, 0.0799, 0.1687]]] + ) + self.assertTrue( + numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + ) + + @pytest.mark.slow + def test_inference_classification_head(self): + model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + + input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = [1, 3] + self.assertEqual( + list(output.numpy().shape), + expected_shape + ) + expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue( + numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py new file mode 100644 index 00000000000..534fe396468 --- /dev/null +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
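The TFRobertaModelIntegrationTest above shows the pattern for checking real checkpoints: tests are decorated with @pytest.mark.slow, so they can be deselected with pytest's standard marker filter (-m "not slow") or run on their own (-m slow), and they compare a small slice of the output against reference values. A minimal sketch of the same idea that only asserts the output shape; roberta-base's 768 hidden size is the published model dimension, the rest is illustrative:

    import pytest
    import tensorflow as tf
    from transformers import is_tf_available

    if is_tf_available():
        from transformers.modeling_tf_roberta import TFRobertaModel

        @pytest.mark.slow
        def test_roberta_base_sequence_output_shape():
            model = TFRobertaModel.from_pretrained('roberta-base')
            input_ids = tf.constant([[0, 31414, 232, 2]])   # ids taken from the integration tests above
            sequence_output = model(input_ids)[0]
            # roberta-base uses a hidden size of 768, so the first output is (batch, seq_len, hidden).
            assert list(sequence_output.shape) == [1, 4, 768]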
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import random +import shutil +import pytest + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import TransfoXLConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + + class TFTransfoXLModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.mem_len = mem_len + self.key_len = seq_length + mem_len + self.clamp_len = clamp_len + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.cutoffs = cutoffs + self.hidden_size = hidden_size + self.d_embed = d_embed + self.num_attention_heads = num_attention_heads + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.num_hidden_layers = num_hidden_layers + self.scope = scope + self.seed = seed + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = TransfoXLConfig( + vocab_size_or_config_json_file=self.vocab_size, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + cutoffs=self.cutoffs, + d_model=self.hidden_size, + d_embed=self.d_embed, + n_head=self.num_attention_heads, + d_head=self.d_head, + d_inner=self.d_inner, + div_val=self.div_val, + n_layer=self.num_hidden_layers) + + return (config, input_ids_1, input_ids_2, lm_labels) + + def set_seed(self): + random.seed(self.seed) + tf.random.set_seed(self.seed) + + def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): + model = TFTransfoXLModel(config) + + hidden_states_1, mems_1 = model(input_ids_1) + + inputs = {'input_ids': input_ids_2, + 'mems': mems_1} + + hidden_states_2, mems_2 = model(inputs) + + result = { + "hidden_states_1": hidden_states_1.numpy(), + "mems_1": [mem.numpy() for mem in mems_1], + "hidden_states_2": hidden_states_2.numpy(), + "mems_2": [mem.numpy() for mem in mems_2], + } + + self.parent.assertListEqual( + list(result["hidden_states_1"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(result["hidden_states_2"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + self.parent.assertListEqual( + 
list(list(mem.shape) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + + def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): + model = TFTransfoXLLMHeadModel(config) + + lm_logits_1, mems_1 = model(input_ids_1) + + inputs = {'input_ids': input_ids_1, + 'labels': lm_labels} + _, mems_1 = model(inputs) + + lm_logits_2, mems_2 = model([input_ids_2, mems_1]) + + inputs = {'input_ids': input_ids_1, + 'mems': mems_1, + 'labels': lm_labels} + + _, mems_2 = model(inputs) + + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "lm_logits_1": lm_logits_1.numpy(), + "mems_2": [mem.numpy() for mem in mems_2], + "lm_logits_2": lm_logits_2.numpy(), + } + + self.parent.assertListEqual( + list(result["lm_logits_1"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + self.parent.assertListEqual( + list(result["lm_logits_2"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids_1} + return config, inputs_dict + + + def setUp(self): + self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self) + self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_transfo_xl_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_model(*config_and_inputs) + + def test_transfo_xl_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py new file mode 100644 index 00000000000..1bd661bebf6 --- /dev/null +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from transformers import is_tf_available + +if is_tf_available(): + import tensorflow as tf + from transformers import (XLMConfig, TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple) if is_tf_available() else () + + + class TFXLMModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_lengths = use_input_lengths + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.asm = asm + self.n_langs = n_langs + self.vocab_size = vocab_size + self.n_special = n_special + self.summary_type = summary_type + self.causal = causal + self.use_proj = use_proj + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.n_langs = n_langs + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.summary_type = summary_type + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + + config = XLMConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + 
n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj) + + return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + + def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMModel(config=config) + inputs = {'input_ids': input_ids, + 'lengths': input_lengths, + 'langs': token_type_ids} + outputs = model(inputs) + + inputs = [input_ids, input_mask] + outputs = model(inputs) + sequence_output = outputs[0] + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMWithLMHeadModel(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths, + 'langs': token_type_ids} + outputs = model(inputs) + + logits = outputs[0] + + result = { + "logits": logits.numpy(), + } + + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMForQuestionAnsweringSimple(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths} + + outputs = model(inputs) + start_logits, end_logits = model(inputs) + + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + } + + self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + + + def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMForSequenceClassification(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths} + + (logits,) = model(inputs) + + result = { + "logits": logits.numpy(), + } + + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.type_sequence_label_size]) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_lengths, + sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'langs': token_type_ids, 'lengths': input_lengths} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFXLMModelTest.TFXLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlm_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) + + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) + + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) + + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFXLMModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py new file mode 100644 index 00000000000..6a0434938f4 --- /dev/null +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -0,0 +1,302 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
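As with Transfo-XL, the XLM checks above pass a dict of named inputs; for XLM this additionally includes per-token language ids (`langs`) and per-sequence `lengths` from which the attention mask is built. A minimal sketch of that input format (not part of the diff; the configuration mirrors TFXLMModelTester above and the tensors are placeholders):

import tensorflow as tf

from transformers import XLMConfig, TFXLMModel

# Tiny configuration mirroring TFXLMModelTester above (values are illustrative only).
config = XLMConfig(vocab_size_or_config_json_file=99, emb_dim=32, n_layers=2, n_heads=4,
                   dropout=0.1, attention_dropout=0.1, gelu_activation=True,
                   sinusoidal_embeddings=False, causal=False, asm=False, n_langs=2,
                   max_position_embeddings=512, summary_type="last", use_proj=True)
model = TFXLMModel(config)

input_ids = tf.constant([[4, 11, 27, 5]])    # (batch, seq_len) of token ids
langs = tf.zeros_like(input_ids)             # one language id per token
lengths = tf.constant([4])                   # true length of each sequence in the batch

outputs = model({'input_ids': input_ids, 'langs': langs, 'lengths': lengths})
sequence_output = outputs[0]                 # (batch, seq_len, emb_dim)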
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import unittest +import json +import random +import shutil +import pytest + +from transformers import XLNetConfig, is_tf_available + +if is_tf_available(): + import tensorflow as tf + + from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple) if is_tf_available() else () + test_pruning = False + + class TFXLNetModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + max_position_embeddings=10, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.mem_len = mem_len + # self.key_len = seq_length + mem_len + self.clamp_len = clamp_len + self.reuse_len = reuse_len + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.cutoffs = cutoffs + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.d_inner = d_inner + self.num_hidden_layers = num_hidden_layers + self.max_position_embeddings = max_position_embeddings + self.bi_data = bi_data + self.untie_r = untie_r + self.same_length = same_length + self.initializer_range = initializer_range + self.seed = seed + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) + + input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) + perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32) + perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32) + perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1) + # perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32) + target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32) + target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1) + # target_mapping[:, 0, -1] = 1.0 # predict last token + + sequence_labels = None + lm_labels = None + is_impossible_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + is_impossible_labels 
= ids_tensor([self.batch_size], 2, dtype=tf.float32) + + config = XLNetConfig( + vocab_size_or_config_json_file=self.vocab_size, + d_model=self.hidden_size, + n_head=self.num_attention_heads, + d_inner=self.d_inner, + n_layer=self.num_hidden_layers, + untie_r=self.untie_r, + max_position_embeddings=self.max_position_embeddings, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + same_length=self.same_length, + reuse_len=self.reuse_len, + bi_data=self.bi_data, + initializer_range=self.initializer_range, + num_labels=self.type_sequence_label_size) + + return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels) + + def set_seed(self): + random.seed(self.seed) + tf.random.set_seed(self.seed) + + def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = TFXLNetModel(config) + + inputs = {'input_ids': input_ids_1, + 'input_mask': input_mask, + 'token_type_ids': segment_ids} + + _, _ = model(inputs) + + inputs = [input_ids_1, input_mask] + + outputs, mems_1 = model(inputs) + + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "outputs": outputs.numpy(), + } + + self.parent.assertListEqual( + list(result["outputs"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = TFXLNetLMHeadModel(config) + + inputs_1 = {'input_ids': input_ids_1, + 'token_type_ids': segment_ids} + + all_logits_1, mems_1 = model(inputs_1) + + inputs_2 = {'input_ids': input_ids_2, + 'mems': mems_1, + 'token_type_ids': segment_ids} + + all_logits_2, mems_2 = model(inputs_2) + + inputs_3 = {'input_ids': input_ids_q, + 'perm_mask': perm_mask, + 'target_mapping': target_mapping} + + logits, _ = model(inputs_3) + + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "all_logits_1": all_logits_1.numpy(), + "mems_2": [mem.numpy() for mem in mems_2], + "all_logits_2": all_logits_2.numpy(), + } + + self.parent.assertListEqual( + list(result["all_logits_1"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + self.parent.assertListEqual( + list(result["all_logits_2"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = TFXLNetForQuestionAnsweringSimple(config) + + inputs = {'input_ids': input_ids_1, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids} + start_logits, end_logits, mems = model(inputs) + + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + "mems": [m.numpy() for m in mems], + } + + 
self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = TFXLNetForSequenceClassification(config) + + logits, mems_1 = model(input_ids_1) + + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "logits": logits.numpy(), + } + + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.type_sequence_label_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, + sequence_labels, is_impossible_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids_1} + return config, inputs_dict + + + def setUp(self): + self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlnet_base_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) + + def test_xlnet_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) + + def test_xlnet_sequence_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) + + def test_xlnet_qa(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py similarity index 94% rename from pytorch_transformers/tests/modeling_transfo_xl_test.py rename to transformers/tests/modeling_transfo_xl_test.py index f482c472022..f7b913da5b1 100644 --- a/pytorch_transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -21,17 +21,21 @@ import random import shutil import pytest -import torch +from transformers import is_torch_available -from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) -from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP +if 
is_torch_available(): + import torch + from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) + from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester class TransfoXLModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) + all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else () test_pruning = False test_torchscript = False test_resize_embeddings = False @@ -202,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py similarity index 82% rename from pytorch_transformers/tests/modeling_xlm_test.py rename to transformers/tests/modeling_xlm_test.py index dcd09634770..0133febb581 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -20,8 +20,14 @@ import unittest import shutil import pytest -from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) -from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP +from transformers import is_torch_available + +if is_torch_available(): + from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, + XLMForSequenceClassification, XLMForQuestionAnsweringSimple) + from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester @@ -29,9 +35,9 @@ from .configuration_common_test import ConfigTester class XLMModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (XLMModel, XLMWithLMHeadModel, - XLMForQuestionAnswering, XLMForSequenceClassification) - # , XLMForSequenceClassification, XLMForTokenClassification), + all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, + XLMForSequenceClassification, XLMForQuestionAnsweringSimple) if is_torch_available() else () + class XLMModelTester(object): @@ -174,12 +180,36 @@ class XLMModelTest(CommonTestCases.CommonModelTester): [self.batch_size, self.seq_length, self.vocab_size]) + def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = XLMForQuestionAnsweringSimple(config) + model.eval() + + outputs = model(input_ids) + + outputs = model(input_ids, start_positions=sequence_labels, + end_positions=sequence_labels) + loss, start_logits, end_logits = outputs + + result = { + "loss": loss, + "start_logits": start_logits, + "end_logits": end_logits, + } + self.parent.assertListEqual( + list(result["start_logits"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].size()), + [self.batch_size, self.seq_length]) + 
self.check_loss_output(result) + + def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): model = XLMForQuestionAnswering(config) model.eval() outputs = model(input_ids) - start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels, @@ -266,24 +296,25 @@ class XLMModelTest(CommonTestCases.CommonModelTester): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_model(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) + def test_xlm_simple_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_question_answering(*config_and_inputs) + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_sequence_classification(*config_and_inputs) - - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_token_classification(*config_and_inputs) + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py similarity index 96% rename from pytorch_transformers/tests/modeling_xlnet_test.py rename to transformers/tests/modeling_xlnet_test.py index 4445bc17ac4..10cbdaf37ba 100644 --- a/pytorch_transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -23,10 +23,15 @@ import random import shutil import pytest -import torch +from transformers import is_torch_available -from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) -from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP +if is_torch_available(): + import torch + + from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) + from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import 
ConfigTester @@ -34,7 +39,7 @@ from .configuration_common_test import ConfigTester class XLNetModelTest(CommonTestCases.CommonModelTester): all_model_classes=(XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForQuestionAnswering) + XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else () test_pruning = False class XLNetModelTester(object): @@ -312,7 +317,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): - cache_dir = "/tmp/pytorch_transformers_test/" + cache_dir = "/tmp/transformers_test/" for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) diff --git a/pytorch_transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py similarity index 92% rename from pytorch_transformers/tests/optimization_test.py rename to transformers/tests/optimization_test.py index 01465415827..84dbaca52a9 100644 --- a/pytorch_transformers/tests/optimization_test.py +++ b/transformers/tests/optimization_test.py @@ -18,11 +18,17 @@ from __future__ import print_function import unittest import os +import pytest -import torch +from transformers import is_torch_available -from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, - WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) +if is_torch_available(): + import torch + + from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, + WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) +else: + pytestmark = pytest.mark.skip("Require Torch") from .tokenization_tests_commons import TemporaryDirectory @@ -71,8 +77,8 @@ class OptimizationTest(unittest.TestCase): class ScheduleInitTest(unittest.TestCase): - m = torch.nn.Linear(50, 50) - optimizer = AdamW(m.parameters(), lr=10.) + m = torch.nn.Linear(50, 50) if is_torch_available() else None + optimizer = AdamW(m.parameters(), lr=10.) 
if is_torch_available() else None num_steps = 10 def assertListAlmostEqual(self, list1, list2, tol): diff --git a/pytorch_transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py similarity index 77% rename from pytorch_transformers/tests/tokenization_auto_test.py rename to transformers/tests/tokenization_auto_test.py index f4f82083f21..0f49ec75fb2 100644 --- a/pytorch_transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -21,21 +21,20 @@ import shutil import pytest import logging -from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer -from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP -from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP +from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer +from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP class AutoTokenizerTest(unittest.TestCase): def test_tokenizer_from_pretrained(self): logging.basicConfig(level=logging.INFO) - for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]: tokenizer = AutoTokenizer.from_pretrained(model_name) self.assertIsNotNone(tokenizer) self.assertIsInstance(tokenizer, BertTokenizer) self.assertGreater(len(tokenizer), 0) - for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]: tokenizer = AutoTokenizer.from_pretrained(model_name) self.assertIsNotNone(tokenizer) self.assertIsInstance(tokenizer, GPT2Tokenizer) diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py similarity index 96% rename from pytorch_transformers/tests/tokenization_bert_test.py rename to transformers/tests/tokenization_bert_test.py index 1111683ecc5..b70941f8848 100644 --- a/pytorch_transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -18,7 +18,7 @@ import os import unittest from io import open -from pytorch_transformers.tokenization_bert import (BasicTokenizer, +from transformers.tokenization_bert import (BasicTokenizer, BertTokenizer, WordpieceTokenizer, _is_control, _is_punctuation, @@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == [101] + text + [102] assert encoded_pair == [101] + text + [102] + text_2 + [102] diff --git a/pytorch_transformers/tests/tokenization_dilbert_test.py b/transformers/tests/tokenization_distilbert_test.py similarity index 77% rename from pytorch_transformers/tests/tokenization_dilbert_test.py rename to transformers/tests/tokenization_distilbert_test.py index 42f80609981..64a88df99ff 100644 --- a/pytorch_transformers/tests/tokenization_dilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -18,7 +18,7 @@ import os import unittest from io import open -from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) +from 
transformers.tokenization_distilbert import (DistilBertTokenizer) from .tokenization_tests_commons import CommonTestCases from .tokenization_bert_test import BertTokenizationTest @@ -36,11 +36,13 @@ class DistilBertTokenizationTest(BertTokenizationTest): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \ + text_2 + [tokenizer.sep_token_id] - assert encoded_sentence == [101] + text + [102] - assert encoded_pair == [101] + text + [102] + text_2 + [102] if __name__ == '__main__': unittest.main() diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py similarity index 93% rename from pytorch_transformers/tests/tokenization_gpt2_test.py rename to transformers/tests/tokenization_gpt2_test.py index 8ee9cb0b542..a77cc75ec2e 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -19,7 +19,7 @@ import unittest import json from io import open -from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases @@ -52,14 +52,14 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u" lower newer" + output_text = u"lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] - tokens = tokenizer.tokenize(text) + tokens = tokenizer.tokenize(text, add_prefix_space=True) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py similarity index 96% rename from pytorch_transformers/tests/tokenization_openai_test.py rename to transformers/tests/tokenization_openai_test.py index 6b86416d2d6..56aa219ddcb 100644 --- a/pytorch_transformers/tests/tokenization_openai_test.py +++ b/transformers/tests/tokenization_openai_test.py @@ -18,7 +18,7 @@ import os import unittest import json -from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py similarity index 93% rename from pytorch_transformers/tests/tokenization_roberta_test.py rename to transformers/tests/tokenization_roberta_test.py index 8add2529a54..f14b26a2e4c 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -19,7 +19,7 @@ import json import unittest from io import open -from 
pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases @@ -51,14 +51,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u" lower newer" + output_text = u"lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] - tokens = tokenizer.tokenize(text) + tokens = tokenizer.tokenize(text, add_prefix_space=True) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] @@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == encoded_text_from_decode assert encoded_pair == encoded_pair_from_decode diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py similarity index 62% rename from pytorch_transformers/tests/tokenization_tests_commons.py rename to transformers/tests/tokenization_tests_commons.py index 3da0494ac44..b71ba44436a 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -186,3 +186,92 @@ class CommonTestCases: for weights_list_2 in weights_lists_2: self.assertListEqual(weights_list, weights_list_2) + + def test_mask_output(self): + if sys.version_info <= (3, 0): + return + + tokenizer = self.get_tokenizer() + + if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer": + seq_0 = "Test this method." + seq_1 = "With these inputs." + information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + assert len(sequences) == len(mask) + + def test_number_of_added_tokens(self): + tokenizer = self.get_tokenizer() + + seq_0 = "Test this method." + seq_1 = "With these inputs." + + sequences = tokenizer.encode(seq_0, seq_1) + attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences) + + def test_maximum_encoding_length_single_input(self): + tokenizer = self.get_tokenizer() + + seq_0 = "This is a sentence to be encoded." 
+ stride = 2 + + sequence = tokenizer.encode(seq_0) + num_added_tokens = tokenizer.num_added_tokens() + total_length = len(sequence) + num_added_tokens + information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride) + + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + + assert len(overflowing_tokens) == 2 + stride + assert overflowing_tokens == sequence[-(2 + stride):] + assert len(truncated_sequence) == total_length - 2 + assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2]) + + def test_maximum_encoding_length_pair_input(self): + tokenizer = self.get_tokenizer() + + seq_0 = "This is a sentence to be encoded." + seq_1 = "This is another sentence to be encoded." + stride = 2 + + sequence_0_no_special_tokens = tokenizer.encode(seq_0) + sequence_1_no_special_tokens = tokenizer.encode(seq_1) + + sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) + truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair( + tokenizer.encode(seq_0), + tokenizer.encode(seq_1)[:-2] + ) + + information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, + stride=stride, truncate_first_sequence=False) + information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, + add_special_tokens=True, stride=stride, + truncate_first_sequence=True) + + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] + + assert len(overflowing_tokens) == 2 + stride + assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):] + assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):] + assert len(truncated_sequence) == len(sequence) - 2 + assert truncated_sequence == truncated_second_sequence + + def test_encode_input_type(self): + tokenizer = self.get_tokenizer() + + sequence = "Let's encode this sequence" + + tokens = tokenizer.tokenize(sequence) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + formatted_input = tokenizer.encode(sequence, add_special_tokens=True) + + assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input + assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py similarity index 84% rename from pytorch_transformers/tests/tokenization_transfo_xl_test.py rename to transformers/tests/tokenization_transfo_xl_test.py index f881cf1d2b4..4e99484b0cc 100644 --- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py +++ b/transformers/tests/tokenization_transfo_xl_test.py @@ -16,15 +16,22 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest +import pytest from io import open -from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES +from transformers import is_torch_available -from.tokenization_tests_commons import CommonTestCases +if is_torch_available(): + import torch + from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES +else: + pytestmark = pytest.mark.skip("Require Torch") # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save + +from .tokenization_tests_commons import 
CommonTestCases class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): - tokenizer_class = TransfoXLTokenizer + tokenizer_class = TransfoXLTokenizer if is_torch_available() else None def setUp(self): super(TransfoXLTokenizationTest, self).setUp() diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py similarity index 93% rename from pytorch_transformers/tests/tokenization_utils_test.py rename to transformers/tests/tokenization_utils_test.py index 26ec2d7a394..cf55982c8f2 100644 --- a/pytorch_transformers/tests/tokenization_utils_test.py +++ b/transformers/tests/tokenization_utils_test.py @@ -19,8 +19,8 @@ from __future__ import print_function import unittest import six -from pytorch_transformers import PreTrainedTokenizer -from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer +from transformers import PreTrainedTokenizer +from transformers.tokenization_gpt2 import GPT2Tokenizer class TokenizerUtilsTest(unittest.TestCase): def check_tokenizer_from_pretrained(self, tokenizer_class): diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py similarity index 94% rename from pytorch_transformers/tests/tokenization_xlm_test.py rename to transformers/tests/tokenization_xlm_test.py index 43f1e0c5dd7..b1a71ede59b 100644 --- a/pytorch_transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -18,7 +18,7 @@ import os import unittest import json -from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES +from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases @@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == [1] + text + [1] assert encoded_pair == [1] + text + [1] + text_2 + [1] diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py similarity index 96% rename from pytorch_transformers/tests/tokenization_xlnet_test.py rename to transformers/tests/tokenization_xlnet_test.py index c603ce55f9d..f4418c7fe50 100644 --- a/pytorch_transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) from .tokenization_tests_commons import CommonTestCases @@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) 
assert encoded_sentence == text + [4, 3] assert encoded_pair == text + [4] + text_2 + [4, 3] diff --git a/pytorch_transformers/tokenization_auto.py b/transformers/tokenization_auto.py similarity index 96% rename from pytorch_transformers/tokenization_auto.py rename to transformers/tokenization_auto.py index 889774b36c9..504727dcc8e 100644 --- a/pytorch_transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -30,7 +30,7 @@ from .tokenization_distilbert import DistilBertTokenizer logger = logging.getLogger(__name__) class AutoTokenizer(object): - r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class + r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` class method. @@ -75,7 +75,7 @@ class AutoTokenizer(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: @@ -90,7 +90,7 @@ class AutoTokenizer(object): inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. - kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. Examples:: diff --git a/pytorch_transformers/tokenization_bert.py b/transformers/tokenization_bert.py similarity index 96% rename from pytorch_transformers/tokenization_bert.py rename to transformers/tokenization_bert.py index b85a4ccf9c3..42163cb8ec5 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -103,7 +103,7 @@ def whitespace_tokenize(text): class BertTokenizer(PreTrainedTokenizer): r""" Constructs a BertTokenizer. 
- :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece + :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file @@ -187,22 +187,35 @@ class BertTokenizer(PreTrainedTokenizer): out_string = ' '.join(tokens).replace(' ##', '').strip() return out_string - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP] """ return [self.cls_token_id] + token_ids + [self.sep_token_id] - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] """ sep = [self.sep_token_id] cls = [self.cls_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary to a directory or file.""" index = 0 diff --git a/pytorch_transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py similarity index 93% rename from pytorch_transformers/tokenization_distilbert.py rename to transformers/tokenization_distilbert.py index 5a6d02f98df..dfa02926d82 100644 --- a/pytorch_transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -45,7 +45,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { class DistilBertTokenizer(BertTokenizer): r""" Constructs a DistilBertTokenizer. - :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece + :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file diff --git a/pytorch_transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py similarity index 95% rename from pytorch_transformers/tokenization_gpt2.py rename to transformers/tokenization_gpt2.py index 4ebe1ad5751..3d5d3029dd9 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer): self.cache[token] = word return word - def _tokenize(self, text): - """ Tokenize a string. """ - text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with. + def _tokenize(self, text, add_prefix_space=False): + """ Tokenize a string. + Args: + - add_prefix_space (boolean, default False): + Begin the sentence with at least one space so that the first word is tokenized the same way as words inside the sentence (GPT-2 and RoBERTa tokenizers treat a leading space as part of the token). 
+ """ + if add_prefix_space: + text = ' ' + text + bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: diff --git a/pytorch_transformers/tokenization_openai.py b/transformers/tokenization_openai.py similarity index 100% rename from pytorch_transformers/tokenization_openai.py rename to transformers/tokenization_openai.py diff --git a/pytorch_transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py similarity index 86% rename from pytorch_transformers/tokenization_roberta.py rename to transformers/tokenization_roberta.py index 67808752d51..ee8e97d6bfa 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -81,14 +81,14 @@ class RobertaTokenizer(GPT2Tokenizer): sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. A RoBERTa sequence has the following format: X """ return [self.cls_token_id] + token_ids + [self.sep_token_id] - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A RoBERTa sequence pair has the following format: A B @@ -96,3 +96,15 @@ class RobertaTokenizer(GPT2Tokenizer): sep = [self.sep_token_id] cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ A RoBERTa sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] \ No newline at end of file diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py similarity index 99% rename from pytorch_transformers/tokenization_transfo_xl.py rename to transformers/tokenization_transfo_xl.py index 66bc01c1bb0..8d5a0ce9d4e 100644 --- a/pytorch_transformers/tokenization_transfo_xl.py +++ b/transformers/tokenization_transfo_xl.py @@ -26,16 +26,20 @@ import sys from collections import Counter, OrderedDict from io import open -import torch import numpy as np from .file_utils import cached_path from .tokenization_utils import PreTrainedTokenizer -if sys.version_info[0] == 2: - import cPickle as pickle -else: - import pickle +try: + import torch +except ImportError: + pass + +# if sys.version_info[0] == 2: +# import cPickle as pickle +# else: +# import pickle logger = logging.getLogger(__name__) diff --git a/pytorch_transformers/tokenization_utils.py b/transformers/tokenization_utils.py similarity index 76% rename from pytorch_transformers/tokenization_utils.py rename to transformers/tokenization_utils.py index 1e2cd59648d..e8ffff3cb9f 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -23,7 +23,12 @@ import six import copy from io import open -from .file_utils import cached_path +from .file_utils import cached_path, is_tf_available, is_torch_available + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch logger = logging.getLogger(__name__) @@ -231,13 +236,13 @@ class PreTrainedTokenizer(object): @classmethod def from_pretrained(cls, *inputs, **kwargs): r""" - Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. Args: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. cache_dir: (`optional`) string: @@ -252,7 +257,7 @@ class PreTrainedTokenizer(object): inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. - kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. 
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
 
         Examples::
 
@@ -427,7 +432,7 @@ class PreTrainedTokenizer(object):
             This won't save modifications other than (added tokens and special token mapping) you may have
             applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation).
 
-        This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+        This method makes sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         if not os.path.isdir(save_directory):
             logger.error("Saving directory ({}) should be a directory".format(save_directory))
@@ -464,7 +469,7 @@ class PreTrainedTokenizer(object):
         """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
 
-            Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         raise NotImplementedError
 
@@ -518,6 +523,30 @@ class PreTrainedTokenizer(object):
 
         return len(to_add_tokens)
 
+    def num_added_tokens(self, pair=False):
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+
+        Note:
+            This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
+            inside your training loop.
+
+        Args:
+            pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
+                number of added tokens in the case of a single sequence if set to False.
+
+        Returns:
+            Number of tokens added to sequences
+        """
+
+        if pair:
+            initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
+            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
+        else:
+            initial_tokens_len = len(self.encode("This is a sequence"))
+            final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
+
+        return final_tokens_len - initial_tokens_len
 
     def add_special_tokens(self, special_tokens_dict):
         """
@@ -663,38 +692,185 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
+    def encode(self,
+               text,
+               text_pair=None,
+               add_special_tokens=False,
+               max_length=None,
+               stride=0,
+               truncate_first_sequence=True,
+               return_tensors=None,
+               **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-
+
         Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
 
         Args:
-            text: The first sequence to be encoded.
-            text_pair: Optional second sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
+            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+                If there are overflowing tokens, those will be added to the returned dictionary
+            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
+            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
+                will be truncated.
+            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+                or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
         """
-        if text_pair is None:
-            if add_special_tokens:
-                return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
-            else:
-                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+        encoded_inputs = self.encode_plus(text,
+                                          text_pair=text_pair,
+                                          max_length=max_length,
+                                          add_special_tokens=add_special_tokens,
+                                          stride=stride,
+                                          truncate_first_sequence=truncate_first_sequence,
+                                          return_tensors=return_tensors,
+                                          **kwargs)
 
-        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
+        return encoded_inputs["input_ids"]
+
+    def encode_plus(self,
+                    text,
+                    text_pair=None,
+                    add_special_tokens=False,
+                    max_length=None,
+                    stride=0,
+                    truncate_first_sequence=True,
+                    return_tensors=None,
+                    **kwargs):
+        """
+        Returns a dictionary containing the encoded sequence or sequence pair and additional information:
+        the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
+
+        Args:
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
+            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+                If there are overflowing tokens, those will be added to the returned dictionary
+            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
+            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
+                will be truncated.
+            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+                or PyTorch torch.Tensor instead of a list of python integers.
+            **kwargs: passed to the `self.tokenize()` method
+        """
+
+        def get_input_ids(text):
+            if isinstance(text, six.string_types):
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+                return self.convert_tokens_to_ids(text)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                return text
+            else:
+                raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
+
+        first_ids = get_input_ids(text)
+        second_ids = get_input_ids(text_pair) if text_pair is not None else None
+
+        return self.prepare_for_model(first_ids,
+                                      pair_ids=second_ids,
+                                      max_length=max_length,
+                                      add_special_tokens=add_special_tokens,
+                                      stride=stride,
+                                      truncate_first_sequence=truncate_first_sequence,
+                                      return_tensors=return_tensors)
+
+
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+                          truncate_first_sequence=True, return_tensors=None):
+        """
+        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
+        It adds special tokens, truncates sequences if they overflow while taking the special tokens into account,
+        and manages a window stride for overflowing tokens.
+
+        Args:
+            ids: list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
+            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
+                list of inputs.
+            truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
+                alongside a specified `max_length`, will truncate the first sequence if the total size is greater
+                than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
+            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+                or PyTorch torch.Tensor instead of a list of python integers.
+
+        Return:
+            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+        """
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+
+        encoded_inputs = {}
+        if max_length:
+            n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
+            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
+                logger.warning(
+                    "You supplied a pair of sequences in which the sequence that will not be truncated is longer than the maximum specified length."
+ "This pair of sequences will not be truncated.") + else: + if n_added_tokens + len_ids + len_pair_ids > max_length: + if truncate_first_sequence or not pair: + encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:] + ids = ids[:max_length - len_pair_ids - n_added_tokens] + elif not truncate_first_sequence and pair: + encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:] + pair_ids = pair_ids[:max_length - len_ids - n_added_tokens] + else: + logger.warning( + "Cannot truncate second sequence as it is not provided. No truncation.") if add_special_tokens: - return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) + sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) else: - return first_sentence_tokens, second_sentence_tokens + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) - def add_special_tokens_single_sentence(self, token_ids): + if return_tensors == 'tf' and is_tf_available(): + sequence = tf.constant([sequence]) + token_type_ids = tf.constant([token_type_ids]) + elif return_tensors == 'pt' and is_torch_available(): + sequence = torch.tensor([sequence]) + token_type_ids = torch.tensor([token_type_ids]) + elif return_tensors is not None: + logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) + + encoded_inputs["input_ids"] = sequence + encoded_inputs["token_type_ids"] = token_type_ids + + return encoded_inputs + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + logger.warning("This tokenizer does not make use of special tokens.") + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def add_special_tokens_single_sequence(self, token_ids): logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.") return token_ids - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 @@ -740,7 +916,7 @@ class PreTrainedTokenizer(object): # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separatly for added tokens and byte-level tokens - # cf. https://github.com/huggingface/pytorch-transformers/issues/1133 + # cf. https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] for token in filtered_tokens: diff --git a/pytorch_transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py similarity index 98% rename from pytorch_transformers/tokenization_xlm.py rename to transformers/tokenization_xlm.py index f7231384b31..f1e49416a43 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -754,14 +754,14 @@ class XLMTokenizer(PreTrainedTokenizer): out_string = ''.join(tokens).replace('', ' ').strip() return out_string - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. 
diff --git a/pytorch_transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
similarity index 98%
rename from pytorch_transformers/tokenization_xlm.py
rename to transformers/tokenization_xlm.py
index f7231384b31..f1e49416a43 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -754,14 +754,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
-    def add_special_tokens_single_sentence(self, token_ids):
+    def add_special_tokens_single_sequence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
         An XLM sequence has the following format: [CLS] X [SEP]
         """
         return [self.cls_token_id] + token_ids + [self.sep_token_id]
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
         An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
@@ -770,6 +770,18 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLM sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):
diff --git a/pytorch_transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
similarity index 91%
rename from pytorch_transformers/tokenization_xlnet.py
rename to transformers/tokenization_xlnet.py
index 0f19d76ae64..ad9efdf0436 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -181,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
-    def add_special_tokens_single_sentence(self, token_ids):
+    def add_special_tokens_single_sequence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
         An XLNet sequence has the following format: X [SEP][CLS]
@@ -190,15 +190,29 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids + sep + cls
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
         An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
         """
+        sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLNet sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
+        | first sequence    | second sequence     | CLS segment ID
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        cls_segment_id = [2]
+
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.