diff --git a/.gitignore b/.gitignore
index 9f6b5e79f44..53a6dac0c58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,10 @@ __pycache__/
 # C extensions
 *.so
 
+# tests and logs
+tests/fixtures
+logs/
+
 # Distribution / packaging
 .Python
 build/
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index c5822619c9e..7bc19ac9344 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -352,173 +352,193 @@ if is_torch_available():
 # TensorFlow
 if is_tf_available():
     from .modeling_tf_utils import (
-        TFPreTrainedModel,
-        TFSharedEmbeddings,
-        TFSequenceSummary,
         shape_list,
         tf_top_k_top_p_filtering,
+        TFPreTrainedModel,
+        TFSequenceSummary,
+        TFSharedEmbeddings,
     )
 
     from .modeling_tf_auto import (
-        TFAutoModel,
-        TFAutoModelForPreTraining,
-        TFAutoModelForMultipleChoice,
-        TFAutoModelForSequenceClassification,
-        TFAutoModelForQuestionAnswering,
-        TFAutoModelWithLMHead,
-        TFAutoModelForTokenClassification,
         TF_MODEL_MAPPING,
+        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
         TF_MODEL_FOR_PRETRAINING_MAPPING,
-        TF_MODEL_WITH_LM_HEAD_MAPPING,
-        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-    )
-
-    from .modeling_tf_bert import (
-        TFBertPreTrainedModel,
-        TFBertMainLayer,
-        TFBertEmbeddings,
-        TFBertModel,
-        TFBertForPreTraining,
-        TFBertForMaskedLM,
-        TFBertForNextSentencePrediction,
-        TFBertForSequenceClassification,
-        TFBertForMultipleChoice,
-        TFBertForTokenClassification,
-        TFBertForQuestionAnswering,
-        TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_gpt2 import (
-        TFGPT2PreTrainedModel,
-        TFGPT2MainLayer,
-        TFGPT2Model,
-        TFGPT2LMHeadModel,
-        TFGPT2DoubleHeadsModel,
-        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_openai import (
-        TFOpenAIGPTPreTrainedModel,
-        TFOpenAIGPTMainLayer,
-        TFOpenAIGPTModel,
-        TFOpenAIGPTLMHeadModel,
-        TFOpenAIGPTDoubleHeadsModel,
-        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_transfo_xl import (
-        TFTransfoXLPreTrainedModel,
-        TFTransfoXLMainLayer,
-        TFTransfoXLModel,
-        TFTransfoXLLMHeadModel,
-        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
-        TFAdaptiveEmbedding,
-    )
-
-    from .modeling_tf_xlnet import (
-        TFXLNetPreTrainedModel,
-        TFXLNetMainLayer,
-        TFXLNetModel,
-        TFXLNetLMHeadModel,
-        TFXLNetForSequenceClassification,
-        TFXLNetForTokenClassification,
-        TFXLNetForQuestionAnsweringSimple,
-        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_xlm import (
-        TFXLMPreTrainedModel,
-        TFXLMMainLayer,
-        TFXLMModel,
-        TFXLMWithLMHeadModel,
-        TFXLMForSequenceClassification,
-        TFXLMForQuestionAnsweringSimple,
-        TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_xlm_roberta import (
-        TFXLMRobertaForMaskedLM,
-        TFXLMRobertaModel,
-        TFXLMRobertaForSequenceClassification,
-        TFXLMRobertaForTokenClassification,
-        TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_roberta import (
-        TFRobertaPreTrainedModel,
-        TFRobertaMainLayer,
-        TFRobertaModel,
-        TFRobertaForMaskedLM,
-        TFRobertaForSequenceClassification,
-        TFRobertaForTokenClassification,
-        TFRobertaForQuestionAnswering,
-        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_camembert import (
-        TFCamembertModel,
-        TFCamembertForMaskedLM,
-        TFCamembertForSequenceClassification,
-        TFCamembertForTokenClassification,
-        TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
-    )
-
-    from .modeling_tf_flaubert import (
-        TFFlaubertModel,
-
TFFlaubertWithLMHeadModel, - TFFlaubertForSequenceClassification, - TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - ) - - from .modeling_tf_distilbert import ( - TFDistilBertPreTrainedModel, - TFDistilBertMainLayer, - TFDistilBertModel, - TFDistilBertForMaskedLM, - TFDistilBertForSequenceClassification, - TFDistilBertForTokenClassification, - TFDistilBertForQuestionAnswering, - TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - ) - - from .modeling_tf_ctrl import ( - TFCTRLPreTrainedModel, - TFCTRLModel, - TFCTRLLMHeadModel, - TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TFAutoModel, + TFAutoModelForMultipleChoice, + TFAutoModelForPreTraining, + TFAutoModelForQuestionAnswering, + TFAutoModelForSequenceClassification, + TFAutoModelForTokenClassification, + TFAutoModelWithLMHead, ) from .modeling_tf_albert import ( - TFAlbertPreTrainedModel, - TFAlbertMainLayer, - TFAlbertModel, - TFAlbertForPreTraining, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFAlbertForMaskedLM, TFAlbertForMultipleChoice, - TFAlbertForSequenceClassification, + TFAlbertForPreTraining, TFAlbertForQuestionAnswering, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFAlbertForSequenceClassification, + TFAlbertForTokenClassification, + TFAlbertMainLayer, + TFAlbertModel, + TFAlbertPreTrainedModel, ) - from .modeling_tf_t5 import ( - TFT5PreTrainedModel, - TFT5Model, - TFT5ForConditionalGeneration, - TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, + from .modeling_tf_bert import ( + TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFBertEmbeddings, + TFBertForMaskedLM, + TFBertForMultipleChoice, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + TFBertMainLayer, + TFBertModel, + TFBertPreTrainedModel, + ) + + from .modeling_tf_camembert import ( + TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCamembertForMaskedLM, + TFCamembertModel, + TFCamembertForMultipleChoice, + TFCamembertForQuestionAnswering, + TFCamembertForSequenceClassification, + TFCamembertForTokenClassification, + ) + + from .modeling_tf_ctrl import ( + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCTRLLMHeadModel, + TFCTRLModel, + TFCTRLPreTrainedModel, + ) + + from .modeling_tf_distilbert import ( + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFDistilBertForMaskedLM, + TFDistilBertForMultipleChoice, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertMainLayer, + TFDistilBertModel, + TFDistilBertPreTrainedModel, ) from .modeling_tf_electra import ( - TFElectraPreTrainedModel, - TFElectraModel, - TFElectraForPreTraining, - TFElectraForMaskedLM, - TFElectraForTokenClassification, TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFElectraForMaskedLM, + TFElectraForPreTraining, + TFElectraForQuestionAnswering, + TFElectraForTokenClassification, + TFElectraModel, + TFElectraPreTrainedModel, + ) + + from .modeling_tf_flaubert import ( + TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFFlaubertForMultipleChoice, + TFFlaubertForQuestionAnsweringSimple, + TFFlaubertForSequenceClassification, + TFFlaubertForTokenClassification, + TFFlaubertWithLMHeadModel, + TFFlaubertModel, + ) + + from .modeling_tf_gpt2 import ( + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, + TFGPT2DoubleHeadsModel, + TFGPT2LMHeadModel, + TFGPT2MainLayer, + TFGPT2Model, + TFGPT2PreTrainedModel, + ) + + from .modeling_tf_openai import ( + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFOpenAIGPTDoubleHeadsModel, + 
TFOpenAIGPTLMHeadModel, + TFOpenAIGPTMainLayer, + TFOpenAIGPTModel, + TFOpenAIGPTPreTrainedModel, + ) + + from .modeling_tf_roberta import ( + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFRobertaForMaskedLM, + TFRobertaForMultipleChoice, + TFRobertaForQuestionAnswering, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TFRobertaMainLayer, + TFRobertaModel, + TFRobertaPreTrainedModel, + ) + + from .modeling_tf_t5 import ( + TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, + TFT5ForConditionalGeneration, + TFT5Model, + TFT5PreTrainedModel, + ) + + from .modeling_tf_transfo_xl import ( + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, + TFAdaptiveEmbedding, + TFTransfoXLLMHeadModel, + TFTransfoXLMainLayer, + TFTransfoXLModel, + TFTransfoXLPreTrainedModel, + ) + + from .modeling_tf_xlm import ( + TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXLMForMultipleChoice, + TFXLMForQuestionAnsweringSimple, + TFXLMForSequenceClassification, + TFXLMForTokenClassification, + TFXLMWithLMHeadModel, + TFXLMMainLayer, + TFXLMModel, + TFXLMPreTrainedModel, + ) + + from .modeling_tf_xlm_roberta import ( + TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXLMRobertaForMaskedLM, + TFXLMRobertaForMultipleChoice, + TFXLMRobertaForQuestionAnswering, + TFXLMRobertaForSequenceClassification, + TFXLMRobertaForTokenClassification, + TFXLMRobertaModel, + ) + + from .modeling_tf_xlnet import ( + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXLNetForMultipleChoice, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetLMHeadModel, + TFXLNetMainLayer, + TFXLNetModel, + TFXLNetPreTrainedModel, ) # Optimization - from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator + from .optimization_tf import ( + AdamWeightDecay, + create_optimizer, + GradientAccumulator, + WarmUp, + ) # Trainer from .trainer_tf import TFTrainer diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 3b39041fd6b..b717774a614 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -394,8 +394,8 @@ def squad_convert_examples_to_features( "qas_id": ex.qas_id, }, { - "start_position": ex.start_position, - "end_position": ex.end_position, + "start_positions": ex.start_position, + "end_positions": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, "is_impossible": ex.is_impossible, @@ -412,8 +412,8 @@ def squad_convert_examples_to_features( "qas_id": tf.string, }, { - "start_position": tf.int64, - "end_position": tf.int64, + "start_positions": tf.int64, + "end_positions": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32, "is_impossible": tf.int32, @@ -429,8 +429,8 @@ def squad_convert_examples_to_features( "qas_id": tf.TensorShape([]), }, { - "start_position": tf.TensorShape([]), - "end_position": tf.TensorShape([]), + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), "cls_index": tf.TensorShape([]), "p_mask": tf.TensorShape([None]), "is_impossible": tf.TensorShape([]), diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index d37951e4bb6..7d3e2d02e51 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -81,6 +81,8 @@ class HfArgumentParser(ArgumentParser): kwargs["type"] = field.type if field.default is not dataclasses.MISSING: kwargs["default"] = field.default + elif field.default_factory is not dataclasses.MISSING: + kwargs["default"] = 
field.default_factory() else: kwargs["required"] = True self.add_argument(field_name, **kwargs) diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index e2d33d329d4..752f7149b37 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -23,7 +23,16 @@ import tensorflow as tf from .configuration_albert import AlbertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertSelfAttention -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list +from .modeling_tf_utils import ( + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) from .tokenization_utils import BatchEncoding @@ -841,7 +850,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, ALBERT_START_DOCSTRING, ) -class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): +class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -852,9 +861,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def call(self, inputs, **kwargs): + @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
+ Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) @@ -878,27 +903,126 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - outputs = self.albert(inputs, **kwargs) + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) + pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # logits, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Albert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.albert = TFAlbertMainLayer(config, name="albert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import AlbertTokenizer, TFAlbertForTokenClassification + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ALBERT_START_DOCSTRING, ) -class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): +class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -908,9 +1032,32 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def call(self, inputs, **kwargs): + @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): @@ -938,14 +1085,23 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2') - input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") - start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ - outputs = self.albert(inputs, **kwargs) + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] @@ -956,7 +1112,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): outputs = (start_logits, end_logits,) + outputs[2:] - return outputs # start_logits, end_logits, (hidden_states), (attentions) + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) @add_start_docstrings( @@ -964,7 +1126,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ALBERT_START_DOCSTRING, ) -class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): +class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -992,9 +1154,15 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, + labels=None, training=False, ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: @@ -1019,12 +1187,13 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - example1 = ["This is a context", "Is it a context? Yes"] - example2 = ["This is a context", "Is it a context? 
No"] - encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128) - outputs = model(encoding["input_ids"][None, :]) - logits = outputs[0] + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] """ if isinstance(inputs, (tuple, list)): @@ -1036,10 +1205,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - print("isdict(1)") input_ids = inputs.get("input_ids") - print(input_ids) - attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) position_ids = inputs.get("position_ids", position_ids) @@ -1080,4 +1246,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # reshaped_logits, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index cd300019d88..23521424fe7 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -22,14 +22,18 @@ from .configuration_auto import ( AlbertConfig, AutoConfig, BertConfig, + CamembertConfig, CTRLConfig, DistilBertConfig, + ElectraConfig, + FlaubertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, T5Config, TransfoXLConfig, XLMConfig, + XLMRobertaConfig, XLNetConfig, ) from .configuration_utils import PretrainedConfig @@ -39,6 +43,7 @@ from .modeling_tf_albert import ( TFAlbertForPreTraining, TFAlbertForQuestionAnswering, TFAlbertForSequenceClassification, + TFAlbertForTokenClassification, TFAlbertModel, ) from .modeling_tf_bert import ( @@ -50,18 +55,43 @@ from .modeling_tf_bert import ( TFBertForTokenClassification, TFBertModel, ) +from .modeling_tf_camembert import ( + TFCamembertForMaskedLM, + TFCamembertForMultipleChoice, + TFCamembertForQuestionAnswering, + TFCamembertForSequenceClassification, + TFCamembertForTokenClassification, + TFCamembertModel, +) from .modeling_tf_ctrl import TFCTRLLMHeadModel, TFCTRLModel from .modeling_tf_distilbert import ( TFDistilBertForMaskedLM, + TFDistilBertForMultipleChoice, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertModel, ) +from .modeling_tf_electra import ( + TFElectraForMaskedLM, + TFElectraForPreTraining, + TFElectraForQuestionAnswering, + TFElectraForTokenClassification, + TFElectraModel, +) +from .modeling_tf_flaubert import ( + TFFlaubertForMultipleChoice, + TFFlaubertForQuestionAnsweringSimple, + TFFlaubertForSequenceClassification, + TFFlaubertForTokenClassification, + TFFlaubertModel, + TFFlaubertWithLMHeadModel, +) from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel from .modeling_tf_roberta import ( TFRobertaForMaskedLM, + TFRobertaForMultipleChoice, TFRobertaForQuestionAnswering, TFRobertaForSequenceClassification, 
TFRobertaForTokenClassification, @@ -70,12 +100,23 @@ from .modeling_tf_roberta import ( from .modeling_tf_t5 import TFT5ForConditionalGeneration, TFT5Model from .modeling_tf_transfo_xl import TFTransfoXLLMHeadModel, TFTransfoXLModel from .modeling_tf_xlm import ( + TFXLMForMultipleChoice, TFXLMForQuestionAnsweringSimple, TFXLMForSequenceClassification, + TFXLMForTokenClassification, TFXLMModel, TFXLMWithLMHeadModel, ) +from .modeling_tf_xlm_roberta import ( + TFXLMRobertaForMaskedLM, + TFXLMRobertaForMultipleChoice, + TFXLMRobertaForQuestionAnswering, + TFXLMRobertaForSequenceClassification, + TFXLMRobertaForTokenClassification, + TFXLMRobertaModel, +) from .modeling_tf_xlnet import ( + TFXLNetForMultipleChoice, TFXLNetForQuestionAnsweringSimple, TFXLNetForSequenceClassification, TFXLNetForTokenClassification, @@ -89,83 +130,118 @@ logger = logging.getLogger(__name__) TF_MODEL_MAPPING = OrderedDict( [ - (T5Config, TFT5Model), - (DistilBertConfig, TFDistilBertModel), (AlbertConfig, TFAlbertModel), - (RobertaConfig, TFRobertaModel), (BertConfig, TFBertModel), - (OpenAIGPTConfig, TFOpenAIGPTModel), - (GPT2Config, TFGPT2Model), - (TransfoXLConfig, TFTransfoXLModel), - (XLNetConfig, TFXLNetModel), - (XLMConfig, TFXLMModel), + (CamembertConfig, TFCamembertModel), (CTRLConfig, TFCTRLModel), + (DistilBertConfig, TFDistilBertModel), + (ElectraConfig, TFElectraModel), + (FlaubertConfig, TFFlaubertModel), + (GPT2Config, TFGPT2Model), + (OpenAIGPTConfig, TFOpenAIGPTModel), + (RobertaConfig, TFRobertaModel), + (T5Config, TFT5Model), + (TransfoXLConfig, TFTransfoXLModel), + (XLMConfig, TFXLMModel), + (XLMRobertaConfig, TFXLMRobertaModel), + (XLNetConfig, TFXLNetModel), ] ) TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ - (T5Config, TFT5ForConditionalGeneration), - (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForPreTraining), - (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForPreTraining), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLNetConfig, TFXLNetLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), + (CamembertConfig, TFCamembertForMaskedLM), (CTRLConfig, TFCTRLLMHeadModel), + (DistilBertConfig, TFDistilBertForMaskedLM), + (ElectraConfig, TFElectraForPreTraining), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (RobertaConfig, TFRobertaForMaskedLM), + (T5Config, TFT5ForConditionalGeneration), + (TransfoXLConfig, TFTransfoXLLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), + (XLNetConfig, TFXLNetLMHeadModel), ] ) TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ - (T5Config, TFT5ForConditionalGeneration), - (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), - (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForMaskedLM), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLNetConfig, TFXLNetLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), + (CamembertConfig, TFCamembertForMaskedLM), (CTRLConfig, TFCTRLLMHeadModel), + (DistilBertConfig, TFDistilBertForMaskedLM), + (ElectraConfig, TFElectraForMaskedLM), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (RobertaConfig, TFRobertaForMaskedLM), + (T5Config, TFT5ForConditionalGeneration), + (TransfoXLConfig, 
TFTransfoXLLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), + (XLNetConfig, TFXLNetLMHeadModel), + ] +) + +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( + [ + (AlbertConfig, TFAlbertForMultipleChoice), + (BertConfig, TFBertForMultipleChoice), + (CamembertConfig, TFCamembertForMultipleChoice), + (DistilBertConfig, TFDistilBertForMultipleChoice), + (FlaubertConfig, TFFlaubertForMultipleChoice), + (RobertaConfig, TFRobertaForMultipleChoice), + (XLMConfig, TFXLMForMultipleChoice), + (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), + (XLNetConfig, TFXLNetForMultipleChoice), + ] +) + +TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( + [ + (AlbertConfig, TFAlbertForQuestionAnswering), + (BertConfig, TFBertForQuestionAnswering), + (CamembertConfig, TFCamembertForQuestionAnswering), + (DistilBertConfig, TFDistilBertForQuestionAnswering), + (ElectraConfig, TFElectraForQuestionAnswering), + (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple), + (RobertaConfig, TFRobertaForQuestionAnswering), + (XLMConfig, TFXLMForQuestionAnsweringSimple), + (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering), + (XLNetConfig, TFXLNetForQuestionAnsweringSimple), ] ) TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ - (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), - (RobertaConfig, TFRobertaForSequenceClassification), (BertConfig, TFBertForSequenceClassification), - (XLNetConfig, TFXLNetForSequenceClassification), + (CamembertConfig, TFCamembertForSequenceClassification), + (DistilBertConfig, TFDistilBertForSequenceClassification), + (FlaubertConfig, TFFlaubertForSequenceClassification), + (RobertaConfig, TFRobertaForSequenceClassification), (XLMConfig, TFXLMForSequenceClassification), - ] -) - -TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( - [(BertConfig, TFBertForMultipleChoice), (AlbertConfig, TFAlbertForMultipleChoice)] -) - -TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( - [ - (DistilBertConfig, TFDistilBertForQuestionAnswering), - (AlbertConfig, TFAlbertForQuestionAnswering), - (RobertaConfig, TFRobertaForQuestionAnswering), - (BertConfig, TFBertForQuestionAnswering), - (XLNetConfig, TFXLNetForQuestionAnsweringSimple), - (XLMConfig, TFXLMForQuestionAnsweringSimple), + (XLMRobertaConfig, TFXLMRobertaForSequenceClassification), + (XLNetConfig, TFXLNetForSequenceClassification), ] ) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ - (DistilBertConfig, TFDistilBertForTokenClassification), - (RobertaConfig, TFRobertaForTokenClassification), + (AlbertConfig, TFAlbertForTokenClassification), (BertConfig, TFBertForTokenClassification), + (CamembertConfig, TFCamembertForTokenClassification), + (DistilBertConfig, TFDistilBertForTokenClassification), + (ElectraConfig, TFElectraForTokenClassification), + (FlaubertConfig, TFFlaubertForTokenClassification), + (RobertaConfig, TFRobertaForTokenClassification), + (XLMConfig, TFXLMForTokenClassification), + (XLMRobertaConfig, TFXLMRobertaForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification), ] ) @@ -632,11 +708,13 @@ class TFAutoModelWithLMHead(object): """ config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): - if isinstance(config, config_class): + # Not using isinstance() here to do not take into account inheritance + if 
config_class == type(config): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 5ec6aeb7d04..001fb7ca203 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -23,7 +23,16 @@ import tensorflow as tf from .configuration_bert import BertConfig from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list +from .modeling_tf_utils import ( + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) from .tokenization_utils import BatchEncoding @@ -880,7 +889,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, ) -class TFBertForSequenceClassification(TFBertPreTrainedModel): +class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -891,9 +900,25 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def call(self, inputs, **kwargs): + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): @@ -916,21 +941,35 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - outputs = self.bert(inputs, **kwargs) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) + pooled_output = self.dropout(pooled_output, training=training) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # logits, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( @@ -938,7 +977,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING, ) -class TFBertForMultipleChoice(TFBertPreTrainedModel): +class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -966,9 +1005,15 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, + labels=None, training=False, ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: @@ -993,15 +1038,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." - choice0 = "It is eaten with a fork and a knife." - choice1 = "It is eaten while held in the hand." 
- encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='tf', pad_to_max_length=True) + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] - # linear classifier on the output is not yet trained - outputs = model(encoding['input_ids'][None, :]) - logits = outputs[0] """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -1011,7 +1055,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): head_mask = inputs[4] if len(inputs) > 4 else head_mask inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." - elif isinstance(inputs, dict): + elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) token_type_ids = inputs.get("token_type_ids", token_type_ids) @@ -1053,7 +1097,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # reshaped_logits, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( @@ -1061,7 +1109,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, ) -class TFBertForTokenClassification(TFBertPreTrainedModel): +class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -1072,9 +1120,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def call(self, inputs, **kwargs): + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): @@ -1098,20 +1160,33 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] """ - outputs = self.bert(inputs, **kwargs) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) + sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # scores, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( @@ -1119,7 +1194,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): the hidden-states output to compute `span start logits` and `span end logits`). """, BERT_START_DOCSTRING, ) -class TFBertForQuestionAnswering(TFBertPreTrainedModel): +class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -1129,9 +1204,32 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - def call(self, inputs, **kwargs): + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): @@ -1156,18 +1254,24 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') - question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - encoding = tokenizer.encode_plus(question, text) - input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] - start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :]) + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1]) + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) + answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) assert answer == "a nice puppet" """ - outputs = self.bert(inputs, **kwargs) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] @@ -1178,4 +1282,10 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): outputs = (start_logits, end_logits,) + outputs[2:] - return outputs # start_logits, end_logits, (hidden_states), (attentions) + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_camembert.py b/src/transformers/modeling_tf_camembert.py index e0c44371bcc..e7a5a1d38e8 100644 --- a/src/transformers/modeling_tf_camembert.py +++ b/src/transformers/modeling_tf_camembert.py @@ -22,6 +22,8 @@ from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, + TFRobertaForMultipleChoice, + TFRobertaForQuestionAnswering, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, @@ -114,3 +116,30 @@ class TFCamembertForTokenClassification(TFRobertaForTokenClassification): """ config_class = CamembertConfig + + +@add_start_docstrings( + """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + CAMEMBERT_START_DOCSTRING, +) +class TFCamembertForMultipleChoice(TFRobertaForMultipleChoice): + """ + This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the + superclass for the appropriate documentation alongside usage examples. 
+ """ + + config_class = CamembertConfig + + +@add_start_docstrings( + """CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, + CAMEMBERT_START_DOCSTRING, +) +class TFCamembertForQuestionAnswering(TFRobertaForQuestionAnswering): + """ + This class overrides :class:`~transformers.TFRobertaForQuestionAnswering`. Please check the + superclass for the appropriate documentation alongside usage examples. + """ + + config_class = CamembertConfig diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 2a34d4f5148..7cdeb4ae964 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -23,8 +23,18 @@ import numpy as np import tensorflow as tf from .configuration_distilbert import DistilBertConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list +from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable +from .modeling_tf_utils import ( + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSharedEmbeddings, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) from .tokenization_utils import BatchEncoding @@ -399,7 +409,10 @@ class TFTransformer(tf.keras.layers.Layer): return outputs # last-layer hidden state, (all hidden states), (all attentions) +@keras_serializable class TFDistilBertMainLayer(tf.keras.layers.Layer): + config_class = DistilBertConfig + def __init__(self, config, **kwargs): super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers @@ -662,7 +675,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, DISTILBERT_START_DOCSTRING, ) -class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): +class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -680,8 +693,16 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
+ Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): @@ -705,20 +726,32 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - distilbert_output = self.distilbert(inputs, **kwargs) + distilbert_output = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) - pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] - return outputs # logits, (hidden_states), (attentions) + + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( @@ -726,7 +759,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, DISTILBERT_START_DOCSTRING, ) -class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): +class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -738,8 +771,14 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): ) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): @@ -762,20 +801,154 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + """ - outputs = self.distilbert(inputs, **kwargs) + outputs = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) + sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # scores, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """DistilBert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.pre_classifier = tf.keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) + def call( + self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. 
(see `input_ids` above) + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + head_mask = inputs[2] if len(inputs) > 2 else head_mask + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + assert len(inputs) <= 4, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 4, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + head_mask, + inputs_embeds, + ] + + distilbert_output = self.distilbert(flat_inputs, training=training) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + distilbert_output[1:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( @@ -783,7 +956,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): the hidden-states output to compute `span start logits` and `span end logits`). """, DISTILBERT_START_DOCSTRING, ) -class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): +class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -795,8 +968,29 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): self.dropout = tf.keras.layers.Dropout(config.qa_dropout) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
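A short sketch of how the new start_positions/end_positions arguments produce the question-answering loss; the span indices below are hypothetical and would normally come from a dataset such as SQuAD:

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors="tf")

# Hypothetical gold span (token indices into the encoded question+text pair).
start_positions = tf.constant([8])
end_positions = tf.constant([10])

outputs = model(
    encoding["input_ids"],
    attention_mask=encoding["attention_mask"],
    start_positions=start_positions,
    end_positions=end_positions,
)
loss, start_logits, end_logits = outputs[:3]   # loss = (start_loss + end_loss) / 2 per example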
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): @@ -821,19 +1015,35 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - start_scores, end_scores = outputs[:2] + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) + + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) + answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ - distilbert_output = self.distilbert(inputs, **kwargs) + distilbert_output = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) - hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) + hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) outputs = (start_logits, end_logits,) + distilbert_output[1:] - return outputs # start_logits, end_logits, (hidden_states), (attentions) + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index 37dae2c6475..d26e81d9734 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -6,7 +6,13 @@ from transformers import ElectraConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel -from .modeling_tf_utils import get_initializer, shape_list +from .modeling_tf_utils import ( + TFQuestionAnsweringLoss, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) from .tokenization_utils import BatchEncoding @@ -194,6 +200,7 @@ class TFElectraPreTrainedModel(TFBertPreTrainedModel): return head_mask +@keras_serializable class TFElectraMainLayer(TFElectraPreTrainedModel): config_class = ElectraConfig @@ -557,13 +564,15 @@ Electra model with a token classification head on top.
Both the discriminator and generator may be loaded into this model.""", ELECTRA_START_DOCSTRING, ) -class TFElectraForTokenClassification(TFElectraPreTrainedModel): +class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.electra = TFElectraMainLayer(config, name="electra") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) def call( @@ -574,9 +583,14 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, + labels=None, training=False, ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): @@ -599,9 +613,11 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel): tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + """ discriminator_hidden_states = self.electra( @@ -610,7 +626,106 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel): discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) - output = (logits,) - output += discriminator_hidden_states[1:] - return output # (loss), scores, (hidden_states), (attentions) + outputs = (logits,) + discriminator_hidden_states[1:] + + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). 
""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.electra = TFElectraMainLayer(config, name="electra") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): + r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForQuestionAnswering + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') + model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator') + + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) + + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) + answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) + + """ + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + logits = self.qa_outputs(discriminator_sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + discriminator_hidden_states[1:] + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py index be4cd27a31a..ee3d78d0f24 100644 --- a/src/transformers/modeling_tf_flaubert.py +++ b/src/transformers/modeling_tf_flaubert.py @@ -22,13 +22,16 @@ import tensorflow as tf from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings +from .modeling_tf_utils import keras_serializable, shape_list from .modeling_tf_xlm import ( + TFXLMForMultipleChoice, + TFXLMForQuestionAnsweringSimple, TFXLMForSequenceClassification, + TFXLMForTokenClassification, TFXLMMainLayer, TFXLMModel, TFXLMWithLMHeadModel, get_masks, - shape_list, ) from .tokenization_utils import BatchEncoding @@ -112,6 +115,7 @@ class TFFlaubertModel(TFXLMModel): self.transformer = TFFlaubertMainLayer(config, name="transformer") +@keras_serializable class TFFlaubertMainLayer(TFXLMMainLayer): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -327,3 +331,38 @@ class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") + + +@add_start_docstrings( + """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertForQuestionAnsweringSimple(TFXLMForQuestionAnsweringSimple): + config_class = FlaubertConfig + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + + +@add_start_docstrings( + """Flaubert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertForTokenClassification(TFXLMForTokenClassification): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + + +@add_start_docstrings( + """Flaubert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertForMultipleChoice(TFXLMForMultipleChoice): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index 5f8eabee652..35e7ca309b0 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -29,6 +29,7 @@ from .modeling_tf_utils import ( TFSequenceSummary, TFSharedEmbeddings, get_initializer, + keras_serializable, shape_list, ) from .tokenization_utils import BatchEncoding @@ -199,7 +200,10 @@ class TFBlock(tf.keras.layers.Layer): return outputs # x, (attentions) +@keras_serializable class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): + config_class = OpenAIGPTConfig + def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) self.output_hidden_states = config.output_hidden_states diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 40446d9ea44..1f9eec0b724 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -21,9 +21,18 @@ import logging import tensorflow as tf from .configuration_roberta import RobertaConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu -from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list +from .modeling_tf_utils import ( + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) logger = logging.getLogger(__name__) @@ -82,6 +91,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) +@keras_serializable class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. @@ -337,7 +347,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): on top of the pooled output) e.g. for GLUE tasks. 
""", ROBERTA_START_DOCSTRING, ) -class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): +class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -346,7 +356,17 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): self.classifier = TFRobertaClassificationHead(config, name="classifier") @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: @@ -370,20 +390,164 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - labels = tf.constant([1])[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - outputs = self.roberta(inputs, **kwargs) + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] - logits = self.classifier(sequence_output, training=kwargs.get("training", False)) + logits = self.classifier(sequence_output, training=training) outputs = (logits,) + outputs[2:] - return outputs # logits, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Roberta Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + ROBERTA_START_DOCSTRING, +) +class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFBertMainLayer(config, name="roberta") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. 
+ Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import RobertaTokenizer, TFRobertaForMultipleChoice + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = TFRobertaForMultipleChoice.from_pretrained('roberta-base') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] + + outputs = self.roberta(flat_inputs, training=training) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) @add_start_docstrings( @@ -391,7 +555,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, ) -class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): +class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -403,8 +567,22 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): @@ -428,27 +606,40 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] """ - outputs = self.roberta(inputs, **kwargs) + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) + sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # scores, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, ) -class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): +class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -459,8 +650,31 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): ) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): @@ -488,14 +702,23 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base') - input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") - start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ - outputs = self.roberta(inputs, **kwargs) + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = outputs[0] @@ -506,4 +729,10 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): outputs = (start_logits, end_logits,) + outputs[2:] - return outputs # start_logits, end_logits, (hidden_states), (attentions) + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index f59d26bed7d..948174a0f44 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -25,7 +25,8 @@ import tensorflow as tf from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, keras_serializable, shape_list +from .tokenization_utils import BatchEncoding logger = logging.getLogger(__name__) @@ -502,7 +503,10 @@ class _NoLayerEmbedTokens(object): # The full model without a specific pretrained or finetuning head is # provided as a tf.keras.layers.Layer usually called "TFT5MainLayer" #################################################### +@keras_serializable class TFT5MainLayer(tf.keras.layers.Layer): + config_class = T5Config + def __init__(self, config, embed_tokens=None, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions @@ -548,12 +552,32 @@ class TFT5MainLayer(tf.keras.layers.Layer): use_cache=False, training=False, ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + encoder_hidden_states = inputs[2] if len(inputs) > 2 else encoder_hidden_states + encoder_attention_mask = inputs[3] if len(inputs) > 3 else encoder_attention_mask + inputs_embeds = inputs[4] if len(inputs) > 4 else inputs_embeds + head_mask = inputs[5] if len(inputs) > 5 else head_mask + 
past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states + assert len(inputs) <= 7, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("decoder_input_ids") + attention_mask = inputs.get("decoder_attention_mask", attention_mask) + encoder_hidden_states = inputs.get("encoder_hidden_states", encoder_hidden_states) + encoder_attention_mask = inputs.get("encoder_attention_mask", encoder_attention_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + head_mask = inputs.get("head_mask", head_mask) + past_key_value_states = inputs.get("past_key_value_states", past_key_value_states) + assert len(inputs) <= 7, "Too many inputs." + else: + input_ids = inputs - if inputs is not None and inputs_embeds is not None: + if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both inputs and inputs_embeds at the same time") - elif inputs is not None: - input_shape = shape_list(inputs) - inputs = tf.reshape(inputs, (-1, input_shape[-1])) + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, (-1, input_shape[-1])) elif inputs_embeds is not None: input_shape = shape_list(inputs_embeds)[:-1] else: @@ -561,7 +585,7 @@ class TFT5MainLayer(tf.keras.layers.Layer): if inputs_embeds is None: assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" - inputs_embeds = self.embed_tokens(inputs) + inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index 038b768a8cc..2f1adb241f6 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -734,7 +734,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): return outputs -class TFTransfoXLLMHead(tf.keras.layers.Layer): +class TFTransfoXLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index c2fd740ad48..19e9ceecbff 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -84,6 +84,7 @@ def keras_serializable(cls): else: raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)") self._transformers_config = config + self._kwargs = kwargs cls.__init__ = wrapped_init @@ -94,6 +95,7 @@ def keras_serializable(cls): def get_config(self): cfg = super(cls, self).get_config() cfg["transformers_config"] = self._transformers_config.to_dict() + cfg.update(self._kwargs) return cfg cls.get_config = get_config @@ -104,6 +106,44 @@ def keras_serializable(cls): return cls +class TFQuestionAnsweringLoss: + def compute_loss(self, labels, logits): + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) + start_loss = loss_fn(labels["start_position"], logits[0]) + end_loss = loss_fn(labels["end_position"], logits[1]) + + return (start_loss + end_loss) / 2.0 + + +class TFTokenClassificationLoss: + def compute_loss(self, labels, logits): + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) + active_loss = tf.reshape(labels, (-1,)) != -1 + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, 
shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) + + return loss_fn(labels, reduced_logits) + + +class TFSequenceClassificationLoss: + def compute_loss(self, labels, logits): + if shape_list(logits)[1] == 1: + loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE) + else: + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) + + return loss_fn(labels, logits) + + +TFMultipleChoiceLoss = TFSequenceClassificationLoss + + class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): r""" Base class for all TF models. @@ -1531,6 +1571,16 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): ) super().build(input_shape) + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + def call(self, inputs, mode="embedding"): """Get token embeddings of inputs. Args: diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index 659aa15022b..ce11ebe8965 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -24,8 +24,19 @@ import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable -from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, get_initializer, shape_list +from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable +from .modeling_tf_utils import ( + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFSharedEmbeddings, + TFTokenClassificationLoss, + get_initializer, + keras_serializable, + shape_list, +) from .tokenization_utils import BatchEncoding @@ -198,7 +209,10 @@ class TFTransformerFFN(tf.keras.layers.Layer): return x +@keras_serializable class TFXLMMainLayer(tf.keras.layers.Layer): + config_class = XLMConfig + def __init__(self, config, **kwargs): super().__init__(**kwargs) self.output_attentions = config.output_attentions @@ -717,7 +731,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, XLM_START_DOCSTRING, ) -class TFXLMForSequenceClassification(TFXLMPreTrainedModel): +class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -726,8 +740,27 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`): @@ -751,19 +784,261 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - labels = tf.constant([1])[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - transformer_outputs = self.transformer(inputs, **kwargs) + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) output = transformer_outputs[0] logits = self.sequence_summary(output) outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here - return outputs + + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """XLM Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + XLM_START_DOCSTRING, +) +class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = TFXLMMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) + def call( + self, + inputs, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). 
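For the multiple-choice heads, inputs are expected as (batch_size, num_choices, seq_len) and are flattened internally before being reshaped back into per-choice scores. A rough sketch assuming the xlm-mlm-en-2048 checkpoint, with the choices padded to a common length (a real setup would also pass an attention mask or lengths for the padded positions):

import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = TFXLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

prompt = "The dog is"
choices = ["very cute", "a spaceship"]

# Encode each (prompt, choice) pair and pad so they can be stacked.
encoded = [tokenizer.encode(prompt, choice) for choice in choices]
max_len = max(len(ids) for ids in encoded)
encoded = [ids + [tokenizer.pad_token_id] * (max_len - len(ids)) for ids in encoded]

input_ids = tf.constant([encoded])   # shape (1, 2, max_len)
labels = tf.constant([0])            # the first choice is the correct one

loss, classification_scores = model(input_ids, labels=labels)[:2]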
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMForMultipleChoice + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + langs = inputs[2] if len(inputs) > 2 else langs + token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids + position_ids = inputs[4] if len(inputs) > 4 else position_ids + lengths = inputs[5] if len(inputs) > 5 else lengths + cache = inputs[6] if len(inputs) > 6 else cache + head_mask = inputs[7] if len(inputs) > 7 else head_mask + inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds + assert len(inputs) <= 9, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + langs = inputs.get("langs", langs) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + lengths = inputs.get("lengths", lengths) + cache = inputs.get("cache", cache) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 9, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + langs, + flat_token_type_ids, + flat_position_ids, + lengths, + cache, + head_mask, + inputs_embeds, + ] + + transformer_outputs = self.transformer(flat_inputs, training=training) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """XLM Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + XLM_START_DOCSTRING, +) +class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXLMMainLayer(config, name="transformer") + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLMTokenizer, TFXLMForTokenClassification + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) + + sequence_output = transformer_outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( @@ -771,7 +1046,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): the hidden-states output to compute `span start logits` and `span end logits`). """, XLM_START_DOCSTRING, ) -class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): +class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") @@ -780,8 +1055,34 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): ) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): @@ -807,12 +1108,27 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - start_scores, end_scores = outputs[:2] + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) + + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) + answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ - transformer_outputs = self.transformer(inputs, **kwargs) + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) sequence_output = transformer_outputs[0] @@ -825,4 +1141,10 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): 1: ] # Keep mems, hidden states, attentions if there are in it - return outputs # start_logits, end_logits, (hidden_states), (attentions) + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_xlm_roberta.py b/src/transformers/modeling_tf_xlm_roberta.py index dee354e0ead..46bc96950cd 100644 --- a/src/transformers/modeling_tf_xlm_roberta.py +++ b/src/transformers/modeling_tf_xlm_roberta.py @@ -22,6 +22,8 @@ from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings from .modeling_tf_roberta import ( TFRobertaForMaskedLM, + TFRobertaForMultipleChoice, + TFRobertaForQuestionAnswering, TFRobertaForSequenceClassification, TFRobertaForTokenClassification, TFRobertaModel, @@ -114,3 +116,30 @@ class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification): """ config_class = XLMRobertaConfig + + +@add_start_docstrings( + """XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, + XLM_ROBERTA_START_DOCSTRING, +) +class TFXLMRobertaForQuestionAnswering(TFRobertaForQuestionAnswering): + """ + This class overrides :class:`~transformers.TFRobertaForQuestionAnsweringSimple`. Please check the + superclass for the appropriate documentation alongside usage examples. + """ + + config_class = XLMRobertaConfig + + +@add_start_docstrings( + """Roberta Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", + XLM_ROBERTA_START_DOCSTRING, +) +class TFXLMRobertaForMultipleChoice(TFRobertaForMultipleChoice): + """ + This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the + superclass for the appropriate documentation alongside usage examples. + """ + + config_class = XLMRobertaConfig diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 15a4566f72b..7964b5fa85d 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -23,11 +23,15 @@ import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable +from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFMultipleChoiceLoss, TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, TFSequenceSummary, TFSharedEmbeddings, + TFTokenClassificationLoss, get_initializer, keras_serializable, shape_list, @@ -938,7 +942,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, XLNET_START_DOCSTRING, ) -class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): +class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -952,8 +956,28 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): @@ -981,12 +1005,24 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] """ - transformer_outputs = self.transformer(inputs, **kwargs) + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + ) output = transformer_outputs[0] output = self.sequence_summary(output) @@ -994,7 +1030,159 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it - return outputs # return logits, (mems), (hidden states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """XLNET Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + XLNET_START_DOCSTRING, +) +class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.logits_proj = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) + def call( + self, + inputs, + token_type_ids=None, + input_mask=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. 
(see `input_ids` above) + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XLNetTokenizer, TFXLNetForMultipleChoice + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + + input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices + labels = tf.reshape(tf.constant(1), (-1, 1)) + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + mems = inputs[2] if len(inputs) > 2 else mems + perm_mask = inputs[3] if len(inputs) > 3 else perm_mask + target_mapping = inputs[4] if len(inputs) > 4 else target_mapping + token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids + input_mask = inputs[6] if len(inputs) > 6 else input_mask + head_mask = inputs[7] if len(inputs) > 7 else head_mask + inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds + use_cache = inputs[9] if len(inputs) > 9 else use_cache + assert len(inputs) <= 10, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + mems = inputs.get("mems", mems) + perm_mask = inputs.get("perm_mask", perm_mask) + target_mapping = inputs.get("target_mapping", target_mapping) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + input_mask = inputs.get("input_mask", input_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + use_cache = inputs.get("use_cache", use_cache) + assert len(inputs) <= 10, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_input_mask = tf.reshape(input_mask, (-1, seq_length)) if input_mask is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + mems, + perm_mask, + target_mapping, + flat_token_type_ids, + flat_input_mask, + head_mask, + inputs_embeds, + use_cache, + ] + + transformer_outputs = self.transformer(flat_inputs, training=training) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here + + if labels is not None: + loss = self.compute_loss(labels, reshaped_logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (mems), (hidden states), (attentions) @add_start_docstrings( @@ -1002,7 +1190,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNET_START_DOCSTRING, ) -class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): +class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -1012,8 +1200,26 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`): @@ -1041,19 +1247,36 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] """ - transformer_outputs = self.transformer(inputs, **kwargs) + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + training=training, + ) output = transformer_outputs[0] logits = self.classifier(output) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it - return outputs # return logits, (mems), (hidden states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) @add_start_docstrings( @@ -1061,7 +1284,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): the hidden-states output to compute `span start logits` and `span end logits`). """, XLNET_START_DOCSTRING, ) -class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): +class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -1070,8 +1293,35 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): ) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + training=False, + ): r""" + start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): @@ -1103,12 +1353,27 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 - outputs = model(input_ids) - start_scores, end_scores = outputs[:2] + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_dict = tokenizer.encode_plus(question, text, return_tensors='tf') + start_scores, end_scores = model(input_dict) + + all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) + answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) """ - transformer_outputs = self.transformer(inputs, **kwargs) + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + training=training, + ) sequence_output = transformer_outputs[0] @@ -1121,7 +1386,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): 1: ] # Keep mems, hidden states, attentions if there are in it - return outputs # start_logits, end_logits, (mems), (hidden_states), (attentions) + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.compute_loss(labels, outputs[:2]) + outputs = (loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 93c1048db07..47b5370f0c2 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -58,27 +58,41 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): } -def create_optimizer(init_lr, num_train_steps, num_warmup_steps, end_lr=0.0, optimizer_type="adamw"): +def create_optimizer( + init_lr, + num_train_steps, + num_warmup_steps, + min_lr_ratio=0.0, + adam_epsilon=1e-8, + weight_decay_rate=0.0, + include_in_weight_decay=None, +): """Creates an optimizer with learning rate schedule.""" # Implements linear decay of the learning rate. 
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( - initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr, + initial_learning_rate=init_lr, + decay_steps=num_train_steps - num_warmup_steps, + end_learning_rate=init_lr * min_lr_ratio, ) if num_warmup_steps: lr_schedule = WarmUp( initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps, ) - - optimizer = AdamWeightDecay( - learning_rate=lr_schedule, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], - ) - - return optimizer + if weight_decay_rate > 0.0: + optimizer = AdamWeightDecay( + learning_rate=lr_schedule, + weight_decay_rate=weight_decay_rate, + beta_1=0.9, + beta_2=0.999, + epsilon=adam_epsilon, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], + include_in_weight_decay=include_in_weight_decay, + ) + else: + optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=adam_epsilon) + # We return the optimizer and the LR scheduler in order to better track the + # evolution of the LR independently of the optimizer. + return optimizer, lr_schedule class AdamWeightDecay(tf.keras.optimizers.Adam): diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 7907f030773..e25bd4d5d7a 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -3,12 +3,12 @@ import logging import math import os -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Optional, Tuple import numpy as np import tensorflow as tf -from .modeling_tf_utils import TFPreTrainedModel, shape_list +from .modeling_tf_utils import TFPreTrainedModel from .optimization_tf import GradientAccumulator, create_optimizer from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput from .training_args_tf import TFTrainingArguments @@ -20,13 +20,14 @@ logger = logging.getLogger(__name__) class TFTrainer: model: TFPreTrainedModel args: TFTrainingArguments - # something similar to a PT Dataset. - # This is just temporary before to have - # a framework-agnostic approach for datasets. 
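A side note on the refactored `create_optimizer` above: it now returns an `(optimizer, lr_schedule)` pair instead of a bare optimizer, runs warmup before a linear decay down to `init_lr * min_lr_ratio`, and only selects `AdamWeightDecay` when `weight_decay_rate > 0`. A minimal usage sketch (the step counts and decay rate below are illustrative, not values taken from this patch):

    from transformers.optimization_tf import create_optimizer

    # 1,000 optimization steps, the first 100 as linear warmup, then linear decay;
    # a non-zero weight_decay_rate switches to the AdamWeightDecay optimizer.
    optimizer, lr_schedule = create_optimizer(
        init_lr=5e-5,
        num_train_steps=1000,
        num_warmup_steps=100,
        weight_decay_rate=0.01,
    )

    # The schedule is returned separately so the learning rate can be inspected
    # or logged independently of the optimizer:
    current_lr = lr_schedule(100).numpy()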
train_dataset: Optional[tf.data.Dataset] eval_dataset: Optional[tf.data.Dataset] compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool + tb_writer: Optional[tf.summary.SummaryWriter] = None + optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None + global_step: Optional[int] = None + epoch: Optional[float] = None def __init__( self, @@ -36,6 +37,8 @@ class TFTrainer: eval_dataset: Optional[tf.data.Dataset] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, + tb_writer: Optional[tf.summary.SummaryWriter] = None, + optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None, ): self.model = model self.args = args @@ -43,120 +46,73 @@ class TFTrainer: self.eval_dataset = eval_dataset self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only + self.optimizers = optimizers self.gradient_accumulator = GradientAccumulator() - self._setup_training() - - def _setup_training(self) -> None: - """ - Setup the different steps to train a model: - - check if all the data are given - - create the proper strategy - - create the features - - prepare the model settings - """ - self._prepare_dataset() - - with self.args.strategy.scope(): - self._create_optimizer() - _ = self.optimizer.iterations - self._set_loss_and_metric() - self._create_checkpoint_manager() - self._create_summary_writer() - - def _set_loss_and_metric(self) -> None: - """ - Create the training loss and metric with their name. Allowed names are those listed - in the Tensorflow documentation and those contained in the transformers library. - """ - try: - self.loss = tf.keras.losses.get( - { - "class_name": self.args.loss_name, - "config": {"from_logits": True, "reduction": tf.keras.losses.Reduction.NONE}, - } - ) - except TypeError: - self.loss = tf.keras.losses.get( - {"class_name": self.args.loss_name, "config": {"reduction": tf.keras.losses.Reduction.NONE}} - ) - - def _create_summary_writer(self) -> None: - """ - Create a summary writer to be able to read the logs in Tensorboard. - """ - self.writer = tf.summary.create_file_writer(self.args.logging_dir) - - def _prepare_dataset(self) -> None: - """ - Prepare the training, validation and test data. 
- """ - if self.train_dataset is not None: - self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() - - if self.args.max_steps > 0: - self.train_steps = self.args.max_steps - else: - self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size) - - self.train_dataset = ( - self.train_dataset.cache() - .shuffle(self.num_train_examples) - .batch(self.args.train_batch_size) - .prefetch(tf.data.experimental.AUTOTUNE) - ) - - if self.args.max_steps > 0: - self.train_dataset = self.train_dataset.repeat(-1) - - self.train_dataset = self.args.strategy.experimental_distribute_dataset(self.train_dataset) + if tb_writer is not None: + self.tb_writer = tb_writer else: - self.train_steps = 0 + self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir) - if self.eval_dataset is not None: - self.eval_dataset = ( - self.eval_dataset.batch(self.args.eval_batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE) - ) - self.eval_dataset = self.args.strategy.experimental_distribute_dataset(self.eval_dataset) + def get_train_tfdataset(self) -> tf.data.Dataset: + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") - def _create_optimizer(self) -> None: - """ - Create the training optimizer with its name. Allowed names are those listed - in the Tensorflow documentation and those contained in the transformers library. - """ - if self.args.optimizer_name == "adamw": - self.optimizer = create_optimizer( - self.args.learning_rate, self.train_steps, self.args.warmup_steps, self.args.end_lr - ) + self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() + + if self.args.max_steps > 0: + self.train_steps = self.args.max_steps else: - try: - self.optimizer = tf.keras.optimizers.get( - { - "class_name": self.args.optimizer_name, - "config": {"learning_rate": self.args.learning_rate, "epsilon": self.args.adam_epsilon}, - } - ) - except TypeError: - # This is for the case where the optimizer is not Adam-like such as SGD - self.optimizer = tf.keras.optimizers.get( - {"class_name": self.args.optimizer_name, "config": {"learning_rate": self.args.learning_rate}} - ) - logger.info("Created an/a {} optimizer".format(self.args.optimizer_name)) + self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size) - def _create_checkpoint_manager(self, max_to_keep: int = 5, load_model: bool = True) -> None: + ds = ( + self.train_dataset.cache() + .shuffle(self.num_train_examples) + .batch(self.args.train_batch_size) + .prefetch(tf.data.experimental.AUTOTUNE) + ) + + if self.args.max_steps > 0: + self.train_dataset = self.train_dataset.repeat(-1) + + return self.args.strategy.experimental_distribute_dataset(ds) + + def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset: + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + ds = eval_dataset.cache().batch(self.args.eval_batch_size).prefetch(tf.data.experimental.AUTOTUNE) + + return self.args.strategy.experimental_distribute_dataset(ds) + + def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: + ds = test_dataset.batch(self.args.eval_batch_size) + + return self.args.strategy.experimental_distribute_dataset(ds) + + def get_optimizers( + self, + ) -> Tuple[tf.keras.optimizers.Optimizer, 
tf.keras.optimizers.schedules.LearningRateSchedule]: """ - Create a checkpoint manager in order to be able to make the training - fault-tolerant. - Args: - max_to_keep: the maximum number of checkpoints to keep in the checkpoint path. - load_model: if we want to start the training from the latest checkpoint. + Setup the optimizer and the learning rate scheduler. + + We provide a reasonable default that works well. + If you want to use something else, you can pass a tuple in the Trainer's init, + or override this method in a subclass. """ - ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) + if self.optimizers is not None: + return self.optimizers - self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=max_to_keep) + optimizer, scheduler = create_optimizer( + self.args.learning_rate, + self.train_steps, + self.args.warmup_steps, + adam_epsilon=self.args.adam_epsilon, + weight_decay_rate=self.args.weight_decay, + ) - if load_model: - ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial() + return optimizer, scheduler @tf.function def _evaluate_steps(self, per_replica_features, per_replica_labels): @@ -182,6 +138,14 @@ class TFTrainer: def _prediction_loop( self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by `evaluate()` and `predict()`. + + Works both with or without labels. + """ + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only + logger.info("***** Running %s *****", description) logger.info(" Batch size = %d", self.args.eval_batch_size) @@ -196,6 +160,12 @@ class TFTrainer: loss = tf.reduce_mean(loss) if not prediction_loss_only: + if isinstance(logits, tuple): + logits = logits[0] + + if isinstance(labels, tuple): + labels = labels[0] + if self.args.n_gpu > 1: for val in logits.values: if preds is None: @@ -240,10 +210,9 @@ class TFTrainer: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. """ - if eval_dataset is None: - eval_dataset = self.eval_dataset + eval_ds = self.get_eval_tfdataset(eval_dataset) - output = self._prediction_loop(eval_dataset, description="Evaluation") + output = self._prediction_loop(eval_ds, description="Evaluation") return output.metrics @@ -251,12 +220,25 @@ class TFTrainer: """ Train method to train the model. 
""" + train_ds = self.get_train_tfdataset() + if self.args.debug: tf.summary.trace_on(graph=True, profiler=True) self.gradient_accumulator.reset() - iterations = self.optimizer.iterations + with self.args.strategy.scope(): + optimizer, lr_scheduler = self.get_optimizers() + iterations = optimizer.iterations + ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model) + self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, PREFIX_CHECKPOINT_DIR, max_to_keep=5) + + if self.model.ckpt_manager.latest_checkpoint: + logger.info( + "Checkpoint file %s found and restoring from checkpoint", self.model.ckpt_manager.latest_checkpoint + ) + + ckpt.restore(self.model.ckpt_manager.latest_checkpoint).expect_partial() if iterations.numpy() > 0: logger.info("Start the training from the last checkpoint") @@ -268,21 +250,30 @@ class TFTrainer: epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs + if self.args.fp16: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16") + tf.keras.mixed_precision.experimental.set_policy(policy) + + with self.tb_writer.as_default(): + tf.summary.text("args", self.args.to_json_string()) + + self.tb_writer.flush() + logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_train_examples) logger.info(" Num Epochs = %d", epochs) logger.info(" Total optimization steps = %d", self.train_steps) for epoch in range(start_epoch, int(epochs + 1)): - for training_loss in self._training_steps(): + for training_loss in self._training_steps(train_ds, optimizer): step = iterations.numpy() if self.args.debug: - with self.writer.as_default(): + with self.tb_writer.as_default(): tf.summary.scalar("loss", training_loss, step=step) if step == 1 and self.args.debug: - with self.writer.as_default(): + with self.tb_writer.as_default(): tf.summary.trace_export(name="training", step=step, profiler_outdir=self.args.logging_dir) if self.args.evaluate_during_training and step % self.args.eval_steps == 0: @@ -293,17 +284,16 @@ class TFTrainer: eval_key = "eval_{}".format(key) logs[eval_key] = value - if callable(self.optimizer.learning_rate): - logs["learning_rate"] = self.optimizer.learning_rate(step).numpy() - else: - logs["learning_rate"] = self.optimizer.learning_rate.numpy() + logs["learning_rate"] = lr_scheduler(step).numpy() logger.info("Epoch {} Step {} Validation Metrics {}".format(epoch, step, logs)) - with self.writer.as_default(): + with self.tb_writer.as_default(): for k, v in logs.items(): tf.summary.scalar(k, v, step=step) + self.tb_writer.flush() + if step % self.args.logging_steps == 0: logger.info("Epoch {} Step {} Train Loss {:.4f}".format(epoch, step, training_loss.numpy())) @@ -314,21 +304,21 @@ class TFTrainer: if step % self.train_steps == 0: break - def _training_steps(self): + def _training_steps(self, ds, optimizer): """ Returns a generator over training steps (i.e. parameters update). 
""" - for i, loss in enumerate(self._accumulate_next_gradients()): + for i, loss in enumerate(self._accumulate_next_gradients(ds)): if i % self.args.gradient_accumulation_steps == 0: - self._apply_gradients() + self._apply_gradients(optimizer) yield loss @tf.function - def _apply_gradients(self): + def _apply_gradients(self, optimizer): """Applies the gradients (cross-replica).""" - self.args.strategy.experimental_run_v2(self._step) + self.args.strategy.experimental_run_v2(self._step, args=(optimizer,)) - def _step(self): + def _step(self, optimizer): """Applies gradients and resets accumulation.""" gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync gradients = [ @@ -336,12 +326,12 @@ class TFTrainer: ] gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients] - self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) + optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) self.gradient_accumulator.reset() - def _accumulate_next_gradients(self): + def _accumulate_next_gradients(self, ds): """Accumulates the gradients from the next element in dataset.""" - iterator = iter(self.train_dataset) + iterator = iter(ds) @tf.function def _accumulate_next(): @@ -388,23 +378,10 @@ class TFTrainer: labels: the batched labels. training: run the model in training mode or not """ - if self.args.mode == "text-classification" or self.args.mode == "token-classification": - logits = self.model(features, training=training)[0] + if isinstance(labels, (dict)): + loss, logits = self.model(features, training=training, **labels)[:2] else: - logits = self.model(features, training=training) - - if self.args.mode == "token-classification": - active_loss = tf.reshape(labels, (-1,)) != -1 - reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) - labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) - loss = self.loss(labels, reduced_logits) - elif self.args.mode == "question-answering": - start_loss = self.loss(labels["start_position"], logits[0]) - end_loss = self.loss(labels["end_position"], logits[1]) - loss = (start_loss + end_loss) / 2.0 - else: - loss = self.loss(labels, logits) - + loss, logits = self.model(features, labels=labels, training=training)[:2] loss += sum(self.model.losses) * (1.0 / self.args.n_gpu) return loss, logits @@ -418,19 +395,24 @@ class TFTrainer: test_dataset: something similar to a PT Dataset. This is just temporary before to have a framework-agnostic approach for datasets. """ - test_dataset = test_dataset.batch(self.args.eval_batch_size) - test_dataset = self.args.strategy.experimental_distribute_dataset(test_dataset) + test_ds = self.get_test_tfdataset(test_dataset) - return self._prediction_loop(test_dataset, description="Prediction") + return self._prediction_loop(test_ds, description="Prediction") - def save_model(self) -> None: + def save_model(self, output_dir: Optional[str] = None): """ Save the pretrained model and create a Tensorflow saved model. 
""" - logger.info("Saving model in {}".format(self.args.output_dir)) + output_dir = output_dir if output_dir is not None else self.args.output_dir + + logger.info("Saving model in {}".format(output_dir)) path = os.path.join(self.args.output_dir, "saved_model") logger.info("Saving model in {}".format(path)) os.makedirs(path, exist_ok=True) + + if not isinstance(self.model, TFPreTrainedModel): + raise ValueError("Trainer.model appears to not be a PreTrainedModel") + self.model.save_pretrained(self.args.output_dir) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c571f1c083b..da6862f2c92 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1,6 +1,7 @@ import dataclasses import json import logging +import os from dataclasses import dataclass, field from typing import Any, Dict, Optional, Tuple @@ -27,6 +28,17 @@ def is_tpu_available(): logger = logging.getLogger(__name__) +def default_logdir() -> str: + """ + Same default as PyTorch + """ + import socket + from datetime import datetime + + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + return os.path.join("runs", current_time + "_" + socket.gethostname()) + + @dataclass class TrainingArguments: """ @@ -97,7 +109,7 @@ class TrainingArguments: ) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) - logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) + logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."}) logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"}) logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 470922bd356..c05acafdc7e 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -14,28 +14,9 @@ if is_tf_available(): @dataclass class TFTrainingArguments(TrainingArguments): - optimizer_name: str = field( - default="adam", - metadata={ - "help": 'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"' - }, - ) - mode: str = field( - default="text-classification", - metadata={"help": 'Type of task, one of "text-classification", "token-classification", "question-answering"'}, - ) - loss_name: str = field( - default="SparseCategoricalCrossentropy", - metadata={ - "help": "Name of a Tensorflow loss. 
For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses" - }, - ) tpu_name: str = field( default=None, metadata={"help": "Name of TPU"}, ) - end_lr: float = field( - default=0, metadata={"help": "End learning rate for optimizer"}, - ) eval_steps: int = field(default=1000, metadata={"help": "Run an evaluation every X steps."}) debug: bool = field( default=False, metadata={"help": "Activate the trace to record computation graphs and profiling information"} diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 7a5898f2c8f..b84d51a30b4 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -30,7 +30,7 @@ if is_tf_available(): import tensorflow as tf import numpy as np - from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding + from transformers import tf_top_k_top_p_filtering, TFAdaptiveEmbedding, TFSharedEmbeddings if _tf_gpu_memory_limit is not None: gpus = tf.config.list_physical_devices("GPU") @@ -107,26 +107,45 @@ class TFModelTesterMixin: and getattr(module_member, "_keras_serializable", False) ) for main_layer_class in tf_main_layer_classes: - main_layer = main_layer_class(config) + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) symbolic_inputs = { name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } + model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) - model = tf.keras.models.load_model( - filepath, custom_objects={main_layer_class.__name__: main_layer_class} - ) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) def assert_outputs_same(self, after_outputs, outputs): # Make sure we don't have nans - out_1 = after_outputs[0].numpy() + if isinstance(after_outputs, tf.Tensor): + out_1 = after_outputs.numpy() + else: + out_1 = after_outputs[0].numpy() out_2 = outputs[0].numpy() self.assertEqual(out_1.shape, out_2.shape) out_1 = out_1[~np.isnan(out_1)] @@ -269,7 +288,6 @@ class TFModelTesterMixin: inputs_keywords = copy.deepcopy(inputs_dict) input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) outputs_keywords = model(input_ids, **inputs_keywords) - output_dict = outputs_dict[0].numpy() output_keywords = outputs_keywords[0].numpy() diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py new file mode 100644 index 00000000000..ad7d3d61aa8 --- /dev/null +++ b/tests/test_modeling_tf_flaubert.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_tf_available + +from .utils import require_tf, slow + + +if is_tf_available(): + import tensorflow as tf + import numpy as np + from transformers import TFFlaubertModel + + +@require_tf +class TFFlaubertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased") + + input_ids = tf.convert_to_tensor( + [[0, 158, 735, 2592, 1424, 6727, 82, 1]], dtype=tf.int32, + ) # "J'aime flaubert !" + + output = model(input_ids)[0] + expected_shape = tf.TensorShape((1, 8, 512)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = tf.convert_to_tensor( + [ + [ + [-1.8768773, -1.566555, 0.27072418], + [-1.6920038, -0.5873505, 1.9329599], + [-2.9563985, -1.6993835, 1.7972052], + ] + ], + dtype=tf.float32, + ) + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/tests/test_modeling_tf_xlm_roberta.py b/tests/test_modeling_tf_xlm_roberta.py new file mode 100644 index 00000000000..c901eb05253 --- /dev/null +++ b/tests/test_modeling_tf_xlm_roberta.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_tf_available + +from .utils import require_tf, slow + + +if is_tf_available(): + import tensorflow as tf + import numpy as np + from transformers import TFXLMRobertaModel + + +@require_tf +class TFFlaubertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") + + features = { + "input_ids": tf.convert_to_tensor([[0, 2646, 10269, 83, 99942, 2]], dtype=tf.int32), # "My dog is cute" + "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), + } + + output = model(features)[0] + expected_shape = tf.TensorShape((1, 6, 768)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. 
+ expected_slice = tf.convert_to_tensor( + [ + [ + [0.0681762, 0.10894451, 0.06772504], + [-0.06423668, 0.02366615, 0.04329344], + [-0.06057295, 0.09974135, -0.00070584], + ] + ], + dtype=tf.float32, + ) + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/tests/test_optimization_tf.py b/tests/test_optimization_tf.py index 1ae48074c26..b4998b7e5b7 100644 --- a/tests/test_optimization_tf.py +++ b/tests/test_optimization_tf.py @@ -47,7 +47,7 @@ class OptimizationFTest(unittest.TestCase): with strategy.scope(): accumulator = GradientAccumulator() variable = tf.Variable([4.0, 3.0]) - optimizer = create_optimizer(5e-5, 10, 5) + optimizer, _ = create_optimizer(5e-5, 10, 5) gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False) def accumulate_on_replica(gradient):
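Putting the `TFTrainer` changes above together, a rough sketch of how the reworked API could be driven; the checkpoint name, dataset contents, and hyper-parameters are placeholders, the model is assumed to accept `labels` in its `call` (as the loss mixins in this patch enable), and both `optimizers` and `tb_writer` may be omitted to fall back to `create_optimizer` and the default summary writer:

    import tensorflow as tf
    from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
    from transformers.optimization_tf import create_optimizer

    args = TFTrainingArguments(output_dir="./tf_out", logging_steps=50, eval_steps=200)

    # Unbatched (features, labels) pairs; batching and distribution happen inside
    # get_train_tfdataset / get_eval_tfdataset.
    features = {"input_ids": tf.constant([[101, 7592, 102]] * 8, dtype=tf.int32)}
    labels = tf.constant([1, 0, 1, 0, 1, 0, 1, 0], dtype=tf.int32)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))

    with args.strategy.scope():
        model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
        # Optional override: hand the trainer a custom (optimizer, schedule) pair.
        optimizer, lr_schedule = create_optimizer(3e-5, num_train_steps=500, num_warmup_steps=50)

    trainer = TFTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        eval_dataset=dataset,
        optimizers=(optimizer, lr_schedule),
        tb_writer=tf.summary.create_file_writer(args.logging_dir),
    )
    trainer.train()
    trainer.save_model()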