diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst index 36b37b3ee12..e358da0a4e9 100644 --- a/docs/source/model_doc/ctrl.rst +++ b/docs/source/model_doc/ctrl.rst @@ -1,6 +1,14 @@ CTRL ---------------------------------------------------- +CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_ +by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. +It's a causal (unidirectional) transformer pre-trained using language modeling on a very large +corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.). + +This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and +refer to the PyTorch documentation for all matter related to general usage and behavior. + Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl), you'll be able to convert from TF to our HuggingFace/Transformers format using the ``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 `_). diff --git a/src/transformers/configuration_xlm_roberta.py b/src/transformers/configuration_xlm_roberta.py index 7fe4b642d75..330bc0d41f1 100644 --- a/src/transformers/configuration_xlm_roberta.py +++ b/src/transformers/configuration_xlm_roberta.py @@ -38,5 +38,6 @@ class XLMRobertaConfig(RobertaConfig): This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate documentation alongside usage examples. """ + pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP model_type = "xlm-roberta" diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 71e1fbb76f9..fc766d55e45 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -607,7 +607,6 @@ class AlbertMLMHead(nn.Module): "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -698,7 +697,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): ALBERT_START_DOCSTRING, ) class AlbertForSequenceClassification(AlbertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -794,7 +792,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): ALBERT_START_DOCSTRING, ) class AlbertForQuestionAnswering(AlbertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index f9848b9411f..88644945268 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -813,7 +813,6 @@ class BertModel(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -907,11 +906,8 @@ class BertForPreTraining(BertPreTrainedModel): return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING -) +@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -1018,11 +1014,9 @@ class BertForMaskedLM(BertPreTrainedModel): @add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, ) class BertForNextSentencePrediction(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -1105,7 +1099,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1198,7 +1191,6 @@ class BertForSequenceClassification(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForMultipleChoice(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -1294,7 +1286,6 @@ class BertForMultipleChoice(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1386,7 +1377,6 @@ class BertForTokenClassification(BertPreTrainedModel): BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): - def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py index 57bf9ebce5f..5aff198fae1 100644 --- a/src/transformers/modeling_camembert.py +++ b/src/transformers/modeling_camembert.py @@ -58,19 +58,20 @@ class CamembertModel(RobertaModel): This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP @add_start_docstrings( - """CamemBERT Model with a `language modeling` head on top. """, - CAMEMBERT_START_DOCSTRING, + """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, ) class CamembertForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -85,6 +86,7 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification): This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -99,9 +101,11 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice): This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP + @add_start_docstrings( """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", @@ -112,6 +116,6 @@ class CamembertForTokenClassification(RobertaForTokenClassification): This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP - diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 3b1d74c34f8..942d2725645 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -24,7 +24,7 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from .configuration_ctrl import CTRLConfig -from .file_utils import add_start_docstrings +from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel @@ -184,57 +184,53 @@ class CTRLPreTrainedModel(PreTrainedModel): module.weight.data.fill_(1.0) -CTRL_START_DOCSTRING = r""" CTRL model was proposed in - `CTRL: A Conditional Transformer Language Model for Controllable Generation`_ - by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. - It's a causal (unidirectional) transformer pre-trained using language modeling on a very large - corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.). - +CTRL_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - .. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`: - https://www.github.com/salesforce/ctrl - - .. _`torch.nn.Module`: - https://pytorch.org/docs/stable/nn.html#module - Parameters: config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ -CTRL_INPUTS_DOCSTRING = r""" Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on - the right rather than the left. +CTRL_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Indices can be obtained using :class:`transformers.CTRLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **past**: - list of ``torch.FloatTensor`` (one for each layer): - that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. 
- **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. - **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: - Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @@ -243,35 +239,8 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: @add_start_docstrings( "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", CTRL_START_DOCSTRING, - CTRL_INPUTS_DOCSTRING, ) class CTRLModel(CTRLPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``: - Sequence of hidden-states at the last layer of the model.
- **past**: - list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: - that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = CTRLTokenizer.from_pretrained('ctrl') - model = CTRLModel.from_pretrained('ctrl') - input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - - """ def __init__(self, config): super().__init__(config) @@ -310,6 +279,7 @@ class CTRLModel(CTRLPreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) + @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -320,6 +290,36 @@ class CTRLModel(CTRLPreTrainedModel): head_mask=None, inputs_embeds=None, ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`CTRLConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the last layer of the model. + past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): + Contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ + Examples:: + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLModel.from_pretrained('ctrl') + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -435,50 +435,10 @@ class CTRLModel(CTRLPreTrainedModel): @add_start_docstrings( """The CTRL Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, + (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, - CTRL_INPUTS_DOCSTRING, ) class CTRLLMHeadModel(CTRLPreTrainedModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` - Indices are selected in ``[-100, 0, ..., config.vocab_size]`` - All labels set to ``-100`` are ignored (masked), the loss is only - computed for labels in ``[0, ..., config.vocab_size]`` - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Language modeling loss. - **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - **past**: - list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``: - that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - import torch - from transformers import CTRLTokenizer, CTRLLMHeadModel - - tokenizer = CTRLTokenizer.from_pretrained('ctrl') - model = CTRLLMHeadModel.from_pretrained('ctrl') - - input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=input_ids) - loss, logits = outputs[:2] - - """ def __init__(self, config): super().__init__(config) @@ -499,6 +459,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): inputs.update(kwargs) return inputs + @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -510,6 +471,49 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): inputs_embeds=None, labels=None, ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-100, 0, ..., config.vocab_size]`` + All labels set to ``-100`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`~transformers.CTRLConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) + Language modeling loss. + prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): + Contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ + Examples:: + + import torch + from transformers import CTRLTokenizer, CTRLLMHeadModel + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLLMHeadModel.from_pretrained('ctrl') + + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ transformer_outputs = self.transformer( input_ids, past=past, diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 923a6331c65..7e6d09f6863 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -390,7 +390,6 @@ DISTILBERT_INPUTS_DOCSTRING = r""" DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -484,11 +483,9 @@ class DistilBertModel(DistilBertPreTrainedModel): @add_start_docstrings( - """DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, + """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions @@ -567,7 +564,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -645,7 +641,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -745,7 +740,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 21ab29b6c8c..728a676ea47 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -269,12 +269,6 @@ GPT2_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - .. _`Language Models are Unsupervised Multitask Learners`: - https://openai.com/blog/better-language-models/ - - .. _`torch.nn.Module`: - https://pytorch.org/docs/stable/nn.html#module - Parameters: config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. 
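As the `past` documentation points out, cached keys and values let ``CTRLLMHeadModel`` skip re-encoding tokens it has already seen, so during generation only the newest token needs to be fed back in. A minimal greedy-decoding sketch of that usage, assuming the ``'ctrl'`` checkpoint from the examples above and the ``(logits, past, ...)`` output ordering described in the return docs::

    import torch
    from transformers import CTRLTokenizer, CTRLLMHeadModel

    tokenizer = CTRLTokenizer.from_pretrained('ctrl')
    model = CTRLLMHeadModel.from_pretrained('ctrl')
    model.eval()

    # Start from a control code ("Links") followed by a prompt, as in the examples above.
    generated = tokenizer.encode("Links Hello, my dog is", add_special_tokens=True)
    input_ids = torch.tensor(generated).unsqueeze(0)  # Batch size 1
    past = None

    with torch.no_grad():
        for _ in range(20):
            logits, past = model(input_ids, past=past)[:2]
            next_token = logits[:, -1, :].argmax(dim=-1)
            generated.append(next_token.item())
            # Earlier positions are already cached in `past`, so only the new token goes back in.
            input_ids = next_token.unsqueeze(0)

    print(tokenizer.decode(generated))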
@@ -328,7 +322,6 @@ GPT2_INPUTS_DOCSTRING = r""" GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): - def __init__(self, config): super().__init__(config) self.output_hidden_states = config.output_hidden_states @@ -514,7 +507,6 @@ class GPT2Model(GPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class GPT2LMHeadModel(GPT2PreTrainedModel): - def __init__(self, config): super().__init__(config) self.transformer = GPT2Model(config) @@ -624,7 +616,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): - def __init__(self, config): super().__init__(config) config.num_labels = 1 diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 5aaeaa01468..45b2270e4ca 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -338,7 +338,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTModel(OpenAIGPTPreTrainedModel): - def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions @@ -493,7 +492,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): - def __init__(self, config): super().__init__(config) self.transformer = OpenAIGPTModel(config) @@ -587,7 +585,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 23c7b29301c..cc2c7be158f 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -150,6 +150,7 @@ class RobertaModel(BertModel): This class overrides :class:`~transformers.BertModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -167,9 +168,7 @@ class RobertaModel(BertModel): self.embeddings.word_embeddings = value -@add_start_docstrings( - """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING -) +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class RobertaForMaskedLM(BertPreTrainedModel): config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -652,7 +651,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index bc1ab58e4c0..eaa001c39b6 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -560,7 +560,6 @@ ALBERT_INPUTS_DOCSTRING = r""" ALBERT_START_DOCSTRING, ) class TFAlbertModel(TFAlbertPreTrainedModel): - def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.num_hidden_layers = config.num_hidden_layers @@ -705,11 +704,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel): return outputs -@add_start_docstrings( - """Albert Model with a `language modeling` head on top. 
""", ALBERT_START_DOCSTRING -) +@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs) @@ -766,7 +762,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): ALBERT_START_DOCSTRING, ) class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index cc473893c7c..61e20f0e39a 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -660,7 +660,6 @@ BERT_INPUTS_DOCSTRING = r""" BERT_START_DOCSTRING, ) class TFBertModel(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @@ -711,7 +710,6 @@ class TFBertModel(TFBertPreTrainedModel): BERT_START_DOCSTRING, ) class TFBertForPreTraining(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -767,11 +765,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel): return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING -) +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class TFBertForMaskedLM(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -822,11 +817,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): @add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, + """Bert Model with a `next sentence prediction (classification)` head on top. 
""", BERT_START_DOCSTRING, ) class TFBertForNextSentencePrediction(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -879,7 +872,6 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): BERT_START_DOCSTRING, ) class TFBertForSequenceClassification(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -938,7 +930,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): BERT_START_DOCSTRING, ) class TFBertForMultipleChoice(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -1049,7 +1040,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): BERT_START_DOCSTRING, ) class TFBertForTokenClassification(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -1108,7 +1098,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): BERT_START_DOCSTRING, ) class TFBertForQuestionAnswering(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 3f3eece7e23..964b6169e10 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -536,7 +536,6 @@ DISTILBERT_INPUTS_DOCSTRING = r""" DISTILBERT_START_DOCSTRING, ) class TFDistilBertModel(TFDistilBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings @@ -594,11 +593,9 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): @add_start_docstrings( - """DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, + """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions @@ -665,7 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -730,7 +726,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -786,7 +781,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): DISTILBERT_START_DOCSTRING, ) class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 82b8b935410..d78e3f8f13f 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -444,7 +444,6 @@ GPT2_INPUTS_DOCSTRING = r""" GPT2_START_DOCSTRING, ) class TFGPT2Model(TFGPT2PreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @@ -494,7 +493,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @@ -557,7 +555,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): GPT2_START_DOCSTRING, ) class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index fffb70022be..36f6e36adb0 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -427,7 +427,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @@ -473,7 +472,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @@ -531,7 +529,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): OPENAI_GPT_START_DOCSTRING, ) class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) config.num_labels = 1 diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 65bc6655031..cba64e5990d 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -180,7 
+180,6 @@ ROBERTA_INPUTS_DOCSTRING = r""" ROBERTA_START_DOCSTRING, ) class TFRobertaModel(TFRobertaPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") @@ -256,11 +255,8 @@ class TFRobertaLMHead(tf.keras.layers.Layer): return x -@add_start_docstrings( - """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING -) +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -340,7 +336,6 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): ROBERTA_START_DOCSTRING, ) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -394,7 +389,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): ROBERTA_START_DOCSTRING, ) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index 3e1f7a6c2a7..f05a6e5a5d0 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -687,7 +687,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLModel(TFTransfoXLPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFTransfoXLMainLayer(config, name="transformer") @@ -737,7 +736,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): TRANSFO_XL_START_DOCSTRING, ) class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): - def __init__(self, config): super().__init__(config) self.transformer = TFTransfoXLMainLayer(config, name="transformer") diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index 82b32c41881..601f2d44e13 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -571,7 +571,6 @@ XLM_INPUTS_DOCSTRING = r""" XLM_START_DOCSTRING, ) class TFXLMModel(TFXLMPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") @@ -650,7 +649,6 @@ class TFXLMPredLayer(tf.keras.layers.Layer): XLM_START_DOCSTRING, ) class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") @@ -705,7 +703,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): XLM_START_DOCSTRING, ) class TFXLMForSequenceClassification(TFXLMPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -760,7 +757,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): XLM_START_DOCSTRING, ) class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") diff --git 
a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 2af6f530c63..f9c4bb18fab 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -780,7 +780,6 @@ XLNET_INPUTS_DOCSTRING = r""" XLNET_START_DOCSTRING, ) class TFXLNetModel(TFXLNetPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -830,7 +829,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -896,7 +894,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -961,7 +958,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -1015,11 +1011,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): return outputs # return logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", - XLNET_START_DOCSTRING) + XLNET_START_DOCSTRING, +) class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index 7a6902f8125..b9d45531554 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -550,7 +550,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" TRANSFO_XL_START_DOCSTRING, ) class TransfoXLModel(TransfoXLPreTrainedModel): - def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions @@ -803,7 +802,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel): TRANSFO_XL_START_DOCSTRING, ) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): - def __init__(self, config): super().__init__(config) self.transformer = TransfoXLModel(config) diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 440f8fa1fed..69b4d198aa5 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -317,7 +317,6 @@ XLM_INPUTS_DOCSTRING = r""" XLM_START_DOCSTRING, ) class XLMModel(XLMPreTrainedModel): - def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) self.output_attentions = config.output_attentions @@ -620,7 +619,6 @@ class XLMPredLayer(nn.Module): XLM_START_DOCSTRING, ) class XLMWithLMHeadModel(XLMPreTrainedModel): - def __init__(self, config): super().__init__(config) self.transformer = XLMModel(config) @@ -718,7 +716,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): XLM_START_DOCSTRING, ) class XLMForSequenceClassification(XLMPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -813,7 +810,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel): XLM_START_DOCSTRING, ) class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -929,7 +925,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): XLM_START_DOCSTRING, ) class XLMForQuestionAnswering(XLMPreTrainedModel): - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/modeling_xlm_roberta.py b/src/transformers/modeling_xlm_roberta.py index e9aa35ad016..1282bcea88a 100644 --- a/src/transformers/modeling_xlm_roberta.py +++ b/src/transformers/modeling_xlm_roberta.py @@ -61,19 +61,20 @@ class XLMRobertaModel(RobertaModel): This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = XLMRobertaConfig pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @add_start_docstrings( - """XLM-RoBERTa Model with a `language modeling` head on top. """, - XLM_ROBERTA_START_DOCSTRING, + """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING, ) class XLMRobertaForMaskedLM(RobertaForMaskedLM): """ This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = XLMRobertaConfig pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -88,6 +89,7 @@ class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): This class overrides :class:`~transformers.RobertaForSequenceClassification`. 
Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = XLMRobertaConfig pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -102,6 +104,7 @@ class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = XLMRobertaConfig pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -116,5 +119,6 @@ class XLMRobertaForTokenClassification(RobertaForTokenClassification): This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the appropriate documentation alongside usage examples. """ + config_class = XLMRobertaConfig pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index fab7dd90919..9d698423ec1 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -575,7 +575,6 @@ XLNET_INPUTS_DOCSTRING = r""" XLNET_START_DOCSTRING, ) class XLNetModel(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.output_attentions = config.output_attentions @@ -929,7 +928,6 @@ class XLNetModel(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetLMHeadModel(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.attn_type = config.attn_type @@ -1060,7 +1058,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetForSequenceClassification(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1161,7 +1158,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetForTokenClassification(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1262,7 +1258,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetForMultipleChoice(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) @@ -1369,7 +1364,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1486,7 +1480,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): XLNET_START_DOCSTRING, ) class XLNetForQuestionAnswering(XLNetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.start_n_top = config.start_n_top
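The XLM-RoBERTa (and CamemBERT) classes touched here are thin wrappers that only override ``config_class`` and the pretrained archive map, so they are driven exactly like their RoBERTa parents. A brief usage sketch, assuming the ``'xlm-roberta-base'`` checkpoint identifier and a freshly initialized classification head::

    import torch
    from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base')

    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1])  # Batch size 1
    outputs = model(input_ids, labels=labels)
    loss, logits = outputs[:2]  # same output convention as RobertaForSequenceClassification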
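A recurring change throughout the patch is that the shared ``XXX_INPUTS_DOCSTRING`` blocks move from the class-level ``@add_start_docstrings(...)`` call onto ``forward`` via the newly imported ``add_start_docstrings_to_callable``. Conceptually, both helpers just prepend text to ``__doc__``; a rough sketch of the idea (an illustrative approximation, not the exact ``file_utils`` implementation)::

    def add_start_docstrings(*docstr):
        """Prepend the given docstring snippets to the decorated object's docstring."""
        def decorator(fn):
            fn.__doc__ = "".join(docstr) + (fn.__doc__ or "")
            return fn
        return decorator

    # Applied to `forward`, this is what makes the shared Args section show up
    # directly in the method documentation, e.g.:
    #
    #   @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
    #   def forward(self, input_ids=None, past=None, ...):
    #       ...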