diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 15b5030972e..84740cb4df9 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,5 @@
+huggingface.css
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
@@ -31,12 +33,12 @@
 
 /* When a list item that does belong to the selected block from the toc tree is hovered */
 .wy-menu-vertical li.current a:hover{
-    background-color: #FB8D68;
+    background-color: #B6C0FF;
 }
 
 /* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
 .wy-menu-vertical li a:hover{
-    background-color: #FB8D68;
+    background-color: #A7AFFB;
 }
 
 /* The text items on the toc tree */
@@ -114,6 +116,11 @@ a {
     text-transform: uppercase;
 }
 
+/* It would be better for table to be visible without horizontal scrolling */
+.wy-table-responsive table td, .wy-table-responsive table th{
+    white-space: normal;
+}
+
 .footer {
     margin-top: 20px;
 }
@@ -127,6 +134,28 @@ a {
     margin: 2px 5px 0 0;
 }
 
+/* class and method names in doc */
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
+    font-family: Calibre;
+    font-size: 20px !important;
+}
+
+/* class name in doc*/
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
+    margin-right: 10px;
+    font-family: Calibre-Medium;
+}
+
+/* Method and class parameters */
+.sig-param{
+    line-height: 23px;
+}
+
+/* Class introduction "class" string at beginning */
+.rst-content dl:not(.docutils) .property{
+    font-size: 18px;
+    color: black;
+}
 
 /* FONTS */
 
@@ -167,3 +196,4 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
     src: url(./Calibre-Thin.otf);
     font-weight:400;
 }
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 16866b5e5cf..79df358631e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -85,7 +85,9 @@ html_theme = 'sphinx_rtd_theme'
 # further. For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+html_theme_options = {
+    'analytics_id': 'UA-83738774-2'
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index aee4066c2cf..51c8d850b98 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -1,3 +1,5 @@
+examples.rst
+
 Examples
 ================================================
 
@@ -39,7 +41,13 @@ Note: To use *Distributed Training*\ , you will need to run one training script
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+   python -m torch.distributed.launch \
+       --nproc_per_node=4 \
+       --nnodes=2 \
+       --node_rank=$THIS_MACHINE_INDEX \
+       --master_addr="192.168.1.1" \
+       --master_port=1234 run_bert_classifier.py \
+       (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 
 Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
 
@@ -186,7 +194,19 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name MRPC --do_train --do_eval --do_lower_case --data_dir $GLUE_DIR/MRPC/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mrpc_output/
+   python -m torch.distributed.launch \
+       --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name MRPC \
+       --do_train \
+       --do_eval \
+       --do_lower_case \
+       --data_dir $GLUE_DIR/MRPC/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir /tmp/mrpc_output/
 
 Training with these hyper-parameters gave us the following results:
 
@@ -203,7 +223,20 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+   python -m torch.distributed.launch \
+       --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name mnli \
+       --do_train \
+       --do_eval \
+       --do_lower_case \
+       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+       --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -293,7 +326,20 @@ And here is the model provided as ``bert-large-cased-whole-word-masking-finetune
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py --bert_model bert-large-cased-whole-word-masking --do_train --do_predict --do_lower_case --train_file $SQUAD_DIR/train-v1.1.json --predict_file $SQUAD_DIR/dev-v1.1.json --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir ../models/wwm_cased_finetuned_squad/ --train_batch_size 24 --gradient_accumulation_steps 12
+   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \
+       --bert_model bert-large-cased-whole-word-masking \
+       --do_train \
+       --do_predict \
+       --do_lower_case \
+       --train_file $SQUAD_DIR/train-v1.1.json \
+       --predict_file $SQUAD_DIR/dev-v1.1.json \
+       --learning_rate 3e-5 \
+       --num_train_epochs 2 \
+       --max_seq_length 384 \
+       --doc_stride 128 \
+       --output_dir ../models/wwm_cased_finetuned_squad/ \
+       --train_batch_size 24 \
+       --gradient_accumulation_steps 12
 
 Training with these hyper-parameters gave us the following results:
 
@@ -563,7 +609,18 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name mnli \
+       --do_train \
+       --do_eval \
+       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+       --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -579,4 +636,4 @@ Here is an example on MNLI:
    global_step = 18408
    loss = 0.04755385363816904
 
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
\ No newline at end of file
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index f5eb97f69d7..1b845595672 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -1,6 +1,13 @@
 TorchScript
 ================================================
 
+.. note::
+    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
+    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
+    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
+    with compiled TorchScript.
+
+
 According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from
 PyTorch code". Pytorch's two modules `JIT and TRACE `_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.
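The TorchScript note added by this patch refers to PyTorch's JIT and TRACE modules without showing them in use. As illustrative context only (not part of the patch itself), here is a minimal sketch of the generic ``torch.jit.trace`` workflow that note is about; the toy ``TinyModel`` class and the ``traced_model.pt`` file name are placeholders invented for this sketch, not anything defined by the library or by the documentation being edited.

.. code-block:: python

    import torch


    class TinyModel(torch.nn.Module):
        """Stand-in for a real model; tracing follows the same steps for larger networks."""

        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 2)

        def forward(self, x):
            return torch.relu(self.linear(x))


    model = TinyModel()
    model.eval()

    # Tracing records the operations executed on this example input and
    # compiles them into a serializable, optimizable TorchScript module.
    example_input = torch.rand(1, 4)
    traced = torch.jit.trace(model, example_input)

    # The traced module can be saved and reloaded (including from C++)
    # without the original Python class definition.
    traced.save("traced_model.pt")
    loaded = torch.jit.load("traced_model.pt")
    print(loaded(example_input))

Because tracing records one concrete execution, the resulting module is specialized to the input shapes seen at trace time, which is why the note above singles out variable-input-size models as an open area of investigation.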