diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 15b5030972e..84740cb4df9 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,5 @@
+huggingface.css
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
@@ -31,12 +33,12 @@
 
 /* When a list item that does belong to the selected block from the toc tree is hovered */
 .wy-menu-vertical li.current a:hover{
-    background-color: #FB8D68;
+    background-color: #B6C0FF;
 }
 
 /* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
 .wy-menu-vertical li a:hover{
-    background-color: #FB8D68;
+    background-color: #A7AFFB;
 }
 
 /* The text items on the toc tree */
@@ -114,6 +116,11 @@ a {
     text-transform: uppercase;
 }
 
+/* It would be better for table to be visible without horizontal scrolling */
+.wy-table-responsive table td, .wy-table-responsive table th{
+    white-space: normal;
+}
+
 .footer {
     margin-top: 20px;
 }
@@ -127,6 +134,28 @@ a {
     margin: 2px 5px 0 0;
 }
 
+/* class and method names in doc */
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
+    font-family: Calibre;
+    font-size: 20px !important;
+}
+
+/* class name in doc*/
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
+    margin-right: 10px;
+    font-family: Calibre-Medium;
+}
+
+/* Method and class parameters */
+.sig-param{
+    line-height: 23px;
+}
+
+/* Class introduction "class" string at beginning */
+.rst-content dl:not(.docutils) .property{
+    font-size: 18px;
+    color: black;
+}
 
 /* FONTS */
 
@@ -167,3 +196,4 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
     src: url(./Calibre-Thin.otf);
     font-weight:400;
 }
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 16866b5e5cf..79df358631e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -85,7 +85,9 @@ html_theme = 'sphinx_rtd_theme'
 # further. For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+html_theme_options = {
+    'analytics_id': 'UA-83738774-2'
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index aee4066c2cf..51c8d850b98 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -1,3 +1,5 @@
+examples.rst
+
 Examples
 ================================================
 
@@ -39,7 +41,13 @@ Note: To use *Distributed Training*\ , you will need to run one training script
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+   python -m torch.distributed.launch \
+       --nproc_per_node=4 \
+       --nnodes=2 \
+       --node_rank=$THIS_MACHINE_INDEX \
+       --master_addr="192.168.1.1" \
+       --master_port=1234 run_bert_classifier.py \
+       (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 
 Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
 
@@ -186,7 +194,19 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name MRPC --do_train --do_eval --do_lower_case --data_dir $GLUE_DIR/MRPC/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mrpc_output/
+   python -m torch.distributed.launch \
+       --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name MRPC \
+       --do_train \
+       --do_eval \
+       --do_lower_case \
+       --data_dir $GLUE_DIR/MRPC/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir /tmp/mrpc_output/
 
 Training with these hyper-parameters gave us the following results:
 
@@ -203,7 +223,20 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+   python -m torch.distributed.launch \
+       --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name mnli \
+       --do_train \
+       --do_eval \
+       --do_lower_case \
+       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+       --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -293,7 +326,20 @@ And here is the model provided as ``bert-large-cased-whole-word-masking-finetune
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py --bert_model bert-large-cased-whole-word-masking --do_train --do_predict --do_lower_case --train_file $SQUAD_DIR/train-v1.1.json --predict_file $SQUAD_DIR/dev-v1.1.json --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir ../models/wwm_cased_finetuned_squad/ --train_batch_size 24 --gradient_accumulation_steps 12
+   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \
+       --bert_model bert-large-cased-whole-word-masking \
+       --do_train \
+       --do_predict \
+       --do_lower_case \
+       --train_file $SQUAD_DIR/train-v1.1.json \
+       --predict_file $SQUAD_DIR/dev-v1.1.json \
+       --learning_rate 3e-5 \
+       --num_train_epochs 2 \
+       --max_seq_length 384 \
+       --doc_stride 128 \
+       --output_dir ../models/wwm_cased_finetuned_squad/ \
+       --train_batch_size 24 \
+       --gradient_accumulation_steps 12
 
 Training with these hyper-parameters gave us the following results:
 
@@ -563,7 +609,18 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
+       --bert_model bert-large-uncased-whole-word-masking \
+       --task_name mnli \
+       --do_train \
+       --do_eval \
+       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+       --max_seq_length 128 \
+       --train_batch_size 8 \
+       --learning_rate 2e-5 \
+       --num_train_epochs 3.0 \
+       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+       --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -579,4 +636,4 @@ Here is an example on MNLI:
    global_step = 18408
    loss = 0.04755385363816904
 
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
\ No newline at end of file
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index f5eb97f69d7..1b845595672 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -1,6 +1,13 @@
 TorchScript
 ================================================
 
+.. note::
+    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
+    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
+    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
+    with compiled TorchScript.
+
+
 According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from
 PyTorch code". Pytorch's two modules `JIT and TRACE `_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.
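The TorchScript note added by this patch refers to PyTorch's JIT and TRACE modules without showing them in use. As illustrative context only (not part of the patch itself), here is a minimal sketch of the generic ``torch.jit.trace`` workflow that note is about; the toy ``TinyModel`` class and the ``traced_model.pt`` file name are placeholders invented for this sketch, not anything defined by the library or by the documentation being edited.

.. code-block:: python

    import torch


    class TinyModel(torch.nn.Module):
        """Stand-in for a real model; tracing follows the same steps for larger networks."""

        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 2)

        def forward(self, x):
            return torch.relu(self.linear(x))


    model = TinyModel()
    model.eval()

    # Tracing records the operations executed on this example input and
    # compiles them into a serializable, optimizable TorchScript module.
    example_input = torch.rand(1, 4)
    traced = torch.jit.trace(model, example_input)

    # The traced module can be saved and reloaded (including from C++)
    # without the original Python class definition.
    traced.save("traced_model.pt")
    loaded = torch.jit.load("traced_model.pt")
    print(loaded(example_input))

Because tracing records one concrete execution, the resulting module is specialized to the input shapes seen at trace time, which is why the note above singles out variable-input-size models as an open area of investigation.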