Merge branch 'xlnet' of https://github.com/huggingface/pytorch-pretrained-BERT into xlnet
commit 8bb02c27e2
huggingface.css

@@ -1,3 +1,5 @@

/* The literal code blocks */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;

@@ -31,12 +33,12 @@

/* When a list item that does belong to the selected block from the toc tree is hovered */
.wy-menu-vertical li.current a:hover{
    background-color: #FB8D68;
    background-color: #B6C0FF;
}

/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
.wy-menu-vertical li a:hover{
    background-color: #FB8D68;
    background-color: #A7AFFB;
}

/* The text items on the toc tree */

@@ -114,6 +116,11 @@ a {
    text-transform: uppercase;
}

/* It would be better for table to be visible without horizontal scrolling */
.wy-table-responsive table td, .wy-table-responsive table th{
    white-space: normal;
}

.footer {
    margin-top: 20px;
}

@@ -127,6 +134,28 @@ a {
    margin: 2px 5px 0 0;
}

/* class and method names in doc */
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
    font-family: Calibre;
    font-size: 20px !important;
}

/* class name in doc */
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
    margin-right: 10px;
    font-family: Calibre-Medium;
}

/* Method and class parameters */
.sig-param{
    line-height: 23px;
}

/* Class introduction "class" string at beginning */
.rst-content dl:not(.docutils) .property{
    font-size: 18px;
    color: black;
}


/* FONTS */

@@ -167,3 +196,4 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
    src: url(./Calibre-Thin.otf);
    font-weight:400;
}

@@ -85,7 +85,9 @@ html_theme = 'sphinx_rtd_theme'
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
html_theme_options = {
    'analytics_id': 'UA-83738774-2'
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,

examples.rst

@@ -1,3 +1,5 @@

Examples
================================================

@@ -39,7 +41,13 @@ Note: To use *Distributed Training*\ , you will need to run one training script

.. code-block:: bash

   python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
   python -m torch.distributed.launch \
       --nproc_per_node=4 \
       --nnodes=2 \
       --node_rank=$THIS_MACHINE_INDEX \
       --master_addr="192.168.1.1" \
       --master_port=1234 run_bert_classifier.py \
       (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)

Where ``$THIS_MACHINE_INDEX`` is a sequential index assigned to each of your machines (0, 1, 2...) and the machine with rank 0 has the IP address ``192.168.1.1`` and an open port ``1234``.

@@ -186,7 +194,19 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word

.. code-block:: bash

   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name MRPC --do_train --do_eval --do_lower_case --data_dir $GLUE_DIR/MRPC/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mrpc_output/
   python -m torch.distributed.launch \
       --nproc_per_node 8 run_bert_classifier.py \
       --bert_model bert-large-uncased-whole-word-masking \
       --task_name MRPC \
       --do_train \
       --do_eval \
       --do_lower_case \
       --data_dir $GLUE_DIR/MRPC/ \
       --max_seq_length 128 \
       --train_batch_size 8 \
       --learning_rate 2e-5 \
       --num_train_epochs 3.0 \
       --output_dir /tmp/mrpc_output/

Training with these hyper-parameters gave us the following results:

@@ -203,7 +223,20 @@ Here is an example on MNLI:

.. code-block:: bash

   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --do_lower_case --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
   python -m torch.distributed.launch \
       --nproc_per_node 8 run_bert_classifier.py \
       --bert_model bert-large-uncased-whole-word-masking \
       --task_name mnli \
       --do_train \
       --do_eval \
       --do_lower_case \
       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
       --max_seq_length 128 \
       --train_batch_size 8 \
       --learning_rate 2e-5 \
       --num_train_epochs 3.0 \
       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
       --overwrite_output_dir

.. code-block:: bash

@@ -293,7 +326,20 @@ And here is the model provided as ``bert-large-cased-whole-word-masking-finetune

.. code-block:: bash

   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py --bert_model bert-large-cased-whole-word-masking --do_train --do_predict --do_lower_case --train_file $SQUAD_DIR/train-v1.1.json --predict_file $SQUAD_DIR/dev-v1.1.json --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir ../models/wwm_cased_finetuned_squad/ --train_batch_size 24 --gradient_accumulation_steps 12
   python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \
       --bert_model bert-large-cased-whole-word-masking \
       --do_train \
       --do_predict \
       --do_lower_case \
       --train_file $SQUAD_DIR/train-v1.1.json \
       --predict_file $SQUAD_DIR/dev-v1.1.json \
       --learning_rate 3e-5 \
       --num_train_epochs 2 \
       --max_seq_length 384 \
       --doc_stride 128 \
       --output_dir ../models/wwm_cased_finetuned_squad/ \
       --train_batch_size 24 \
       --gradient_accumulation_steps 12

Training with these hyper-parameters gave us the following results:

@@ -563,7 +609,18 @@ Here is an example on MNLI:

.. code-block:: bash

   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py --bert_model bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --data_dir /datadrive/bert_data/glue_data//MNLI/ --max_seq_length 128 --train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
       --bert_model bert-large-uncased-whole-word-masking \
       --task_name mnli \
       --do_train \
       --do_eval \
       --data_dir /datadrive/bert_data/glue_data//MNLI/ \
       --max_seq_length 128 \
       --train_batch_size 8 \
       --learning_rate 2e-5 \
       --num_train_epochs 3.0 \
       --output_dir ../models/wwm-uncased-finetuned-mnli/ \
       --overwrite_output_dir

.. code-block:: bash

@@ -579,4 +636,4 @@ Here is an example on MNLI:
   global_step = 18408
   loss = 0.04755385363816904

This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.

@@ -1,6 +1,13 @@
TorchScript
================================================

.. note::
    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
    with variable-input-size models. It is a focus of interest for us and we will deepen our analysis in upcoming
    releases, with more code examples, a more flexible implementation, and benchmarks comparing Python-based code
    with compiled TorchScript.


According to PyTorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
PyTorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
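
To make the tracing workflow above concrete, here is a minimal sketch that traces a toy ``torch.nn.Module``, serializes it, and reloads it. It is illustrative only: ``TinyModel``, the input shape, and the file name are placeholder choices, not code from this repository.

.. code-block:: python

    import torch

    # A tiny stand-in module; in practice this would be a pretrained model instance.
    class TinyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(8, 2)

        def forward(self, x):
            return self.linear(x)

    model = TinyModel()
    model.eval()

    # Tracing records the operations executed for this particular example input,
    # which is why variable input sizes need the care mentioned in the note above.
    example_input = torch.rand(1, 8)
    traced = torch.jit.trace(model, example_input)

    # The traced module can be saved and later reloaded without the Python class
    # definition, e.g. from a C++ program via libtorch.
    traced.save("traced_model.pt")
    reloaded = torch.jit.load("traced_model.pt")
    print(reloaded(example_input))

The same pattern is what this documentation page builds toward for the library's pretrained models, with the caveat on variable input sizes noted above.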