From 57eb1cb68d1c567b25ac256444e5c1a77b8817a7 Mon Sep 17 00:00:00 2001 From: Sam Shleifer Date: Mon, 3 Aug 2020 18:22:31 -0400 Subject: [PATCH] [s2s] Document better mbart finetuning command (#6229) * Document better MT command * improve multigpu command --- examples/seq2seq/README.md | 10 ++++------ examples/seq2seq/train_mbart_cc25_enro.sh | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md index 9d12dc33481..dd026784169 100644 --- a/examples/seq2seq/README.md +++ b/examples/seq2seq/README.md @@ -113,22 +113,20 @@ Best performing command: # optionally export ENRO_DIR='wmt_en_ro' # Download instructions above # export WANDB_PROJECT="MT" # optional -export MAX_LEN=200 +export MAX_LEN=128 export BS=4 -export GAS=8 # gradient accumulation steps ./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --label_smoothing 0.1 --fp16_opt_level=O1 --logger_name wandb --sortish_sampler ``` -This should take < 6h/epoch on a 16GB v100 and achieve val_avg_ BLEU score above 25. (you can see metrics in wandb or metrics.json). -To get results in line with fairseq, you need to do some postprocessing. +This should take < 6h/epoch on a 16GB v100 and achieve test BLEU above 26. +To get results in line with fairseq, you need to do some postprocessing. 
(see `romanian_postprocessing.md`) MultiGPU command (using 8 GPUS as an example) ```bash export ENRO_DIR='wmt_en_ro' # Download instructions above # export WANDB_PROJECT="MT" # optional -export MAX_LEN=200 +export MAX_LEN=128 export BS=4 -export GAS=1 # gradient accumulation steps ./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --gpus 8 --logger_name wandb ``` ### Finetuning Outputs diff --git a/examples/seq2seq/train_mbart_cc25_enro.sh b/examples/seq2seq/train_mbart_cc25_enro.sh index b8122aee3f4..90bfce3e94d 100755 --- a/examples/seq2seq/train_mbart_cc25_enro.sh +++ b/examples/seq2seq/train_mbart_cc25_enro.sh @@ -10,7 +10,7 @@ python finetune.py \ --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ --data_dir $ENRO_DIR \ --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ - --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ + --train_batch_size=$BS --eval_batch_size=$BS \ --task translation \ --warmup_steps 500 \ --freeze_embeds \