diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index f8527ee8870..05f1dd5da3b 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -429,6 +429,88 @@ Notes:
 
 In this example, we tell DeepSpeed to use GPU 1.
 
+
+Deployment in Notebooks
+=======================================================================================================================
+
+The problem with notebooks is that there is no normal ``deepspeed`` launcher to rely on, so under certain setups we
+have to emulate it.
+
+Here is how you'd have to adjust your training code in the notebook to use DeepSpeed.
+
+.. code-block:: python
+
+    # DeepSpeed requires a distributed environment even when only one process is used.
+    # This emulates a launcher in the notebook
+    import os
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '9994'  # modify if RuntimeError: Address already in use
+    os.environ['RANK'] = "0"
+    os.environ['LOCAL_RANK'] = "0"
+    os.environ['WORLD_SIZE'] = "1"
+
+    # Now proceed as normal, plus pass the deepspeed config file
+    training_args = TrainingArguments(..., deepspeed="ds_config.json")
+    trainer = Trainer(...)
+    trainer.train()
+
+Note: ``...`` stands for the normal arguments that you'd pass to the functions.
+
+If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated
+cell with:
+
+.. code-block:: python
+
+    %%bash
+    cat <<'EOT' > ds_config.json
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+
+        "zero_optimization": {
+            "stage": 2,
+            "allgather_partitions": true,
+            "allgather_bucket_size": 2e8,
+            "overlap_comm": true,
+            "reduce_scatter": true,
+            "reduce_bucket_size": 2e8,
+            "contiguous_gradients": true,
+            "cpu_offload": true
+        },
+
+        "zero_allow_untested_optimizer": true,
+
+        "optimizer": {
+            "type": "AdamW",
+            "params": {
+                "lr": 3e-5,
+                "betas": [0.8, 0.999],
+                "eps": 1e-8,
+                "weight_decay": 3e-7
+            }
+        },
+
+        "scheduler": {
+            "type": "WarmupLR",
+            "params": {
+                "warmup_min_lr": 0,
+                "warmup_max_lr": 3e-5,
+                "warmup_num_steps": 500
+            }
+        },
+
+        "steps_per_print": 2000,
+        "wall_clock_breakdown": false
+    }
+    EOT
+
+
+
 
 Configuration
 =======================================================================================================================
diff --git a/examples/tests/deepspeed/test_deepspeed.py b/examples/tests/deepspeed/test_deepspeed.py
index 0df11ab3f8e..28f2758e9a7 100644
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -14,13 +14,16 @@
 
 import json
 import os
+import sys
 import unittest
 
 from transformers.integrations import is_deepspeed_available
 from transformers.testing_utils import (
+    CaptureStd,
     TestCasePlus,
     execute_subprocess_async,
     get_gpu_count,
+    mockenv,
     require_torch_gpu,
     require_torch_multi_gpu,
     slow,
@@ -52,6 +55,20 @@ def require_deepspeed(test_case):
 @require_deepspeed
 @require_torch_gpu
 class TestDeepSpeed(TestCasePlus):
+
+    # this setup emulates a notebook where a launcher needs to be emulated by hand
+    @mockenv(MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1")
+    def test_fake_notebook_no_launcher(self):
+        sys.path.append(self.tests_dir_str)
+        from test_trainer import get_regression_trainer
+
+        del sys.path[-1]  # restore
+        ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
+        with CaptureStd() as cs:
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_file)
+            trainer.train()
+        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+
     @require_torch_multi_gpu
     def test_basic_distributed(self):
         self.run_quick(distributed=True)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 5a4cf5ab784..e7619b362a8 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -239,6 +239,9 @@ class Trainer:
         self.hp_name = None
         self.deepspeed = None
 
+        # force device and distributed setup init explicitly
+        args._setup_devices
+
         if model is None:
             if model_init is not None:
                 self.model_init = model_init
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 32af14bd609..1054929b4e5 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -561,6 +561,12 @@ class TrainingArguments:
             import deepspeed
 
             deepspeed.init_distributed()
+
+            # workaround for setups like notebooks where the launcher can't be used,
+            # but deepspeed requires a dist env.
+            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
+            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
+
             device = torch.device("cuda", self.local_rank)
             self._n_gpu = 1
         elif self.local_rank == -1: