From 89a0a9eaceaba3b6d25cbc109f4febd147e0aa43 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 21 Mar 2023 17:00:05 -0700 Subject: [PATCH] [deepspeed] offload + non-cpuadam optimizer exception doc (#22044) * [deepspeed] offload + non-cpuadam optimizer exception doc * deps --- docs/source/en/main_classes/deepspeed.mdx | 13 +++++++++++-- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/en/main_classes/deepspeed.mdx b/docs/source/en/main_classes/deepspeed.mdx index 3ee96f0b041..2b5ea6a7435 100644 --- a/docs/source/en/main_classes/deepspeed.mdx +++ b/docs/source/en/main_classes/deepspeed.mdx @@ -1293,8 +1293,17 @@ If you want to use another optimizer which is not listed above, you will have to } ``` -Similarly to `AdamW`, you can configure other officially supported optimizers. Just remember that may have different -config values. e.g. for Adam you will want `weight_decay` around `0.01`. +Similarly to `AdamW`, you can configure other officially supported optimizers. Just remember that those may have different config values, e.g. for Adam you will want `weight_decay` around `0.01`. + +Additionally, offload works best when it's used with DeepSpeed's CPU Adam optimizer. If you want to use a different optimizer with offload, since `deepspeed==0.8.3` you also need to add: + + +```json +{ + "zero_force_ds_cpu_optimizer": false +} +``` +to the top-level configuration. 
diff --git a/setup.py b/setup.py index bddf49e4645..669c8d959ca 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,7 @@ _deps = [ "dataclasses", "datasets!=2.5.0", "decord==0.6.0", - "deepspeed>=0.6.5", + "deepspeed>=0.8.3", "dill<0.3.5", "evaluate>=0.2.0", "fairscale>0.3", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index c0cacd7596e..7c596c26683 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -12,7 +12,7 @@ deps = { "dataclasses": "dataclasses", "datasets": "datasets!=2.5.0", "decord": "decord==0.6.0", - "deepspeed": "deepspeed>=0.6.5", + "deepspeed": "deepspeed>=0.8.3", "dill": "dill<0.3.5", "evaluate": "evaluate>=0.2.0", "fairscale": "fairscale>0.3",