device-agnostic deepspeed testing (#27342)

Author: Hz, Ji
Date: 2023-11-09 19:34:13 +08:00 (committed by GitHub)
parent 9999b73968
commit c5d7754b11
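This commit makes the DeepSpeed test suite device-agnostic: CUDA-only helpers (get_gpu_count, require_torch_gpu, require_torch_multi_gpu, is_torch_bf16_gpu_available) are swapped for backend-neutral ones (backend_device_count, require_torch_accelerator, require_torch_multi_accelerator, is_torch_bf16_available_on_device), so the same tests run on any supported accelerator rather than only on CUDA GPUs. A minimal sketch of the resulting pattern (the test class and assertion below are hypothetical; the imports and decorator are the ones this diff introduces):

    from transformers.testing_utils import (
        TestCasePlus,
        backend_device_count,
        require_torch_accelerator,
        torch_device,
    )


    @require_torch_accelerator  # backend-neutral replacement for @require_torch_gpu
    class ExampleDeviceAgnosticTest(TestCasePlus):
        def test_sees_at_least_one_device(self):
            # backend_device_count(torch_device) replaces get_gpu_count(), so
            # the same assertion holds on CUDA, XPU, NPU, or another backend.
            self.assertGreaterEqual(backend_device_count(torch_device), 1)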


@@ -38,17 +38,18 @@ from transformers.testing_utils import (
     CaptureStderr,
     LoggingLevel,
     TestCasePlus,
+    backend_device_count,
     execute_subprocess_async,
-    get_gpu_count,
     mockenv_context,
     require_deepspeed,
     require_optuna,
-    require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
     slow,
+    torch_device,
 )
 from transformers.trainer_utils import get_last_checkpoint, set_seed
-from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_gpu_available
+from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device


 if is_torch_available():
@@ -125,7 +126,7 @@ def get_launcher(distributed=False):
     # - it won't be able to handle that
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with more gpus because we use very little data)
-    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1
     master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()

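The launcher hunk above only changes where the device count comes from. A standalone sketch of the same computation, assuming the imports shown in the first hunk (the master_port default here is hypothetical; the real helper derives it via get_master_port(real_launcher=True)):

    from transformers.testing_utils import backend_device_count, torch_device


    def get_launcher_cmd(distributed=False, master_port=10999):
        # Cap distributed runs at 2 devices, per the comment in the hunk above;
        # a non-distributed run always launches a single process.
        num_devices = min(2, backend_device_count(torch_device)) if distributed else 1
        return f"deepspeed --num_nodes 1 --num_gpus {num_devices} --master_port {master_port}".split()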
@@ -145,7 +146,7 @@ optims = [HF_OPTIM, DS_OPTIM]
 schedulers = [HF_SCHEDULER, DS_SCHEDULER]

 stages = [ZERO2, ZERO3]
-if is_torch_bf16_gpu_available():
+if is_torch_bf16_available_on_device(torch_device):
     dtypes = [FP16, BF16]
 else:
     dtypes = [FP16]
@@ -165,7 +166,7 @@ params_with_optims_and_schedulers = list(itertools.product(stages, dtypes, optims, schedulers))


 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     """
     Testing non-Trainer DeepSpeed integration
@@ -273,7 +274,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):


 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
     """
@@ -875,7 +876,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):

 @slow
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TestDeepSpeedWithLauncher(TestCasePlus):
     """This class is for testing via an external script - can do multiple gpus"""
@@ -896,7 +897,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
     #

     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_basic_distributed(self, stage, dtype):
         self.run_and_check(stage=stage, dtype=dtype, distributed=True)
@@ -927,7 +928,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
     )
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_fp32_distributed(self, stage, dtype):
         # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
         # therefore no quality checks, just basic completion checks are done
@@ -968,9 +969,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)

     @parameterized.expand(["bf16", "fp16", "fp32"])
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_inference(self, dtype):
-        if dtype == "bf16" and not is_torch_bf16_gpu_available():
+        if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
             self.skipTest("test requires bfloat16 hardware support")

         # this is just inference, so no optimizer should be loaded
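is_torch_bf16_available_on_device(torch_device) generalizes the old is_torch_bf16_gpu_available() to whatever torch_device resolves to, so both the module-level dtypes list and this per-test skip behave correctly on non-CUDA hardware. A hedged sketch of the guard in isolation (the test class and method are illustrative only; the imports and skip logic mirror the diff):

    from transformers.testing_utils import TestCasePlus, torch_device
    from transformers.utils import is_torch_bf16_available_on_device


    class ExampleBf16Guard(TestCasePlus):
        def test_bf16_guard(self):
            # Skip bf16 runs on accelerators without bfloat16 support, exactly
            # as test_inference above does for its "bf16" parameter.
            if not is_torch_bf16_available_on_device(torch_device):
                self.skipTest("test requires bfloat16 hardware support")
            # ... bf16-dependent assertions would go here ...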