oob performance improvement for cpu DDP (#18595)

* oob performance improvement for cpu DDP

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* add is_psutil_available check

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
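
For context on the first change: when PyTorch reports a single intra-op thread and neither OMP_NUM_THREADS nor MKL_NUM_THREADS is set, the patch divides the host's physical cores evenly across the DDP workers on that node and passes the result to torch.set_num_threads(). A minimal standalone sketch of that budgeting logic (the variable names and the LOCAL_WORLD_SIZE lookup below are illustrative, not the exact code in training_args.py):

import os

import psutil
import torch


def default_threads_per_process(local_world_size: int) -> int:
    # Split the physical cores evenly across the workers on this host,
    # never going below one thread per process.
    physical_cores = psutil.cpu_count(logical=False) or 1
    return max(physical_cores // local_world_size, 1)


# Only take over when nothing has been configured explicitly: torch reports
# a single intra-op thread and the OMP/MKL env vars are unset.
local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", "1"))
if (
    torch.get_num_threads() == 1
    and not os.environ.get("OMP_NUM_THREADS")
    and not os.environ.get("MKL_NUM_THREADS")
):
    torch.set_num_threads(default_threads_per_process(local_world_size))
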
Author: Wang, Yi (committed by GitHub)
Date: 2022-08-31 20:35:10 +08:00
Parent: c3be98ebab
Commit: cdde85a0a0

@@ -39,6 +39,7 @@ from .utils import (
     ccl_version,
     get_full_repo_name,
     is_accelerate_available,
+    is_psutil_available,
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_torch_available,
@@ -1342,6 +1343,21 @@ class TrainingArguments:
                             "Looks like distributed multinode run but MASTER_ADDR env not set, "
                             "please try exporting rank 0's hostname as MASTER_ADDR"
                         )
+                if (
+                    torch.get_num_threads() == 1
+                    and get_int_from_env(["OMP_NUM_THREADS", "MKL_NUM_THREADS"], 0) == 0
+                    and is_psutil_available()
+                ):
+                    import psutil
+
+                    num_cpu_threads_per_process = int(psutil.cpu_count(logical=False) / local_size)
+                    if num_cpu_threads_per_process == 0:
+                        num_cpu_threads_per_process = 1
+                    torch.set_num_threads(num_cpu_threads_per_process)
+                    logger.info(
+                        f"num_cpu_threads_per_process unset, we set it at {num_cpu_threads_per_process} to improve oob"
+                        " performance."
+                    )
                 torch.distributed.init_process_group(backend=self.xpu_backend, rank=rank, world_size=size)
         elif is_torch_tpu_available():
             device = xm.xla_device()
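
The second change guards the new branch behind is_psutil_available so that psutil stays an optional dependency: when it is not installed, the auto-tuning is simply skipped and PyTorch keeps its default thread count. Such availability helpers are typically implemented by probing for the module without importing it; a sketch of that pattern (not necessarily the exact code in transformers.utils):

import importlib.util

# Probed once at import time; find_spec() locates the package without importing it.
_psutil_available = importlib.util.find_spec("psutil") is not None


def is_psutil_available() -> bool:
    return _psutil_available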