
* init
* first working test
* added todo for setup.py
* working test for single node multi node ddp and smd
* added tensorflow single node test
* added directory for pytorch and tensorflow due to different requirements.txt
* added directory for pytorch and tensorflow
* added comment for run_glue until it is available
* added output_dir to it
* smaller dataset to make test running faster
* adjust HP and script
* adjusted parameter for tensorflow
* refactored test scripts
* adjusted make file
* updated dlc container
* commented in all tests
* added both ecr images
* added new master branches
* debug
* added new datasets version
* init
* strange rebase bug
* removed changes
* changed min version for tests to work
* updated DLC
* added model parallel test
* removed test files
* removed test files
* tested with ned dlc
* added correct sagemaker sdk version
* adjust DLCs for official one
* reworked tests
* quality
* removed default profile added documentation to it
* added step in release for sagemaker tests
* reverted version for example script removed duplicated script and added install from master to requirements.txt
* removed mistaken .DS_Stores from mac
* fixed tests
* added Sylvains feedback
* make style
* added lysandre's feedback
import json
import logging
import os
import subprocess
from argparse import ArgumentParser


logger = logging.getLogger(__name__)


def parse_args():
    # SageMaker passes hyperparameters as unknown "--key value" CLI arguments.
    # Register every unknown flag dynamically so it can be forwarded to
    # run_glue.py unchanged.
    parser = ArgumentParser()
    parsed, unknown = parser.parse_known_args()
    for arg in unknown:
        if arg.startswith(("-", "--")):
            parser.add_argument(arg.split("=")[0])

    return parser.parse_args()


def main():
    args = parse_args()

    # Cluster topology is provided by the SageMaker training toolkit via
    # environment variables.
    port = 8888
    num_gpus = int(os.environ["SM_NUM_GPUS"])
    hosts = json.loads(os.environ["SM_HOSTS"])
    num_nodes = len(hosts)
    current_host = os.environ["SM_CURRENT_HOST"]
    rank = hosts.index(current_host)
    os.environ["NCCL_DEBUG"] = "INFO"

    if num_nodes > 1:
        # Multi-node DDP: the first host in SM_HOSTS acts as the rendezvous master.
        cmd = f"""python -m torch.distributed.launch \
        --nnodes={num_nodes} \
        --node_rank={rank} \
        --nproc_per_node={num_gpus} \
        --master_addr={hosts[0]} \
        --master_port={port} \
        ./run_glue.py \
        {"".join([f" --{parameter} {value}" for parameter, value in args.__dict__.items()])}"""
    else:
        # Single-node DDP: spawn one process per GPU on this host.
        cmd = f"""python -m torch.distributed.launch \
        --nproc_per_node={num_gpus} \
        ./run_glue.py \
        {"".join([f" --{parameter} {value}" for parameter, value in args.__dict__.items()])}"""

    try:
        subprocess.run(cmd, shell=True)
    except Exception as e:
        logger.info(e)


if __name__ == "__main__":
    main()
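
For context, this launcher runs as the entry point of a SageMaker training job, where SM_NUM_GPUS, SM_HOSTS, and SM_CURRENT_HOST are populated automatically by the training toolkit. The snippet below is a minimal sketch of how such a job could be started with the SageMaker Python SDK's HuggingFace estimator; the entry-point filename, IAM role, instance settings, and hyperparameter values are illustrative assumptions rather than values taken from this file.

# Minimal usage sketch (assumptions: SageMaker Python SDK >= 2.31 installed,
# an IAM execution role available, and this launcher plus run_glue.py placed
# in ./scripts; the filename "launch_ddp.py" and all values are hypothetical).
from sagemaker.huggingface import HuggingFace

estimator = HuggingFace(
    entry_point="launch_ddp.py",  # hypothetical name for the launcher above
    source_dir="./scripts",       # must also contain run_glue.py
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder role ARN
    instance_type="ml.p3.16xlarge",  # 8 GPUs per host -> SM_NUM_GPUS=8
    instance_count=2,                # two hosts -> the multi-node DDP branch
    transformers_version="4.4",
    pytorch_version="1.6",
    py_version="py36",
    # Hyperparameters arrive at the entry point as "--key value" arguments,
    # which parse_args() picks up and forwards to run_glue.py.
    hyperparameters={
        "model_name_or_path": "distilbert-base-uncased",
        "task_name": "mnli",
        "per_device_train_batch_size": 16,
        "output_dir": "/opt/ml/model",
    },
)
estimator.fit()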