transformers/tests/sagemaker/conftest.py
Philipp Schmid 604c085087
Sagemaker test (#10925)
* init

* first working test

* added todo for setup.py

* working test for single node multi node ddp and smd

* added tensorflow single node test

* added directory for pytorch and tensorflow due to different requirements.txt

* added directory for pytorch and tensorflow

* added comment for run_glue until it is available

* added output_dir to it

* smaller dataset to make test running faster

* adjust HP and script

* adjusted parameter for tensorflow

* refactored test scripts

* adjusted make file

* init

* first working test

* added todo for setup.py

* working test for single node multi node ddp and smd

* added tensorflow single node test

* added directory for pytorch and tensorflow due to different requirements.txt

* added directory for pytorch and tensorflow

* added comment for run_glue until it is available

* added output_dir to it

* smaller dataset to make test running faster

* adjust HP and script

* adjusted parameter for tensorflow

* refactored test scripts

* adjusted make file

* updated dlc container

* commented in all tests

* added both ecr images

* added new master branches

* debug

* added new datasets version

* init

* strange rebase bug

* removed changes

* changed min version for tests to work

* updated DLC

* added model parallel test

* removed test files

* removed test files

* tested with ned dlc

* added correct sagemaker sdk version

* adjust DLCs for official one

* reworked tests

* quality

* removed default profile added documentation to it

* added step in release for sagemaker tests

* reverted version for example script removed duplicated script and added install from master to requirements.txt

* removed mistaken .DS_Stores from mac

* fixed tests

* added Sylvains feedback

* make style

* added lysandre's feedback
2021-03-30 08:28:02 +02:00

66 lines
2.1 KiB
Python

# we define a fixture function below and it will be "used" by
# referencing its name from tests
import os
import pytest
from attr import dataclass
os.environ["AWS_DEFAULT_REGION"] = "us-east-1" # defaults region
@dataclass
class SageMakerTestEnvironment:
framework: str
role = "arn:aws:iam::558105141721:role/sagemaker_execution_role"
hyperparameters = {
"task_name": "mnli",
"per_device_train_batch_size": 32,
"per_device_eval_batch_size": 32,
"do_train": True,
"do_eval": True,
"do_predict": True,
"output_dir": "/opt/ml/model",
"overwrite_output_dir": True,
"max_steps": 500,
"save_steps": 5500,
}
distributed_hyperparameters = {**hyperparameters, "max_steps": 1000}
@property
def metric_definitions(self) -> str:
if self.framework == "pytorch":
return [
{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
]
else:
return [
{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": "loss.*=\D*(.*?)]?$"},
{"Name": "eval_loss", "Regex": "sparse_categorical_accuracy.*=\D*(.*?)]?$"},
]
@property
def base_job_name(self) -> str:
return f"{self.framework}-transfromers-test"
@property
def test_path(self) -> str:
return f"./tests/sagemaker/scripts/{self.framework}"
@property
def image_uri(self) -> str:
if self.framework == "pytorch":
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04"
else:
return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04"
@pytest.fixture(scope="class")
def sm_env(request):
request.cls.env = SageMakerTestEnvironment(framework=request.cls.framework)