mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-14 18:18:24 +06:00
Add setup for TPU CI to run every hour. (#6219)
* Add setup for TPU CI to run every hour. * Re-organize config.yml Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
parent
6695450a23
commit
1b8a7ffcfd
@ -1,4 +1,67 @@
|
|||||||
version: 2
|
version: 2.1
|
||||||
|
orbs:
|
||||||
|
gcp-gke: circleci/gcp-gke@1.0.4
|
||||||
|
go: circleci/go@1.3.0
|
||||||
|
|
||||||
|
# TPU REFERENCES
|
||||||
|
references:
|
||||||
|
checkout_ml_testing: &checkout_ml_testing
|
||||||
|
run:
|
||||||
|
name: Checkout ml-testing-accelerators
|
||||||
|
command: |
|
||||||
|
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
|
||||||
|
cd ml-testing-accelerators
|
||||||
|
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
|
||||||
|
git checkout stable
|
||||||
|
build_push_docker: &build_push_docker
|
||||||
|
run:
|
||||||
|
name: Configure Docker
|
||||||
|
command: |
|
||||||
|
gcloud --quiet auth configure-docker
|
||||||
|
cd docker/transformers-pytorch-tpu
|
||||||
|
if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
|
||||||
|
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
|
||||||
|
deploy_cluster: &deploy_cluster
|
||||||
|
run:
|
||||||
|
name: Deploy the job on the kubernetes cluster
|
||||||
|
command: |
|
||||||
|
go get github.com/google/go-jsonnet/cmd/jsonnet && \
|
||||||
|
export PATH=$PATH:$HOME/go/bin && \
|
||||||
|
kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
|
||||||
|
job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
|
||||||
|
job_name=${job_name#job.batch/} && \
|
||||||
|
job_name=${job_name% created} && \
|
||||||
|
echo "Waiting on kubernetes job: $job_name" && \
|
||||||
|
i=0 && \
|
||||||
|
# 30 checks spaced 30s apart = 900s total.
|
||||||
|
max_checks=30 && \
|
||||||
|
status_code=2 && \
|
||||||
|
# Check on the job periodically. Set the status code depending on what
|
||||||
|
# happened to the job in Kubernetes. If we try max_checks times and
|
||||||
|
# still the job hasn't finished, give up and return the starting
|
||||||
|
# non-zero status code.
|
||||||
|
while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
|
||||||
|
echo "Done waiting. Job status code: $status_code" && \
|
||||||
|
# Allow time for logs to flush.
|
||||||
|
sleep 60 && \
|
||||||
|
echo "JOB_NAME: $job_name" && \
|
||||||
|
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID && \
|
||||||
|
echo "Done with log retrieval attempt." && \
|
||||||
|
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
|
||||||
|
exit $status_code
|
||||||
|
delete_gke_jobs: &delete_gke_jobs
|
||||||
|
run:
|
||||||
|
name: Delete GKE Jobs
|
||||||
|
command: |
|
||||||
|
# Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
|
||||||
|
# that has been around longer than 1hr. First print all columns for
|
||||||
|
# matches, then execute the delete.
|
||||||
|
kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
|
||||||
|
kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
run_tests_torch_and_tf:
|
run_tests_torch_and_tf:
|
||||||
working_directory: ~/transformers
|
working_directory: ~/transformers
|
||||||
@ -50,7 +113,6 @@ jobs:
|
|||||||
- store_artifacts:
|
- store_artifacts:
|
||||||
path: ~/transformers/output.txt
|
path: ~/transformers/output.txt
|
||||||
destination: test_output.txt
|
destination: test_output.txt
|
||||||
|
|
||||||
run_tests_tf:
|
run_tests_tf:
|
||||||
working_directory: ~/transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
@ -193,6 +255,35 @@ jobs:
|
|||||||
- checkout
|
- checkout
|
||||||
- run: pip install requests
|
- run: pip install requests
|
||||||
- run: python ./utils/link_tester.py
|
- run: python ./utils/link_tester.py
|
||||||
|
|
||||||
|
# TPU JOBS
|
||||||
|
run_examples_tpu:
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.6
|
||||||
|
environment:
|
||||||
|
OMP_NUM_THREADS: 1
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- go/install
|
||||||
|
- *checkout_ml_testing
|
||||||
|
- gcp-gke/install
|
||||||
|
- gcp-gke/update-kubeconfig-with-credentials:
|
||||||
|
cluster: $GKE_CLUSTER
|
||||||
|
perform-login: true
|
||||||
|
- setup_remote_docker
|
||||||
|
- *build_push_docker
|
||||||
|
- *deploy_cluster
|
||||||
|
cleanup-gke-jobs:
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.6
|
||||||
|
steps:
|
||||||
|
- gcp-gke/install
|
||||||
|
- gcp-gke/update-kubeconfig-with-credentials:
|
||||||
|
cluster: $GKE_CLUSTER
|
||||||
|
perform-login: true
|
||||||
|
- *delete_gke_jobs
|
||||||
workflow_filters: &workflow_filters
|
workflow_filters: &workflow_filters
|
||||||
filters:
|
filters:
|
||||||
branches:
|
branches:
|
||||||
@ -211,3 +302,15 @@ workflows:
|
|||||||
- run_tests_tf
|
- run_tests_tf
|
||||||
- build_doc
|
- build_doc
|
||||||
- deploy_doc: *workflow_filters
|
- deploy_doc: *workflow_filters
|
||||||
|
tpu_testing_jobs:
|
||||||
|
triggers:
|
||||||
|
- schedule:
|
||||||
|
# Runs once a day at 08:00 UTC (cron fields: minute hour day-of-month month day-of-week).
|
||||||
|
cron: "0 8 * * *"
|
||||||
|
filters:
|
||||||
|
branches:
|
||||||
|
only:
|
||||||
|
- master
|
||||||
|
jobs:
|
||||||
|
- cleanup-gke-jobs
|
||||||
|
- run_examples_tpu
|
||||||
|
65
docker/transformers-pytorch-tpu/Dockerfile
Normal file
65
docker/transformers-pytorch-tpu/Dockerfile
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
FROM google/cloud-sdk:slim
|
||||||
|
|
||||||
|
# Build args.
|
||||||
|
ARG GITHUB_REF=refs/heads/master
|
||||||
|
|
||||||
|
# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
|
||||||
|
# wheels available; see below.
|
||||||
|
ENV PYTHON_VERSION=3.6
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
git \
|
||||||
|
curl \
|
||||||
|
ca-certificates
|
||||||
|
|
||||||
|
# Install conda and python.
|
||||||
|
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
|
||||||
|
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
|
||||||
|
chmod +x ~/miniconda.sh && \
|
||||||
|
~/miniconda.sh -b && \
|
||||||
|
rm ~/miniconda.sh
|
||||||
|
|
||||||
|
ENV PATH=/root/miniconda3/bin:$PATH
|
||||||
|
|
||||||
|
RUN conda create -y --name container python=$PYTHON_VERSION
|
||||||
|
|
||||||
|
# Run the rest of commands within the new conda env.
|
||||||
|
# Use absolute path to appease Codefactor.
|
||||||
|
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
|
||||||
|
RUN conda install -y python=$PYTHON_VERSION mkl
|
||||||
|
|
||||||
|
RUN pip uninstall -y torch && \
|
||||||
|
# Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
|
||||||
|
gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
|
||||||
|
gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
|
||||||
|
gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
|
||||||
|
pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
|
||||||
|
apt-get install -y libomp5
|
||||||
|
|
||||||
|
# Absolute path so the conda env's shared libraries are found regardless of the working directory.
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
|
||||||
|
|
||||||
|
|
||||||
|
# Install huggingface/transformers at the current PR, plus dependencies.
|
||||||
|
RUN git clone https://github.com/huggingface/transformers.git && \
|
||||||
|
cd transformers && \
|
||||||
|
git fetch origin $GITHUB_REF:CI && \
|
||||||
|
git checkout CI && \
|
||||||
|
cd .. && \
|
||||||
|
pip install ./transformers && \
|
||||||
|
pip install -r ./transformers/examples/requirements.txt && \
|
||||||
|
pip install pytest
|
||||||
|
|
||||||
|
RUN python -c "import torch_xla; print(torch_xla.__version__)"
|
||||||
|
RUN python -c "import transformers as trf; print(trf.__version__)"
|
||||||
|
RUN conda init bash
|
||||||
|
COPY docker-entrypoint.sh /usr/local/bin/
|
||||||
|
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||||
|
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
|
||||||
|
CMD ["bash"]
|
38
docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
Normal file
38
docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
local base = import 'templates/base.libsonnet';
|
||||||
|
local tpus = import 'templates/tpus.libsonnet';
|
||||||
|
local utils = import "templates/utils.libsonnet";
|
||||||
|
local volumes = import "templates/volumes.libsonnet";
|
||||||
|
|
||||||
|
local bertBaseCased = base.BaseTest {
|
||||||
|
frameworkPrefix: "hf",
|
||||||
|
modelName: "bert-base-cased",
|
||||||
|
mode: "example",
|
||||||
|
configMaps: [],
|
||||||
|
|
||||||
|
timeout: 3600, # 1 hour, in seconds
|
||||||
|
|
||||||
|
image: std.extVar('image'),
|
||||||
|
imageTag: std.extVar('image-tag'),
|
||||||
|
|
||||||
|
tpuSettings+: {
|
||||||
|
softwareVersion: "pytorch-nightly",
|
||||||
|
},
|
||||||
|
accelerator: tpus.v3_8,
|
||||||
|
|
||||||
|
volumeMap+: {
|
||||||
|
datasets: volumes.PersistentVolumeSpec {
|
||||||
|
name: "huggingface-cluster-disk",
|
||||||
|
mountPath: "/datasets",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
command: utils.scriptCommand(
|
||||||
|
|||
|
||||||
|
python -m pytest -s transformers/examples/test_xla_examples.py -v
|
||||||
|
test_exit_code=$?
|
||||||
|
echo "\nFinished running commands.\n"
|
||||||
|
test $test_exit_code -eq 0
|
||||||
|
|||
|
||||||
|
),
|
||||||
|
};
|
||||||
|
|
||||||
|
bertBaseCased.oneshotJob
|
32
docker/transformers-pytorch-tpu/dataset.yaml
Normal file
32
docker/transformers-pytorch-tpu/dataset.yaml
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolume
|
||||||
|
metadata:
|
||||||
|
name: huggingface-cluster-disk
|
||||||
|
spec:
|
||||||
|
storageClassName: ""
|
||||||
|
capacity:
|
||||||
|
storage: 500Gi
|
||||||
|
accessModes:
|
||||||
|
- ReadOnlyMany
|
||||||
|
claimRef:
|
||||||
|
namespace: default
|
||||||
|
name: huggingface-cluster-disk-claim
|
||||||
|
gcePersistentDisk:
|
||||||
|
pdName: huggingface-cluster-disk
|
||||||
|
fsType: ext4
|
||||||
|
readOnly: true
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: huggingface-cluster-disk-claim
|
||||||
|
spec:
|
||||||
|
# Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
|
||||||
|
# A nil storageClassName value uses the default StorageClass. For details, see
|
||||||
|
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
|
||||||
|
storageClassName: ""
|
||||||
|
accessModes:
|
||||||
|
- ReadOnlyMany
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Ki
|
8
docker/transformers-pytorch-tpu/docker-entrypoint.sh
Normal file
8
docker/transformers-pytorch-tpu/docker-entrypoint.sh
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
source ~/.bashrc
|
||||||
|
echo "running docker-entrypoint.sh"
|
||||||
|
conda activate container
|
||||||
|
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
|
||||||
|
echo "printed TPU info"
|
||||||
|
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
|
||||||
|
# Hand control to the container command (CMD or the arguments given to `docker run`).
exec "$@"
|
@ -14,7 +14,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import unittest
|
import unittest
|
||||||
@ -29,13 +28,6 @@ logging.basicConfig(level=logging.DEBUG)
|
|||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
def get_setup_file():
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("-f")
|
|
||||||
args = parser.parse_args()
|
|
||||||
return args.f
|
|
||||||
|
|
||||||
|
|
||||||
@require_torch_tpu
|
@require_torch_tpu
|
||||||
class TorchXLAExamplesTests(unittest.TestCase):
|
class TorchXLAExamplesTests(unittest.TestCase):
|
||||||
def test_run_glue(self):
|
def test_run_glue(self):
|
||||||
@ -47,13 +39,13 @@ class TorchXLAExamplesTests(unittest.TestCase):
|
|||||||
output_directory = "run_glue_output"
|
output_directory = "run_glue_output"
|
||||||
|
|
||||||
testargs = f"""
|
testargs = f"""
|
||||||
text-classification/run_glue.py
|
transformers/examples/text-classification/run_glue.py
|
||||||
--num_cores=8
|
--num_cores=8
|
||||||
text-classification/run_glue.py
|
transformers/examples/text-classification/run_glue.py
|
||||||
--do_train
|
--do_train
|
||||||
--do_eval
|
--do_eval
|
||||||
--task_name=MRPC
|
--task_name=MRPC
|
||||||
--data_dir=../glue_data/MRPC
|
--data_dir=/datasets/glue_data/MRPC
|
||||||
--cache_dir=./cache_dir
|
--cache_dir=./cache_dir
|
||||||
--num_train_epochs=1
|
--num_train_epochs=1
|
||||||
--max_seq_length=128
|
--max_seq_length=128
|
||||||
@ -87,5 +79,5 @@ class TorchXLAExamplesTests(unittest.TestCase):
|
|||||||
# Assert that the model trains
|
# Assert that the model trains
|
||||||
self.assertGreaterEqual(value, 0.70)
|
self.assertGreaterEqual(value, 0.70)
|
||||||
|
|
||||||
# Assert that the script takes less than 100 seconds to make sure it doesn't hang.
|
# Assert that the script takes less than 300 seconds to make sure it doesn't hang.
|
||||||
self.assertLess(end - start, 100)
|
self.assertLess(end - start, 300)
|
||||||
|
Loading…
Reference in New Issue
Block a user