Add setup for TPU CI to run every hour. (#6219)

* Add setup for TPU CI to run every hour. * Re-organize config.yml Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2025-07-14 18:18:24 +06:00 · 2020-08-07 08:17:07 -07:00 · 2020-08-07 08:17:07 -07:00 · 1b8a7ffcfd
commit 1b8a7ffcfd
parent 6695450a23
6 changed files with 253 additions and 15 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,4 +1,67 @@
-version: 2
+version: 2.1
 orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0
 # TPU REFERENCES
 references:
    # Reusable step: clone GoogleCloudPlatform/ml-testing-accelerators and check
    # out a pinned commit as a local branch named `stable`. The deploy step
    # passes this checkout to jsonnet as an import path (-J).
    checkout_ml_testing: &checkout_ml_testing
        run:
            name: Checkout ml-testing-accelerators
            command: |
                git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
                cd ml-testing-accelerators
                git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
                git checkout stable
    # Reusable step: build the TPU test image from
    # docker/transformers-pytorch-tpu and push it to
    # $GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID.
    # - Without $CIRCLE_PR_NUMBER (e.g. scheduled runs) the Dockerfile's default
    #   GITHUB_REF is used.
    # - With $CIRCLE_PR_NUMBER, GITHUB_REF is set to pull/<number>/head.
    # Both branches must pass the build context (`.`) as the final positional
    # argument; `docker build` fails without it.
    build_push_docker: &build_push_docker
        run:
            name: Configure Docker
            command: |
                gcloud --quiet auth configure-docker
                cd docker/transformers-pytorch-tpu
                if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
                docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
    # Reusable step: run the TPU test as a Kubernetes job and report its result.
    # The command is one `&&` chain that:
    #   1. installs jsonnet via `go get` and adds $HOME/go/bin to PATH,
    #   2. creates the dataset volume objects (errors ignored via `|| true`),
    #   3. renders bert-base-cased.jsonnet with the image path/tag and creates
    #      the job, deriving the job name from kubectl's "job.batch/<name>
    #      created" output,
    #   4. polls the job status up to max_checks times, 30s apart,
    #   5. prints the job's container logs via `gcloud logging read`,
    #   6. deletes the pushed image tag,
    #   7. exits with status_code: 0 = succeeded, 1 = failed, 2 = still not
    #      finished after the last check.
    # Because everything is chained with `&&`, a failure in any earlier command
    # short-circuits the remaining ones.
    deploy_cluster: &deploy_cluster
        run:
            name: Deploy the job on the kubernetes cluster
            command: |
                go get github.com/google/go-jsonnet/cmd/jsonnet && \
                export PATH=$PATH:$HOME/go/bin && \
                kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
                job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
                job_name=${job_name#job.batch/} && \
                job_name=${job_name% created} && \
                echo "Waiting on kubernetes job: $job_name" && \
                i=0 && \
                # 30 checks spaced 30s apart = 900s total.
                max_checks=30 && \
                status_code=2 && \
                # Check on the job periodically. Set the status code depending on what
                # happened to the job in Kubernetes. If we try max_checks times and
                # still the job hasn't finished, give up and return the starting
                # non-zero status code.
                while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
                echo "Done waiting. Job status code: $status_code" && \
                # Allow time for logs to flush.
                sleep 60 && \
                echo "JOB_NAME: $job_name" && \
                gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID && \
                echo "Done with log retrieval attempt." && \
                gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
                exit $status_code
    # Reusable step: list, then delete, every Kubernetes job whose 4th column
    # in `kubectl get job` output contains a number followed by `h` or `d`.
    # NOTE(review): when no job matches, `kubectl delete job` is invoked with
    # no names -- confirm whether that makes this step fail on a clean cluster.
    delete_gke_jobs: &delete_gke_jobs
        run:
            name: Delete GKE Jobs
            command: |
                # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
                # that has been around longer than 1hr. First print all columns for
                # matches, then execute the delete.
                kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
                kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
 jobs:
    run_tests_torch_and_tf:
        working_directory: ~/transformers
@ -50,7 +113,6 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/output.txt
                  destination: test_output.txt
    run_tests_tf:
        working_directory: ~/transformers
        docker:
@ -193,6 +255,35 @@ jobs:
            - checkout
            - run: pip install requests
            - run: python ./utils/link_tester.py
 # TPU JOBS
    # Builds the TPU test image and runs the example tests as a GKE job.
    # Steps, in order: check out this repo, install Go, clone
    # ml-testing-accelerators, install the GKE tooling and log in to
    # $GKE_CLUSTER, enable remote Docker, build/push the image, then deploy the
    # job and wait for its result.
    run_examples_tpu:
        docker:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - go/install
            - *checkout_ml_testing
            - gcp-gke/install
            - gcp-gke/update-kubeconfig-with-credentials:
                  cluster: $GKE_CLUSTER
                  perform-login: true
            - setup_remote_docker
            - *build_push_docker
            - *deploy_cluster
    # Housekeeping job: logs in to $GKE_CLUSTER and runs the delete_gke_jobs
    # step to remove old Kubernetes jobs.
    cleanup-gke-jobs:
        docker:
            - image: circleci/python:3.6
        steps:
            - gcp-gke/install
            - gcp-gke/update-kubeconfig-with-credentials:
                  cluster: $GKE_CLUSTER
                  perform-login: true
            - *delete_gke_jobs
 workflow_filters: &workflow_filters
    filters:
        branches:
@ -211,3 +302,15 @@ workflows:
            - run_tests_tf
            - build_doc
            - deploy_doc: *workflow_filters
    # Scheduled workflow for the TPU CI: cleans up old GKE jobs and runs the
    # TPU example tests. Only triggers on the master branch.
    tpu_testing_jobs:
        triggers:
            - schedule:
                # Set to run at the first minute of every hour.
                # Cron fields: minute hour day-of-month month day-of-week.
                # Minute 0 with a wildcard hour fires hourly; a fixed hour
                # (e.g. "0 8 * * *") would fire only once per day.
                cron: "0 * * * *"
                filters:
                    branches:
                        only:
                            - master
        jobs:
            - cleanup-gke-jobs
            - run_examples_tpu
--- a/docker/transformers-pytorch-tpu/Dockerfile
+++ b/docker/transformers-pytorch-tpu/Dockerfile
@ -0,0 +1,65 @@
 # Base image: slim Google Cloud SDK image (gsutil is used further down to
 # fetch the wheels).
 FROM google/cloud-sdk:slim
 # Build args.
 # GITHUB_REF selects the transformers ref that is fetched and installed below;
 # it defaults to master and is overridden by CI for pull requests.
 ARG GITHUB_REF=refs/heads/master
 # TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
 # wheels available; see below.
 ENV PYTHON_VERSION=3.6
 # Build tooling plus git/curl, used by the later steps in this file.
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         cmake \
         git \
         curl \
         ca-certificates
 # Install conda and python.
 # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
 # Downloads a pinned Miniconda installer, runs it in batch mode (-b) and
 # removes the installer afterwards.
 RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh  && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b && \
    rm ~/miniconda.sh
 ENV PATH=/root/miniconda3/bin:$PATH
 # Dedicated conda env named "container" pinned to $PYTHON_VERSION.
 RUN conda create -y --name container python=$PYTHON_VERSION
 # Run the rest of commands within the new conda env.
 # Use absolute path to appease Codefactor.
 SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
 RUN conda install -y python=$PYTHON_VERSION mkl
 # Replace any existing torch with the nightly torch / torch_xla / torchvision
 # wheels copied from the gs://tpu-pytorch/wheels bucket, delete the downloaded
 # wheel files, then install libomp5.
 # NOTE(review): the wheel names are single-quoted, and bash does not expand
 # ${PYTHON_VERSION/./} inside single quotes. Presumably the expansion happens
 # because the command passes through `conda run` (see SHELL above) -- confirm
 # before changing the quoting or the SHELL instruction.
 RUN pip uninstall -y torch && \
    # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
    gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
    gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
    gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
    pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
    apt-get install -y libomp5
 # Point the dynamic linker at the conda env's lib directory. The path must be
 # absolute (same /root/miniconda3 prefix as PATH and SHELL above); a relative
 # entry would be resolved against the process's current working directory.
 ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
 # Install huggingface/transformers at the current PR, plus dependencies.
 # $GITHUB_REF is fetched into a local branch named CI, which is then checked
 # out and pip-installed together with the examples requirements and pytest.
 RUN git clone https://github.com/huggingface/transformers.git && \
    cd transformers && \
    git fetch origin $GITHUB_REF:CI && \
    git checkout CI && \
    cd .. && \
    pip install ./transformers && \
    pip install -r ./transformers/examples/requirements.txt && \
    pip install pytest
 # Build-time smoke checks: both packages must import and report a version.
 RUN python -c "import torch_xla; print(torch_xla.__version__)"
 RUN python -c "import transformers as trf; print(trf.__version__)"
 # `conda init bash` lets the entrypoint script run `conda activate` after
 # sourcing ~/.bashrc.
 RUN conda init bash
 COPY docker-entrypoint.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh
 ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
 CMD ["bash"]
--- a/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
+++ b/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
@ -0,0 +1,38 @@
 # One-shot TPU test job for the transformers XLA examples. Template imports
 # are resolved relative to the jsonnet library path (CI passes
 # ml-testing-accelerators/ via -J).
 local base = import 'templates/base.libsonnet';
 local tpus = import 'templates/tpus.libsonnet';
 local utils = import "templates/utils.libsonnet";
 local volumes = import "templates/volumes.libsonnet";
 local bertBaseCased = base.BaseTest {
  frameworkPrefix: "hf",
  modelName: "bert-base-cased",
  mode: "example",
  configMaps: [],
  timeout: 3600, # 1 hour, in seconds
  # Image path and tag are supplied on the jsonnet command line via
  # --ext-str image=... and --ext-str image-tag=...
  image: std.extVar('image'),
  imageTag: std.extVar('image-tag'),
  tpuSettings+: {
    softwareVersion: "pytorch-nightly",
  },
  accelerator: tpus.v3_8,
  # Mounts the volume named huggingface-cluster-disk at /datasets.
  volumeMap+: {
    datasets: volumes.PersistentVolumeSpec {
      name: "huggingface-cluster-disk",
      mountPath: "/datasets",
    },
  },
  # Runs the XLA example tests; the script's final status is pytest's exit
  # code, captured before the echo so the echo cannot mask it.
  command: utils.scriptCommand(
    |||
      python -m pytest -s transformers/examples/test_xla_examples.py -v
      test_exit_code=$?
      echo "\nFinished running commands.\n"
      test $test_exit_code -eq 0
    |||
  ),
 };
 bertBaseCased.oneshotJob
--- a/docker/transformers-pytorch-tpu/dataset.yaml
+++ b/docker/transformers-pytorch-tpu/dataset.yaml
@ -0,0 +1,32 @@
 # PersistentVolume backed by the GCE persistent disk
 # `huggingface-cluster-disk`, mounted read-only and pre-bound (claimRef) to
 # the claim `huggingface-cluster-disk-claim` in the default namespace.
 apiVersion: v1
 kind: PersistentVolume
 metadata:
  name: huggingface-cluster-disk
 spec:
  # Empty storage class, matching the claim that binds to this volume.
  storageClassName: ""
  capacity:
    storage: 500Gi
  accessModes:
    - ReadOnlyMany
  claimRef:
    namespace: default
    name: huggingface-cluster-disk-claim
  gcePersistentDisk:
    pdName: huggingface-cluster-disk
    fsType: ext4
    readOnly: true
 ---
 # Claim that binds to the huggingface-cluster-disk PersistentVolume (same
 # empty storage class and ReadOnlyMany access mode).
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: huggingface-cluster-disk-claim
 spec:
  # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
  # A nil storageClassName value uses the default StorageClass. For details, see
  # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
  storageClassName: ""
  accessModes:
    - ReadOnlyMany
  resources:
    requests:
      storage: 1Ki
--- a/docker/transformers-pytorch-tpu/docker-entrypoint.sh
+++ b/docker/transformers-pytorch-tpu/docker-entrypoint.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Container entrypoint: activates the `container` conda env, derives
 # XRT_TPU_CONFIG from the TPU endpoint variable, then replaces this shell
 # with the requested command.
 # ~/.bashrc is sourced first so that `conda activate` is available.
 source ~/.bashrc
 echo "running docker-entrypoint.sh"
 conda activate container
 echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
 echo "printed TPU info"
 # Drop the first 7 characters of the endpoint value before embedding it
 # (presumably a "grpc://" scheme prefix -- confirm against the cluster).
 export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
 # Hand control to the command passed to the container (CMD or user args).
 # Nothing may follow "$@" on this line: trailing text would be glued onto
 # the last argument.
 exec "$@"
--- a/examples/test_xla_examples.py
+++ b/examples/test_xla_examples.py
@ -14,7 +14,6 @@
 # limitations under the License.
 import argparse
 import logging
 import sys
 import unittest
@ -29,13 +28,6 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger()
 def get_setup_file():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f")
    args = parser.parse_args()
    return args.f
@require_torch_tpu
 class TorchXLAExamplesTests(unittest.TestCase):
    def test_run_glue(self):
@ -47,13 +39,13 @@ class TorchXLAExamplesTests(unittest.TestCase):
        output_directory = "run_glue_output"
        testargs = f"""
-            text-classification/run_glue.py
+            transformers/examples/text-classification/run_glue.py
            --num_cores=8
-            text-classification/run_glue.py
+            transformers/examples/text-classification/run_glue.py
            --do_train
            --do_eval
            --task_name=MRPC
-            --data_dir=../glue_data/MRPC
+            --data_dir=/datasets/glue_data/MRPC
            --cache_dir=./cache_dir
            --num_train_epochs=1
            --max_seq_length=128
@ -87,5 +79,5 @@ class TorchXLAExamplesTests(unittest.TestCase):
                # Assert that the model trains
                self.assertGreaterEqual(value, 0.70)
-            # Assert that the script takes less than 100 seconds to make sure it doesn't hang.
+            # Assert that the script takes less than 300 seconds to make sure it doesn't hang.
-            self.assertLess(end - start, 100)
+            self.assertLess(end - start, 300)