Update the distributed CPU training on Kubernetes documentation (#32669)
* Update the Kubernetes CPU training example
* Add namespace arg

Signed-off-by: Dina Suehiro Jones <dina.s.jones@intel.com>
parent 20a04497a8
commit 6577c77d93
@@ -155,13 +155,20 @@ This example assumes that you have:
 The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
 extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image:
 ```dockerfile
-FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9
+FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode
+
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    google-perftools \
+    libomp-dev
 
 WORKDIR /workspace
 
 # Download and extract the transformers code
-ARG HF_TRANSFORMERS_VER="4.35.2"
-RUN mkdir transformers && \
+ARG HF_TRANSFORMERS_VER="4.44.0"
+RUN pip install --no-cache-dir \
+    transformers==${HF_TRANSFORMERS_VER} && \
+    mkdir transformers && \
     curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf -
 ```
 The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the
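The diff leaves building and publishing the image to the reader. A minimal sketch of that step, assuming a hypothetical registry host and tag (neither is specified by this commit):

```bash
# Build the image from the Dockerfile above.
# The registry host and tag are placeholders -- substitute your own.
docker build -t registry.example.com/transformers-cpu-training:4.44.0 .

# Push it somewhere the cluster nodes can pull from.
docker push registry.example.com/transformers-cpu-training:4.44.0
```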
@@ -189,7 +196,6 @@ apiVersion: "kubeflow.org/v1"
 kind: PyTorchJob
 metadata:
   name: transformers-pytorchjob
-  namespace: kubeflow
 spec:
   elasticPolicy:
     rdzvBackend: c10d
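With `namespace: kubeflow` dropped from the metadata, the namespace is now chosen at deploy time. A sketch of preparing one; the name `transformers-training` is an example, not part of this change:

```bash
# Create a namespace for the job and remember it for later kubectl calls.
kubectl create namespace transformers-training
export NAMESPACE=transformers-training
```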
@@ -206,32 +212,27 @@ spec:
         - name: pytorch
           image: <image name>:<tag> # Specify the docker image to use for the worker pods
           imagePullPolicy: IfNotPresent
-          command:
-            - torchrun
-            - /workspace/transformers/examples/pytorch/question-answering/run_qa.py
-            - --model_name_or_path
-            - "google-bert/bert-large-uncased"
-            - --dataset_name
-            - "squad"
-            - --do_train
-            - --do_eval
-            - --per_device_train_batch_size
-            - "12"
-            - --learning_rate
-            - "3e-5"
-            - --num_train_epochs
-            - "2"
-            - --max_seq_length
-            - "384"
-            - --doc_stride
-            - "128"
-            - --output_dir
-            - "/tmp/pvc-mount/output"
-            - --no_cuda
-            - --ddp_backend
-            - "ccl"
-            - --use_ipex
-            - --bf16 # Specify --bf16 if your hardware supports bfloat16
+          command: ["/bin/bash", "-c"]
+          args:
+            - >-
+              cd /workspace/transformers;
+              pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt;
+              source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh;
+              torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \
+                --model_name_or_path distilbert/distilbert-base-uncased \
+                --dataset_name squad \
+                --do_train \
+                --do_eval \
+                --per_device_train_batch_size 12 \
+                --learning_rate 3e-5 \
+                --num_train_epochs 2 \
+                --max_seq_length 384 \
+                --doc_stride 128 \
+                --output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \
+                --no_cuda \
+                --ddp_backend ccl \
+                --bf16 \
+                --use_ipex;
           env:
           - name: LD_PRELOAD
             value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"
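The new `command`/`args` pair installs the example's requirements and sources the oneCCL environment script at container start before launching `torchrun`. A hedged way to smoke-test that sequence locally before deploying, reusing the doc's `<image name>:<tag>` placeholder (the `--help` invocation is just an illustrative check, not from the commit):

```bash
# Verify the image can run the entrypoint steps outside the cluster.
# Requires network access for the pip install, as the pod does too.
docker run --rm <image name>:<tag> bash -c \
  'cd /workspace/transformers && \
   pip install -r examples/pytorch/question-answering/requirements.txt && \
   python examples/pytorch/question-answering/run_qa.py --help'
```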
@@ -244,13 +245,13 @@ spec:
           - name: CCL_WORKER_COUNT
             value: "1"
           - name: OMP_NUM_THREADS # Can be tuned for optimal performance
-            value: "56"
+            value: "240"
           resources:
             limits:
-              cpu: 200 # Update the CPU and memory limit values based on your nodes
+              cpu: 240 # Update the CPU and memory limit values based on your nodes
               memory: 128Gi
             requests:
-              cpu: 200 # Update the CPU and memory request values based on your nodes
+              cpu: 240 # Update the CPU and memory request values based on your nodes
               memory: 128Gi
           volumeMounts:
           - name: pvc-volume
@@ -258,8 +259,8 @@ spec:
           - mountPath: /dev/shm
             name: dshm
         restartPolicy: Never
-        nodeSelector: # Optionally use the node selector to specify what types of nodes to use for the workers
-          node-type: spr
+        nodeSelector: # Optionally use nodeSelector to match a certain node label for the worker pods
+          node-type: gnr
         volumes:
         - name: pvc-volume
           persistentVolumeClaim:
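`OMP_NUM_THREADS` and the CPU/memory requests and limits are meant to track what each worker node can actually allocate. One way to inspect that before editing the spec (standard kubectl, nothing specific to this commit):

```bash
# List allocatable CPU and memory per node to size OMP_NUM_THREADS
# and the resource requests/limits in the PyTorchJob spec.
kubectl get nodes -o custom-columns=NAME:.metadata.name,CPU:.status.allocatable.cpu,MEMORY:.status.allocatable.memory
```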
@@ -287,10 +288,12 @@ set the same CPU and memory amounts for both the resource limits and requests.
 After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed
 to the cluster using:
 ```bash
-kubectl create -f pytorchjob.yaml
+export NAMESPACE=<specify your namespace>
+
+kubectl create -f pytorchjob.yaml -n ${NAMESPACE}
 ```
 
-The `kubectl get pods -n kubeflow` command can then be used to list the pods in the `kubeflow` namespace. You should see
+The `kubectl get pods -n ${NAMESPACE}` command can then be used to list the pods in your namespace. You should see
 the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as
 the containers get pulled and created, then the status should change to "Running".
 ```
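Besides listing pods, the training operator's custom resource reports overall job state. A sketch, assuming the Kubeflow training operator is installed (which this example already requires):

```bash
# Overall PyTorchJob state (Created/Running/Succeeded/Failed)
kubectl get pytorchjobs -n ${NAMESPACE}

# Watch the worker pods transition from Pending to Running
kubectl get pods -n ${NAMESPACE} -w
```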
@@ -303,13 +306,13 @@ transformers-pytorchjob-worker-3 1/1 Running
 ...
 ```
 
-The logs for worker can be viewed using `kubectl logs -n kubeflow <pod name>`. Add `-f` to stream the logs, for example:
+The logs for a worker can be viewed using `kubectl logs <pod name> -n ${NAMESPACE}`. Add `-f` to stream the logs, for example:
 ```bash
-kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f
+kubectl logs transformers-pytorchjob-worker-0 -n ${NAMESPACE} -f
 ```
 
 After the training job completes, the trained model can be copied from the PVC or storage location. When you are done
-with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml`.
+with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}`.
 
 ## Summary
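Copying the trained model off the PVC is mentioned but not shown. A hedged sketch using a throwaway pod that mounts the same claim; the claim name `transformers-pvc`, the busybox helper, and the local destination path are assumptions, not part of this commit:

```bash
# Hypothetical helper pod mounting the job's PVC so the timestamped
# output_* directories can be copied out after the workers complete.
kubectl run pvc-reader -n ${NAMESPACE} --image=busybox --restart=Never \
  --overrides='{"apiVersion":"v1","spec":{"containers":[{"name":"pvc-reader","image":"busybox","command":["sleep","3600"],"volumeMounts":[{"name":"pvc-volume","mountPath":"/tmp/pvc-mount"}]}],"volumes":[{"name":"pvc-volume","persistentVolumeClaim":{"claimName":"transformers-pvc"}}]}}'

# Copy the training output locally, then remove the helper pod.
kubectl cp ${NAMESPACE}/pvc-reader:/tmp/pvc-mount ./trained-output
kubectl delete pod pvc-reader -n ${NAMESPACE}
```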