Merge pull request #120 from EPCCed/gc_fixes
AG: update to job definitions to avoid name clashes and pod limits
agngrant authored Nov 30, 2023
2 parents 951b823 + 7cec387 commit 6e08e1f
Showing 4 changed files with 118 additions and 38 deletions.
66 changes: 35 additions & 31 deletions docs/services/graphcore/training/L1_getting_started.md
``` diff
@@ -20,61 +20,65 @@ To get started:
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: mnist-training
+  generateName: mnist-training-
 spec:
   # jobInstances defines the number of job instances.
   # More than 1 job instance is usually useful for inference jobs only.
   jobInstances: 1
   # ipusPerJobInstance refers to the number of IPUs required per job instance.
   # A separate IPU partition of this size will be created by the IPU Operator
   # for each job instance.
   ipusPerJobInstance: "1"
   workers:
     template:
       spec:
         containers:
         - name: mnist-training
           image: graphcore/pytorch:3.3.0
           command: [/bin/bash, -c, --]
           args:
           - |
             cd;
             mkdir build;
             cd build;
             git clone https://github.com/graphcore/examples.git;
             cd examples/tutorials/simple_applications/pytorch/mnist;
             python -m pip install -r requirements.txt;
             python mnist_poptorch_code_only.py --epochs 1
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
           securityContext:
             capabilities:
               add:
               - IPC_LOCK
           volumeMounts:
           - mountPath: /dev/shm
             name: devshm
         restartPolicy: Never
         hostIPC: true
         volumes:
         - emptyDir:
             medium: Memory
             sizeLimit: 10Gi
           name: devshm
```
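One of the two changes this commit makes to each example is the `resources.limits` block. Without explicit limits a worker pod can consume unbounded CPU and memory on its node (or be rejected outright where a namespace enforces resource limits), so the commit caps each container. The fragment below isolates just that piece; the commented `requests` lines are an illustrative assumption, not part of the commit:

``` yaml
# Fragment only: per-container resource caps as added by this commit.
resources:
  limits:            # hard caps: the container is throttled at 32 CPUs and
    cpu: 32          # OOM-killed if it exceeds 200 GiB of memory
    memory: 200Gi
  # requests:        # (assumption, not in the commit) what the scheduler
  #   cpu: 16        # would reserve for the pod up front
  #   memory: 100Gi
```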
1. To submit the job, run `kubectl create -f mnist-training-ipujob.yaml`. (Note the use of `kubectl create` rather than `kubectl apply`: `generateName` asks the server to generate a fresh name on every submission, which `kubectl apply` does not support.) This gives the following output:

``` bash
ipujob.graphcore.ai/mnist-training-<random string> created
```

1. To monitor the progress of the job, run `kubectl get pods`, which gives the following output:

``` bash
NAME                                      READY   STATUS      RESTARTS   AGE
mnist-training-<random string>-worker-0   0/1     Completed   0          2m56s
```

1. To read the result, run `kubectl logs mnist-training-<random string>-worker-0`, which gives output like the following:

``` bash
...
```

The status of the `IPUJob` itself can be checked with `kubectl get ipujobs`:

``` bash
NAME             STATUS      CURRENT   DESIRED   LASTMESSAGE          AGE
mnist-training   Completed   0         1         All instances done   10m
```

To delete the `IPUJob`, run `kubectl delete ipujobs <job-name>`, e.g. `kubectl delete ipujobs mnist-training-<random string>`. This also deletes the associated worker pod `mnist-training-<random string>-worker-0`.

Note: simply deleting the pod via `kubectl delete pods mnist-training-<random string>-worker-0` does not delete the `IPUJob`, which must be deleted separately.

Note: you can list all pods via `kubectl get all` or `kubectl get pods`, but neither shows the IPUJobs; list those with `kubectl get ipujobs`.
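The `name` → `generateName` change above is what avoids the name clashes mentioned in the commit message: with `generateName`, the API server appends a random suffix on every `kubectl create`, so resubmitting the same file never collides with an existing `IPUJob`. A minimal fragment (the example suffix is hypothetical):

``` yaml
# Fragment only: server-side name generation.
metadata:
  # Each `kubectl create` yields a fresh object, e.g. mnist-training-7k2xp;
  # a fixed `name:` would fail with AlreadyExists on the second submission.
  generateName: mnist-training-   # trailing hyphen acts as the separator
```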

14 changes: 11 additions & 3 deletions docs/services/graphcore/training/L2_multiple_IPU.md
``` diff
@@ -14,7 +14,7 @@ To get started, save and create an IPUJob with the following `.yaml` file:
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: bert-training-multi-ipu
+  generateName: bert-training-multi-ipu-
 spec:
   jobInstances: 1
   ipusPerJobInstance: "4"
@@ -37,6 +37,10 @@ spec:
             DEBIAN_FRONTEND=noninteractive TZ='Europe/London' apt install $(< required_apt_packages.txt) -y ;
             pip3 install -r requirements.txt ;
             python3 run_pretraining.py --dataset generated --config pretrain_base_128_pod4 --training-steps 1
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
           securityContext:
             capabilities:
               add:
@@ -53,7 +57,7 @@ spec:
             name: devshm
```
Running the above IPUJob and querying the log via `kubectl logs pod/bert-training-multi-ipu-<random string>-worker-0` should give:

``` bash
...
```
``` diff
@@ -162,7 +166,7 @@ In this case, [Poprun](https://docs.graphcore.ai/projects/poprun-user-guide/en/l
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: bert-poprun-64ipus
+  generateName: bert-poprun-64ipus-
 spec:
   jobInstances: 1
   modelReplicasPerWorker: "16"
@@ -196,6 +200,10 @@ spec:
             python3 run_pretraining.py \
               --config pretrain_large_128_POD64 \
               --dataset generated --training-steps 1
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
           securityContext:
             capabilities:
               add:
```
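How the scaling fields in the Poprun example relate is worth spelling out. The gloss below is an interpretation inferred from the field names and the IPU Operator comments in the first example on this page, not something this commit states, and the `ipusPerJobInstance` value is assumed from the job name:

``` yaml
# Fragment only: scaling knobs in an IPUJob spec (interpretation assumed).
spec:
  jobInstances: 1               # number of independent job instances
  ipusPerJobInstance: "64"      # IPU partition created per instance
                                # (value assumed from "bert-poprun-64ipus")
  modelReplicasPerWorker: "16"  # data-parallel model replicas per worker
```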
19 changes: 18 additions & 1 deletion docs/services/graphcore/training/L3_profiling.md
``` diff
@@ -18,7 +18,7 @@ Save and run `kubectl create -f <yaml-file>` on the following:
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: mnist-training-profiling
+  generateName: mnist-training-profiling-
 spec:
   jobInstances: 1
   ipusPerJobInstance: "1"
@@ -41,7 +41,24 @@ spec:
             python mnist_poptorch_code_only.py --epochs 1;
             echo 'RUNNING ls ./training';
             ls training
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: devshm
         restartPolicy: Never
+        hostIPC: true
+        volumes:
+        - emptyDir:
+            medium: Memory
+            sizeLimit: 10Gi
+          name: devshm
```
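The shared-memory plumbing the profiling example gains here (`hostIPC`, the `devshm` volume, and its mount) follows a common Kubernetes pattern: containers typically get only a 64 MiB `/dev/shm`, which can be too small for frameworks that pass large buffers between processes, so a memory-backed `emptyDir` is mounted over it. A sketch of just those parts:

``` yaml
# Fragment only: enlarging /dev/shm for the worker pod.
spec:
  hostIPC: true            # share the host IPC namespace (System V/POSIX IPC)
  volumes:
  - name: devshm
    emptyDir:
      medium: Memory       # tmpfs; usage counts against the pod's memory limit
      sizeLimit: 10Gi
# and inside the container spec:
#   volumeMounts:
#   - mountPath: /dev/shm  # replaces the small default /dev/shm
#     name: devshm
```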
After completion, `kubectl logs <pod-name>` shows the following result:
57 changes: 54 additions & 3 deletions docs/services/graphcore/training/L4_other_frameworks.md
``` diff
@@ -35,7 +35,7 @@ For a quick example, we will run an example script from <https://github.com/grap
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: tensorflow-example
+  generateName: tensorflow-example-
 spec:
   jobInstances: 1
   ipusPerJobInstance: "1"
@@ -58,7 +58,24 @@ spec:
             cd examples/tutorials/simple_applications/tensorflow2/mnist;
             python -m pip install -r requirements.txt;
             python mnist_code_only.py --epochs 1
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: devshm
         restartPolicy: Never
+        hostIPC: true
+        volumes:
+        - emptyDir:
+            medium: Memory
+            sizeLimit: 10Gi
+          name: devshm
```
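The `securityContext` addition is narrower than it looks: it grants the container the single Linux capability `IPC_LOCK`, which allows locking (pinning) memory beyond `RLIMIT_MEMLOCK` via `mlock`/`mlockall`. Pinned host buffers are typically needed by drivers that DMA to and from devices, and are assumed here to be required for IPU access. The fragment in isolation:

``` yaml
# Fragment only: allow the container to pin host memory.
securityContext:
  capabilities:
    add:
    - IPC_LOCK   # permits mlock/mlockall/SHM_LOCK beyond RLIMIT_MEMLOCK
```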
Running `kubectl logs <pod>` should show results similar to the following:
``` diff
@@ -97,7 +114,7 @@ For a quick example, we will run an example script from <https://github.com/grap
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: popart-example
+  generateName: popart-example-
 spec:
   jobInstances: 1
   ipusPerJobInstance: "1"
@@ -120,7 +137,24 @@ spec:
             python3 -m pip install -r requirements.txt;
             ./get_data.sh;
             python3 popart_mnist.py --epochs 1
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: devshm
         restartPolicy: Never
+        hostIPC: true
+        volumes:
+        - emptyDir:
+            medium: Memory
+            sizeLimit: 10Gi
+          name: devshm
```

Running `kubectl logs <pod>` should show results similar to the following:
``` diff
@@ -166,7 +200,7 @@ For a quick example, we will run an example from <https://github.com/graphcore/e
 apiVersion: graphcore.ai/v1alpha1
 kind: IPUJob
 metadata:
-  name: poplib-example
+  generateName: poplib-example-
 spec:
   jobInstances: 1
   ipusPerJobInstance: "1"
@@ -178,7 +212,24 @@ spec:
           image: graphcore/poplar:3.3.0
           command: ["bash"]
           args: ["-c", "cd && mkdir build && cd build && git clone https://github.com/graphcore/examples.git && cd examples/tutorials/simple_applications/poplar/mnist/ && ./get_data.sh && make && ./regression-demo -IPU 1 50"]
+          resources:
+            limits:
+              cpu: 32
+              memory: 200Gi
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: devshm
         restartPolicy: Never
+        hostIPC: true
+        volumes:
+        - emptyDir:
+            medium: Memory
+            sizeLimit: 10Gi
+          name: devshm
```

Running `kubectl logs <pod>` should show results similar to the following:
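Every file in this commit receives the same two edits (a `generateName` prefix and a `resources.limits` block) on top of the existing IPC and shared-memory settings. Pulling those pieces into one consolidated sketch — the names, image, and workload below are placeholders, not a definitive template:

``` yaml
apiVersion: graphcore.ai/v1alpha1
kind: IPUJob
metadata:
  generateName: my-example-          # placeholder prefix
spec:
  jobInstances: 1
  ipusPerJobInstance: "1"
  workers:
    template:
      spec:
        containers:
        - name: worker               # placeholder container name
          image: graphcore/poplar:3.3.0
          command: ["bash"]
          args: ["-c", "echo replace-with-workload"]   # placeholder workload
          resources:
            limits:                  # caps added by this commit
              cpu: 32
              memory: 200Gi
          securityContext:
            capabilities:
              add:
              - IPC_LOCK             # allow pinning host memory
          volumeMounts:
          - mountPath: /dev/shm      # enlarged shared memory
            name: devshm
        restartPolicy: Never
        hostIPC: true
        volumes:
        - emptyDir:
            medium: Memory
            sizeLimit: 10Gi
          name: devshm
```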
