From ea0e70aa92648be7176bb1fc72083bb5a0c09cbf Mon Sep 17 00:00:00 2001 From: aecelaya Date: Fri, 26 Apr 2024 16:03:32 -0500 Subject: [PATCH] Update docs for K8s --- docs/advanced_topics.md | 59 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/docs/advanced_topics.md b/docs/advanced_topics.md index b910757..51b1f7a 100644 --- a/docs/advanced_topics.md +++ b/docs/advanced_topics.md @@ -155,4 +155,61 @@ MIST computes a validation loss after each epoch on a fixed validation set, whic the training set. This validation step can add considerable time to training if you are dealing with a large dataset. To address this, we include the ```--val-percent``` (default: 0.1) and ```--val-sw-overlap``` (default: 0.25) flags to control the size (as a percent of the training set) of the validation set and the overlap between patches during validation, respectively. Both of these flags -take values that are between zero and one. \ No newline at end of file +take values that are between zero and one. + +### Kubernetes +For MD Anderson users, you can run MIST on the Kubernetes/HPC cluster. 
Here is an example of a job submission file; replace every `{{...}}` placeholder with your own value before submitting it: + +```yaml +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{your job name goes here}} + namespace: yn-gpu-workload + labels: + k8s-user: {{get this from your k8s-templates folder}} +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 600 + template: + spec: + nodeSelector: + "nvidia.com/gpu.present": "true" + "gpu-type": "A100" + # "gpu-type": "H100" + securityContext: + runAsUser: {{get this from your k8s-templates folder}} + runAsGroup: {{get this from your k8s-templates folder}} + fsGroup: {{get this from your k8s-templates folder}} + containers: + - name: main + image: mistmedical/mist:0.4.6a0 + command: ["/bin/bash", "-c"] + args: ["mist_run_all + --data $HOME/path/to/your/dataset.json + --numpy $HOME/path/to/your/numpy + --results $HOME/path/to/your/results"] + workingDir: {{get this from your k8s-templates folder}} + env: + - name: HOME + value: {{get this from your k8s-templates folder}} + volumeMounts: + - name: shm + mountPath: "/dev/shm" + - name: home + mountPath: {{get this from your k8s-templates folder}} + resources: + limits: + nvidia.com/gpu: "1" # change this to increase number of GPUs, max of 8 + imagePullPolicy: IfNotPresent + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: '21474836480' + - name: home + persistentVolumeClaim: + claimName: {{get this from your k8s-templates folder}} + restartPolicy: Never +``` \ No newline at end of file