diff --git a/.github/workflows/docker_build.yml b/.github/workflows/docker_build.yml
index 63edc37..fd13a53 100644
--- a/.github/workflows/docker_build.yml
+++ b/.github/workflows/docker_build.yml
@@ -2,16 +2,9 @@ name: build
 on:
   push:
-    # below: trying to avoid builds for changes that don't affect the environment definition, since they are
-    # long-running & my GH Actions usage is limited. The repo is installed in edit mode in the container definition, but
-    # this means the experiment needs to clone the appropriate revision into place before starting, or it risks running
-    # stale code.
-    paths:
-      - 'k8s/Dockerfile'
-      - 'setup.cfg'
-      - 'pyproject.toml'
-      - '.github/workflows/docker_build.yml'
-
+    # Don't run if we didn't change any code or environment definitions
+    paths-ignore:
+      - '**/*.md'
 
 jobs:
   docker:
     runs-on: ubuntu-latest
diff --git a/k8s/README.md b/k8s/README.md
new file mode 100644
index 0000000..b27084d
--- /dev/null
+++ b/k8s/README.md
@@ -0,0 +1,36 @@
+# Kubernetes Examples
+
+In [the PEFT/StarCoder example](../src/hf_libraries_demo/experiments/peft), I (try to) optimize performance for use
+with a single A100 GPU. For me to access an A100, I have to do so via the
+[Nautilus cluster](https://portal.nrp-nautilus.io/).
+
+Here are my Kubernetes & Docker setups for doing this; some parts may be useful for your own setup.
+
+## Bare minimum k8s job
+
+Since some examples use containers, I assembled an extremely simple job, defined in [`demo_job.yml`](./demo_job.yml),
+that runs this repo packaged as a Docker container. It executes a function defined in
+[the packaging demo](../src/hf_libraries_demo/package_demo) to print a formatted string.
+
+1. The [Dockerfile](./Dockerfile) defines the image.
+2. A [Github Action](../.github/workflows/docker_build.yml) builds the image and pushes it to the DockerHub and
+Nautilus GitLab container registries.
+3. The [demo job](demo_job.yml) defines a tiny Kubernetes job which uses the image, including the command to execute.
+
+Execute the job via `kubectl create -f k8s/demo_job.yml`. This may require adaptation for your
+k8s environment.
+
+## Using an A100 GPU
+
+In [peft/starcoder_base_example.yml](./peft/starcoder_base_example.yml), I create a job which
+can be used to run the base PEFT example I created (with TFLOPs calculation) at
+[src/hf_libraries_demo/experiments/peft/base_with_tflops.py](../src/hf_libraries_demo/experiments/peft/base_with_tflops.py).
+This includes a few new additions:
+- [Specifying an affinity for nodes with A100s](./peft/starcoder_base_example.yml#L14)
+- [Pulling the W&B API key from a secret](./peft/starcoder_base_example.yml#L27)
+  - Creation of the secret is not shown (a minimal sketch follows this diff), but there is [more info here](https://kubernetes.io/docs/concepts/configuration/secret/). I use [Lens](https://k8slens.dev/) to make some of this easier, though it is not particularly lightweight
+- Increasing our CPU/memory requests and specifying that we need 1 GPU (the affinity handles the type)
+- Adjusting the executed command to log in to Hugging Face and set its cache directories to a path on a mounted volume. This allows re-use
+of downloaded weights and datasets on subsequent job runs.
+- Mounting the volume mentioned above (`volumeMounts`)
+- [A toleration](./peft/starcoder_base_example.yml#L79) for the `nvidia.com/gpu` taint carried by GPU nodes, so the scheduler will place the job on them
\ No newline at end of file
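A note on the secret referenced above, since its creation isn't shown in the repo: below is a minimal sketch of what a compatible `Secret` might look like. The name matches the `secretRef` used in `starcoder_base_example.yml`; the key names are assumptions: `WANDB_API_KEY` is the variable `wandb` reads at runtime, and `HF_API_TOKEN` is the variable the job's script expands.

```yaml
# Hypothetical manifest for the secret referenced by envFrom/secretRef in the A100 job.
# Key names are assumptions: with envFrom, every key below becomes an environment variable
# in the container, so they must match what the job expects (${HF_API_TOKEN} in its args,
# WANDB_API_KEY for wandb itself).
apiVersion: v1
kind: Secret
metadata:
  name: bking2-wandb-api-key-71a5
type: Opaque
stringData:
  WANDB_API_KEY: "<your W&B API key>"
  HF_API_TOKEN: "<your Hugging Face token>"
```

Applied with `kubectl apply -f <file>`, after which the job's `envFrom` makes both keys visible to its script.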
diff --git a/k8s/demo_job.yml b/k8s/demo_job.yml
new file mode 100644
index 0000000..28ef4cb
--- /dev/null
+++ b/k8s/demo_job.yml
@@ -0,0 +1,52 @@
+apiVersion: batch/v1
+kind: Job
+# This is used for naming the job and pod, and letting other cluster/namespace users know I created it
+metadata:
+  generateName: bking2--hf-libraries-demo-
+  labels:
+    user: bking2
+    k8s-app: bking2-hf-libraries-demo
+spec:
+  template:
+    spec:
+      # Here is where we define the core parts of the job. We need 1) the Docker image, 2) its environment
+      # requirements (CPU/Memory/GPU), and 3) the command that gets run
+      containers:
+        - name: bking2-hf-libraries-demo
+          image: kingb12/hf_libraries_demo:latest
+          resources:
+            limits:
+              memory: 1Gi
+              cpu: 1
+            requests:
+              memory: 1Gi
+              cpu: 1
+          command: [ "/bin/sh" ]
+          # everything after 'job ready to start' is the script we want to run. Using
+          # conda run --no-capture-output -p ./venv runs things with the correct conda environment
+          args:
+            - -c
+            - >-
+              cd /home/bking2/hf_libraries_demo &&
+              echo "job ready to start" &&
+              echo "import hf_libraries_demo.package_demo.addition_module as mymod\nprint(f'4 + 5 is {mymod.add_five_to_x(4)}')" > demo.py &&
+              conda run --no-capture-output -p ./venv python demo.py &&
+              echo "job complete!"
+      # some arguments needed by kubernetes, plus some useful defaults
+      restartPolicy: Never
+      schedulerName: default-scheduler
+      securityContext: { }
+      serviceAccount: default
+      serviceAccountName: default
+      terminationGracePeriodSeconds: 30
+      # these tolerations define how long the pod may stay bound to a node that becomes not-ready or
+      # unreachable. Other tolerations allow scheduling onto tainted nodes, e.g. GPU nodes (see the A100 example)
+      tolerations:
+        - effect: NoExecute
+          key: node.kubernetes.io/not-ready
+          operator: Exists
+          tolerationSeconds: 300
+        - effect: NoExecute
+          key: node.kubernetes.io/unreachable
+          operator: Exists
+          tolerationSeconds: 300
\ No newline at end of file
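One default worth knowing about when adapting this job: with `restartPolicy: Never` the pod itself is never restarted, but the Job controller still creates replacement pods until `backoffLimit` (default 6) is reached. The A100 example later in this diff sets `backoffLimit: 0` to fail fast; here is a sketch of that plus two related optional fields, with illustrative values that are not from the repo.

```yaml
# Hypothetical variant of demo_job.yml showing Job-level retry/cleanup controls.
# The container section is stubbed; values here are examples, not from the repo.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: bking2--hf-libraries-demo-
spec:
  backoffLimit: 0                 # don't create replacement pods after a failure (default is 6)
  activeDeadlineSeconds: 3600     # kill the job if it runs longer than one hour
  ttlSecondsAfterFinished: 86400  # garbage-collect the finished Job object after a day
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: bking2-hf-libraries-demo
          image: kingb12/hf_libraries_demo:latest
          command: [ "/bin/sh", "-c", "echo 'hello from the demo image'" ]
```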
diff --git a/k8s/peft/README.md b/k8s/peft/README.md
deleted file mode 100644
index ed4ad76..0000000
--- a/k8s/peft/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Using an A100 on Nautilus with Kubernetes
-
-In [the PEFT/StarCoder example](../../src/hf_libraries_demo/experiments/peft), I (try to) optimize performance for use
-with a single A100 GPU. For me to access an A100, I have to do so via the
-[Nautilus cluster](https://portal.nrp-nautilus.io/).
-
-Here is my kubernetes & docker setups for doing this, some parts may be useful for your own or not.
-
-1.
-2. Docker Build Github Action: automatically creates and pushes a
\ No newline at end of file
diff --git a/k8s/peft/starcoder_base_example.yml b/k8s/peft/starcoder_base_example.yml
new file mode 100644
index 0000000..991e34e
--- /dev/null
+++ b/k8s/peft/starcoder_base_example.yml
@@ -0,0 +1,90 @@
+apiVersion: batch/v1
+kind: Job
+# This is used for naming the job and pod, and letting other cluster/namespace users know I created it
+metadata:
+  generateName: bking2--hf-libraries-demo-
+  labels:
+    user: bking2
+    k8s-app: bking2-hf-libraries-demo
+spec:
+  template:
+    spec:
+      # Here we additionally specify that our pod (created by the job) must be scheduled onto a node with an A100
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: nvidia.com/gpu.product
+                    operator: In
+                    values:
+                      - NVIDIA-A100-SXM4-80GB
+      # Here is where we define the core parts of the job. We need 1) the Docker image, 2) its environment
+      # requirements (CPU/Memory/GPU), and 3) the command that gets run
+      containers:
+        - name: bking2-hf-libraries-demo
+          image: kingb12/hf_libraries_demo:latest
+          # Here I've added a secret with my Weights & Biases API key, so the job
+          # can log to W&B
+          envFrom:
+            - secretRef:
+                name: bking2-wandb-api-key-71a5
+          resources:
+            limits:
+              memory: 64Gi
+              cpu: 32
+              nvidia.com/gpu: "1"
+            requests:
+              memory: 32Gi
+              cpu: 16
+              nvidia.com/gpu: "1"
+          command: [ "/bin/sh" ]
+          # This includes further setup to 1) cache transformers and datasets on my volume so weights don't need to be
+          # re-downloaded on each run and 2) log in to Hugging Face, since StarCoder is gated behind a license agreement.
+          # everything after 'job ready to start' is the script we want to run. Using
+          # conda run --no-capture-output -p ./venv runs things with the correct conda environment
+          args:
+            - -c
+            - >-
+              cd /home/bking2/hf_libraries_demo &&
+              export TRANSFORMERS_CACHE=/data/users/bking2/.cache/huggingface &&
+              export HF_HOME=/data/users/bking2/.cache/huggingface &&
+              pip install huggingface_hub &&
+              python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('${HF_API_TOKEN}')" &&
+              echo "job ready to start" &&
+              echo "import hf_libraries_demo.package_demo.addition_module as mymod\nprint(f'4 + 5 is {mymod.add_five_to_x(4)}')" > demo.py &&
+              conda run --no-capture-output -p ./venv python src/hf_libraries_demo/experiments/peft/base_with_tflops.py &&
+              echo "job complete!"
+          # some arguments needed by kubernetes, plus some useful defaults
+          volumeMounts:
+            - mountPath: /data/users/bking2
+              name: bking2-data-volume
+      restartPolicy: Never
+      schedulerName: default-scheduler
+      securityContext: {}
+      serviceAccount: default
+      serviceAccountName: default
+      terminationGracePeriodSeconds: 30
+      # these tolerations define how long the pod may stay bound to a node that becomes not-ready or
+      # unreachable. Other tolerations allow scheduling onto tainted nodes, as with the GPU toleration below
+      tolerations:
+        - effect: NoExecute
+          key: node.kubernetes.io/not-ready
+          operator: Exists
+          tolerationSeconds: 300
+        - effect: NoExecute
+          key: node.kubernetes.io/unreachable
+          operator: Exists
+          tolerationSeconds: 300
+        # We also tolerate the nvidia.com/gpu taint that GPU nodes carry to repel non-GPU workloads
+        - effect: PreferNoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      # here we specify the data volume as well. So far, I just use this for caching transformer/dataset weights.
+      # See https://ucsd-prp.gitlab.io/userdocs/tutorial/storage/ for info on creating a data volume to mount like
+      # this (a prerequisite to mounting as in this job; not shown in this repo)
+      volumes:
+        - name: bking2-data-volume
+          persistentVolumeClaim:
+            claimName: bking2-data-volume
+  backoffLimit: 0
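The `volumes` section above assumes a PersistentVolumeClaim named `bking2-data-volume` already exists; the linked Nautilus storage docs cover creating one. Below is a minimal sketch, assuming a CephFS-backed storage class; the class name and size are placeholders, not values from the repo.

```yaml
# Hypothetical PVC backing the cache volume mounted at /data/users/bking2 in the A100 job.
# storageClassName and storage size are placeholders; consult the Nautilus storage docs.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: bking2-data-volume
spec:
  accessModes:
    - ReadWriteMany              # CephFS-style classes typically allow RWX; adjust for your class
  resources:
    requests:
      storage: 50Gi
  storageClassName: rook-cephfs  # assumption based on Nautilus docs; substitute your cluster's class
```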
diff --git a/setup.cfg b/setup.cfg
index d111d33..b6c9225 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,8 @@ install_requires=
     torch~=1.13.1
     transformers~=4.26.0
     datasets~=2.9.0
+    evaluate~=0.4.0
+    wandb~=0.15.4
     sklearn
 [options.package_data]
 # If any package or subpackage contains *.txt or *.sql files, include them: