diff --git a/examples/gke-storage-hyperdisk.yaml b/examples/gke-storage-hyperdisk.yaml new file mode 100644 index 0000000000..12c8063026 --- /dev/null +++ b/examples/gke-storage-hyperdisk.yaml @@ -0,0 +1,218 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: gke-storage-hyperdisk +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-storage-hyperdisk + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: <your-ip-address>/32 + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-hyperdisk + secondary_ranges: + gke-subnet-hyperdisk: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_persistent_disk_csi: true # enable Hyperdisk for the cluster + configure_workload_identity_sa: true + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + ### Set up storage class and persistent volume claim for Hyperdisk ### + - id: hyperdisk-balanced-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-balanced + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 100 + + - id: hyperdisk-throughput-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-throughput + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 5000 + + - id: hyperdisk-extreme-setup + source: modules/file-system/gke-storage + use: [gke_cluster] + settings: + storage_type: Hyperdisk-extreme + access_mode: ReadWriteOnce + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 1 + capacity_gb: 100 + + - id: sample-pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: sample-pool + zones: [$(vars.zone)] + machine_type: c3-standard-88 # Hyperdisk-extreme requires a C3 machine with 88 or more vCPUs + + # Train a TensorFlow model with 
Keras and Hyperdisk Balanced on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample + - id: hyperdisk-balanced-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - hyperdisk-balanced-setup + settings: + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 + command: + - bash + - -c + - | + pip install transformers datasets + python - < [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | | [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore | `string` | n/a | yes | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-balanced-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-extreme-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/hyperdisk-throughput-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: 
${storage_class_name} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl new file mode 100644 index 0000000000..46e1f023d3 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-balanced-sc.yaml.tftpl @@ -0,0 +1,25 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + type: hyperdisk-balanced + provisioned-throughput-on-create: "250Mi" + provisioned-iops-on-create: "7000" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl new file mode 100644 index 0000000000..445020d001 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-extreme-sc.yaml.tftpl @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + type: hyperdisk-extreme + provisioned-iops-on-create: "50000" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl 
b/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl new file mode 100644 index 0000000000..ec404aec45 --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/hyperdisk-throughput-sc.yaml.tftpl @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: pd.csi.storage.gke.io +allowVolumeExpansion: true +parameters: + type: hyperdisk-throughput + provisioned-throughput-on-create: "250Mi" +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 9ad3b839d8..9f6224bbac 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -34,12 +34,15 @@ variable "storage_type" { The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) to used. This module currently support dynamic provisioning for the below storage options - Parallelstore + - Hyperdisk-balanced + - Hyperdisk-throughput + - Hyperdisk-extreme EOT type = string nullable = false validation { - condition = var.storage_type == null ? false : contains(["parallelstore"], lower(var.storage_type)) - error_message = "Allowed string values for var.storage_type are \"Parallelstore\"." + condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." 
} } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-hyperdisk.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-hyperdisk.yml new file mode 100644 index 0000000000..fb114c402a --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-hyperdisk.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute the job + delegate_to: localhost + ansible.builtin.shell: | + jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*) + for job in "${jobs[@]}"; do + kubectl create -f "$job" + done + args: + executable: /bin/bash + changed_when: False + +- name: Wait for job to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=1 + register: job_completion + until: job_completion.stdout_lines | length > 3 # 3 jobs total + retries: 80 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-hyperdisk.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-hyperdisk.yaml new file 
mode 100644 index 0000000000..a85a398e50 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-storage-hyperdisk.yaml @@ -0,0 +1,59 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.gke-storage +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +## Test GKE +- id: gke-storage-hyperdisk + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=examples/gke-storage-hyperdisk.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: us-central1-a' >> $${SG_EXAMPLE} + + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + + IP=$(curl ifconfig.me) + sed -i "s|authorized_cidr: .*|authorized_cidr: $${IP}/32|" $${SG_EXAMPLE} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + 
--extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-hyperdisk.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-hyperdisk.yml b/tools/cloud-build/daily-tests/tests/gke-storage-hyperdisk.yml new file mode 100644 index 0000000000..805721b1a2 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-storage-hyperdisk.yml @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +test_name: gke-storage-hyperdisk +deployment_name: gke-storage-hyperdisk-{{ build }} +zone: us-central1-a # for remote node +region: us-central1 +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-storage-hyperdisk.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +post_deploy_tests: +- test-validation/test-gke-storage-hyperdisk.yml +custom_vars: + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}"