From 70ad46bee5a0ebcb41423430e798b7c6a9e25489 Mon Sep 17 00:00:00 2001 From: jooho lee Date: Thu, 14 Nov 2024 21:05:32 -0500 Subject: [PATCH 1/8] add ray tls feature Signed-off-by: jooho lee --- config/overlays/odh/kustomization.yaml | 3 +- config/overlays/odh/ray_tls_configmap.yaml | 80 +++++++++++++++++++ config/runtimes/vllm-multinode-template.yaml | 83 +++++++++++++++++++- 3 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 config/overlays/odh/ray_tls_configmap.yaml diff --git a/config/overlays/odh/kustomization.yaml b/config/overlays/odh/kustomization.yaml index faa23332..dc5217d1 100644 --- a/config/overlays/odh/kustomization.yaml +++ b/config/overlays/odh/kustomization.yaml @@ -2,9 +2,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ../../default + - ./ray_tls_configmap.yaml patches: - path: odh_model_controller_manager_patch.yaml configurations: - - params.yaml \ No newline at end of file + - params.yaml diff --git a/config/overlays/odh/ray_tls_configmap.yaml b/config/overlays/odh/ray_tls_configmap.yaml new file mode 100644 index 00000000..7db10f65 --- /dev/null +++ b/config/overlays/odh/ray_tls_configmap.yaml @@ -0,0 +1,80 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ca-tls +data: + # output from cat ca.crt | base64 + ca.crt: | + 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM3RENDQWRRQ0NRQ05Yck8zQTAwbWRqQU5CZ2txaGtpRzl3MEJBUXNGQURBNE1SRXdEd1lEVlFRRERBZ3EKTG5KaGVTNXBiekVMTUFrR0ExVUVCaE1DVlZNeEZqQVVCZ05WQkFjTURWTmhiaUJHY21GdVkybHpZMjh3SGhjTgpNak13TXpJM01EZ3dNVFF4V2hjTk16TXdNekkwTURnd01UUXhXakE0TVJFd0R3WURWUVFEREFncUxuSmhlUzVwCmJ6RUxNQWtHQTFVRUJoTUNWVk14RmpBVUJnTlZCQWNNRFZOaGJpQkdjbUZ1WTJselkyOHdnZ0VpTUEwR0NTcUcKU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLQW9JQkFRQ3ZJbGNGSmZxaFNidWowQ3ZpalA0c2xXN3I3Qk1kYVJOeAp5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUa0Exb3FxVHhGdTZMSm5LOGJHN012Cm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRc0tVRDNJN3U3QjF5bVpxTjQwWEgKWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2hHRzA5RmNtZlF5REFlM2VvTm1IWQpVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLM2tXbTZnUytwQTIvdkZIaU93RHNaClNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3RTRVSG4wN0pBZ01CQUFFd0RRWUoKS29aSWh2Y05BUUVMQlFBRGdnRUJBQWhSY3g2NzVJbjJVaERhMzArTkZ0UlNTcUJwK1E2WTl3VGNTL0NqM1J3MgpLSnkzUVhBU0xJUW1ESWdrVlBJeEY0V1VYUFdGdmxUL0taQ2JRejRvN2M3ck9DWEVEWnVhbExUSHRrTHVSZFNWClVHSTVSWTJXNUx6UXM2MnNtUG13OWVQYnNLek5kOEpjWkwvNndHZnNsZVQyY1RLTjliZVE2ZWdiQmdEcy91d0sKeVdOREtnaE4vaE16YmRSaFh2SFNiTW8rUkgvRG1Va1VhTXZZc3NNbzFYQkwzRXZwbmpnZXI1ZWQ5ZDVjQWYvUQpuU0VCMk13Z08rWHEwKy9sWmpiUFNWOVdWQnY1YjZlc1ZPcnZrV2o2TUFKcjUwb3BwT09KUy9TbTNEU3F5aDRBClR5c1BOblQxYStxWDRVZXljZ05VbXRoOXdONFBnc3B6ZEpORWtVdTVSSmM9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + # output from cat ca.key | base64 + ca.key: | + 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktnd2dnU2tBZ0VBQW9JQkFRQ3ZJbGNGSmZxaFNidWoKMEN2aWpQNHNsVzdyN0JNZGFSTnh5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUawpBMW9xcVR4RnU2TEpuSzhiRzdNdm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRCnNLVUQzSTd1N0IxeW1acU40MFhIWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2gKR0cwOUZjbWZReURBZTNlb05tSFlVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLMwprV202Z1MrcEEyL3ZGSGlPd0RzWlNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3CkU0VUhuMDdKQWdNQkFBRUNnZ0VCQUpYbG9XK2hveE83UlNRZmdBQkhSeUdud1NtaWhIWE93cnJKRWFqOXkyVncKRzBOTC9ka3Vld1ZpUGxwR3Z0c0hhMlVkTitkYXpUem1aMEkxY0U1RlRYWXQ5RlgxaXBaOExmRGV4cEFJOXNSVQo0bS9Ld3dRckZVdnZvWGE0YWtOMHBxQm1Kd2xNWHVPRmdOZEJLZXZWTW0xaW9JMisxTjhPb0dIVjlvdGFydks5ClUzY09CbmVBSjZmamF6ODd4RG1NY0dBcG82ZWdMOG0xaWJ1NUNwcFo2L2J2YVZYbHhFdXRtUjZYR2VKczdBRzMKVEtFYVhzTU1qdFdaM3ZXUDArMFJIMGpzRVI2a0ZMeEI3KzRHRWdPSk1WblZqbjlzT3FhVW41KzJ1REkrdkFkbAo0K2Fya3dwQnpzbGlaUVJLVW95aGwvMTRRZW9pcXpwVk9oVVpheFJOTnpVQ2dZRUE0RC83TmRudGFxa0JSdEdiClZUQTE0clA3Vy90THZSQnpBNWtLc3Q4V3crWFVGeTcvVEY2NkxpMVVtKzhhK2ttY1pMTm9mamNZc1pTMExkVXMKMlR4dk1IRWplcmdNUm5oWmUrOGJBZlZkd1RTcCtpdUpMKzRZWW1JRUZWUWlsSXhtRURzMkZQSnVZMHRDZW9ETAprVEFSeUNtMENPYUR1VXdLZjlMY3h3SFR4M3NDZ1lFQXgrNGpyOHV3aXh3WmwwUFBJb1Z1by9wSHZMT0ZxNXNBCmIrVEZnMEhFTVdIK1JKclhLRjA1YTRGNS9zc3pLZ09ZMGFZVUxlWnp3V1dJZElId0pzQnhGWktOdHRYTkhRbS8KOEFlVGRENnZ1OXlmN0tFZjhRNnFmaDRPRExvVDg0UTFWbGs4ek5ZN0FNUWZwN2p5RnpFOStvSm9tdlM0Snc1SApCZUNLZGZGR1RZc0NnWUVBaitkL0JhZTd1MTZJK3pFM1JRdVRDTkFHMVpnRm1tWWI2SXNsV25QZTRBZDBld3dsCnVKUnhWWUN4Y3YrVmlGZ0VqSHEwNjRuZnh0VnVhcHNLRkwyN2ZKS2QrZnB4cGlkRkJVc0RRZFo3TzZqWUN6bzAKNXhVYmdNYjFaOXA5OW1YQ2VWZ0Y5SnMrUzJuWVYxU2ZUYVJUUk9lK0tKZ0VuN3cwWUtLb0d1MEpRbEVDZ1lCZApZdXJnYm5Ca1NoZmFCQjU0cllMa3JUOWM4UzM2M2tmeC9CWVdIVjRiQXY3VjVNMmpXUWc5SXhsczNsVmp4cEpYCk94QXA4SDhaVXVmT0kvT2M1ajdzS0t4eFBxUzBiNTFyN04zL2FsaURrNlpQeldNeUlmdVpOVWl5d1NnWWt5U20KMU1BRm5mdXBlL0tkVVZJamF5amNIcFhsNjNFcExRNFh2SzV3TU9iNXlR
S0JnR0kzSTAwSTlnbURzS1JrOFkxdQpId1l0dVdrNjFvWEhUTHorR3d6RUNCQ0VnNkZxMjZVeDZmVzBySlVwV3pOVURCNkRRRGxCTGx3S1M4Z1R3eGtGCkRkY3VrbzFHekdlQWYvazEwWktTZmFXNVcwVlloVGVjSDhyZXpReWxwUk5YT3ZNZkFwWUplcnhBZ09yK3hFajUKK2wwalU0MDBTMUx0cWhLVzZMK3kxRVd5Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tls +data: + gencert_ray.sh: | + #!/bin/sh + ## Create tls.key + openssl genrsa -out /etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl + + ## Generate tls.cert + openssl x509 -req \ + -in /etc/ray/tls/ca.csr \ + -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ + -days 36500 \ + -sha256 -extfile /etc/ray/tls/cert.conf + diff --git a/config/runtimes/vllm-multinode-template.yaml b/config/runtimes/vllm-multinode-template.yaml index ff95bf88..0be788e1 100644 --- a/config/runtimes/vllm-multinode-template.yaml +++ b/config/runtimes/vllm-multinode-template.yaml @@ -13,7 +13,7 @@ metadata: template.openshift.io/long-description: This template defines resources needed to deploy vLLM ServingRuntime Multi-Node with KServe in Red Hat OpenShift AI opendatahub.io/modelServingSupport: '["single"]' opendatahub.io/apiProtocol: "REST" - name: vllm-multinode-runtime-template + name: vllm-multinode-runtime-template-test objects: - apiVersion: serving.kserve.io/v1alpha1 kind: ServingRuntime @@ -38,9 +38,14 @@ objects: image: $(vllm-image) command: [ "bash", "-c" ] args: - - | + - | + # Generate self signed certificate + if [[ $RAY_USE_TLS == "1" ]]; then + /etc/gen/tls/gencert_ray.sh + fi ray start --head --disable-usage-stats --include-dashboard false - # wait for other node to join + + # Wait for other node to join until [[ $(ray status --address ${RAY_ADDRESS} | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do echo "Waiting..." 
sleep 1 @@ -52,6 +57,14 @@ objects: exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce env: + - name: RAY_USE_TLS + value: "1" + - name: RAY_TLS_SERVER_CERT + value: "/etc/ray/tls/tls.crt" + - name: RAY_TLS_SERVER_KEY + value: "/etc/ray/tls/tls.key" + - name: RAY_TLS_CA_CERT + value: "/etc/ca/tls/ca.crt" - name: RAY_PORT value: "6379" - name: RAY_ADDRESS @@ -60,6 +73,10 @@ objects: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP - name: VLLM_NO_USAGE_STATS value: "1" - name: HOME @@ -76,6 +93,13 @@ objects: volumeMounts: - name: shm mountPath: /dev/shm + - mountPath: /etc/ca/tls + name: ca-tls + readOnly: true + - mountPath: /etc/ray/tls + name: ray-tls + - mountPath: /etc/gen/tls + name: gen-tls-script livenessProbe: failureThreshold: 2 periodSeconds: 5 @@ -177,6 +201,20 @@ objects: emptyDir: medium: Memory sizeLimit: 12Gi + - name: ca-tls + secret: + secretName: ca-tls + - name: ray-tls + emptyDir: {} + # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional + - name: gen-tls-script + configMap: + name: tls + defaultMode: 0777 + # An array of keys from the ConfigMap to create as files + items: + - key: gencert_ray.sh + path: gencert_ray.sh workerSpec: pipelineParallelSize: 2 tensorParallelSize: 1 @@ -186,6 +224,10 @@ objects: command: [ "bash", "-c" ] args: - | + # Generate self signed certificate + if [[ $RAY_USE_TLS == "1" ]]; then + /etc/gen/tls/gencert_ray.sh + fi SECONDS=0 while true; do @@ -211,10 +253,22 @@ objects: echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..." 
ray start --address="${RAY_HEAD_ADDRESS}" --block env: + - name: RAY_USE_TLS + value: "1" + - name: RAY_TLS_SERVER_CERT + value: "/etc/ray/tls/tls.crt" + - name: RAY_TLS_SERVER_KEY + value: "/etc/ray/tls/tls.key" + - name: RAY_TLS_CA_CERT + value: "/etc/ca/tls/ca.crt" - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP - name: POD_NAMESPACE valueFrom: fieldRef: @@ -229,6 +283,13 @@ objects: volumeMounts: - name: shm mountPath: /dev/shm + - mountPath: /etc/ca/tls + name: ca-tls + readOnly: true + - mountPath: /etc/ray/tls + name: ray-tls + - mountPath: /etc/gen/tls + name: gen-tls-script livenessProbe: failureThreshold: 2 periodSeconds: 5 @@ -276,4 +337,18 @@ objects: - name: shm emptyDir: medium: Memory - sizeLimit: 12Gi + sizeLimit: 12Gi + - name: ray-tls + emptyDir: {} + - name: ca-tls + secret: + secretName: ca-tls + # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional + - name: gen-tls-script + configMap: + name: tls + defaultMode: 0777 + # An array of keys from the ConfigMap to create as files + items: + - key: gencert_ray.sh + path: gencert_ray.sh From bfeb9fbf256a5b84c26db1e18d1b2b32de0bb24b Mon Sep 17 00:00:00 2001 From: jooho lee Date: Sat, 23 Nov 2024 11:22:08 -0500 Subject: [PATCH 2/8] add copy logic for ray resource to targeNs Signed-off-by: jooho lee --- config/overlays/odh/kustomization.yaml | 2 +- config/overlays/odh/ray_tls_configmap.yaml | 80 ---- config/overlays/odh/ray_tls_resources.yaml | 83 ++++ config/runtimes/vllm-multinode-template.yaml | 96 ++--- controllers/constants/constants.go | 6 + controllers/kserve_ray_tls_controller.go | 427 +++++++++++++++++++ main.go | 8 + 7 files changed, 573 insertions(+), 129 deletions(-) delete mode 100644 config/overlays/odh/ray_tls_configmap.yaml create mode 100644 config/overlays/odh/ray_tls_resources.yaml create mode 100644 controllers/kserve_ray_tls_controller.go diff --git 
a/config/overlays/odh/kustomization.yaml b/config/overlays/odh/kustomization.yaml index dc5217d1..a7ee7169 100644 --- a/config/overlays/odh/kustomization.yaml +++ b/config/overlays/odh/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ../../default - - ./ray_tls_configmap.yaml + - ./ray_tls_resources.yaml patches: - path: odh_model_controller_manager_patch.yaml diff --git a/config/overlays/odh/ray_tls_configmap.yaml b/config/overlays/odh/ray_tls_configmap.yaml deleted file mode 100644 index 7db10f65..00000000 --- a/config/overlays/odh/ray_tls_configmap.yaml +++ /dev/null @@ -1,80 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: ca-tls -data: - # output from cat ca.crt | base64 - ca.crt: | - LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM3RENDQWRRQ0NRQ05Yck8zQTAwbWRqQU5CZ2txaGtpRzl3MEJBUXNGQURBNE1SRXdEd1lEVlFRRERBZ3EKTG5KaGVTNXBiekVMTUFrR0ExVUVCaE1DVlZNeEZqQVVCZ05WQkFjTURWTmhiaUJHY21GdVkybHpZMjh3SGhjTgpNak13TXpJM01EZ3dNVFF4V2hjTk16TXdNekkwTURnd01UUXhXakE0TVJFd0R3WURWUVFEREFncUxuSmhlUzVwCmJ6RUxNQWtHQTFVRUJoTUNWVk14RmpBVUJnTlZCQWNNRFZOaGJpQkdjbUZ1WTJselkyOHdnZ0VpTUEwR0NTcUcKU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLQW9JQkFRQ3ZJbGNGSmZxaFNidWowQ3ZpalA0c2xXN3I3Qk1kYVJOeAp5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUa0Exb3FxVHhGdTZMSm5LOGJHN012Cm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRc0tVRDNJN3U3QjF5bVpxTjQwWEgKWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2hHRzA5RmNtZlF5REFlM2VvTm1IWQpVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLM2tXbTZnUytwQTIvdkZIaU93RHNaClNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3RTRVSG4wN0pBZ01CQUFFd0RRWUoKS29aSWh2Y05BUUVMQlFBRGdnRUJBQWhSY3g2NzVJbjJVaERhMzArTkZ0UlNTcUJwK1E2WTl3VGNTL0NqM1J3MgpLSnkzUVhBU0xJUW1ESWdrVlBJeEY0V1VYUFdGdmxUL0taQ2JRejRvN2M3ck9DWEVEWnVhbExUSHRrTHVSZFNWClVHSTVSWTJXNUx6UXM2MnNtUG13OWVQYnNLek5kOEpjWkwvNndHZnNsZVQyY1RLTjliZVE2ZWdiQmdEcy91d0sKeVdOREtnaE4vaE16YmRSaFh2SFNiTW8rUkgvRG1Va1VhTXZZc3NNbzFYQkwzRXZwbm
pnZXI1ZWQ5ZDVjQWYvUQpuU0VCMk13Z08rWHEwKy9sWmpiUFNWOVdWQnY1YjZlc1ZPcnZrV2o2TUFKcjUwb3BwT09KUy9TbTNEU3F5aDRBClR5c1BOblQxYStxWDRVZXljZ05VbXRoOXdONFBnc3B6ZEpORWtVdTVSSmM9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K - # output from cat ca.key | base64 - ca.key: | - LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktnd2dnU2tBZ0VBQW9JQkFRQ3ZJbGNGSmZxaFNidWoKMEN2aWpQNHNsVzdyN0JNZGFSTnh5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUawpBMW9xcVR4RnU2TEpuSzhiRzdNdm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRCnNLVUQzSTd1N0IxeW1acU40MFhIWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2gKR0cwOUZjbWZReURBZTNlb05tSFlVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLMwprV202Z1MrcEEyL3ZGSGlPd0RzWlNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3CkU0VUhuMDdKQWdNQkFBRUNnZ0VCQUpYbG9XK2hveE83UlNRZmdBQkhSeUdud1NtaWhIWE93cnJKRWFqOXkyVncKRzBOTC9ka3Vld1ZpUGxwR3Z0c0hhMlVkTitkYXpUem1aMEkxY0U1RlRYWXQ5RlgxaXBaOExmRGV4cEFJOXNSVQo0bS9Ld3dRckZVdnZvWGE0YWtOMHBxQm1Kd2xNWHVPRmdOZEJLZXZWTW0xaW9JMisxTjhPb0dIVjlvdGFydks5ClUzY09CbmVBSjZmamF6ODd4RG1NY0dBcG82ZWdMOG0xaWJ1NUNwcFo2L2J2YVZYbHhFdXRtUjZYR2VKczdBRzMKVEtFYVhzTU1qdFdaM3ZXUDArMFJIMGpzRVI2a0ZMeEI3KzRHRWdPSk1WblZqbjlzT3FhVW41KzJ1REkrdkFkbAo0K2Fya3dwQnpzbGlaUVJLVW95aGwvMTRRZW9pcXpwVk9oVVpheFJOTnpVQ2dZRUE0RC83TmRudGFxa0JSdEdiClZUQTE0clA3Vy90THZSQnpBNWtLc3Q4V3crWFVGeTcvVEY2NkxpMVVtKzhhK2ttY1pMTm9mamNZc1pTMExkVXMKMlR4dk1IRWplcmdNUm5oWmUrOGJBZlZkd1RTcCtpdUpMKzRZWW1JRUZWUWlsSXhtRURzMkZQSnVZMHRDZW9ETAprVEFSeUNtMENPYUR1VXdLZjlMY3h3SFR4M3NDZ1lFQXgrNGpyOHV3aXh3WmwwUFBJb1Z1by9wSHZMT0ZxNXNBCmIrVEZnMEhFTVdIK1JKclhLRjA1YTRGNS9zc3pLZ09ZMGFZVUxlWnp3V1dJZElId0pzQnhGWktOdHRYTkhRbS8KOEFlVGRENnZ1OXlmN0tFZjhRNnFmaDRPRExvVDg0UTFWbGs4ek5ZN0FNUWZwN2p5RnpFOStvSm9tdlM0Snc1SApCZUNLZGZGR1RZc0NnWUVBaitkL0JhZTd1MTZJK3pFM1JRdVRDTkFHMVpnRm1tWWI2SXNsV25QZTRBZDBld3dsCnVKUnhWWUN4Y3YrVmlGZ0VqSHEwNjRuZnh0VnVhcHNLRkwyN2ZKS2QrZnB4cGlkRkJVc0RRZFo3TzZqWUN6bzAKNXhVYmdNYjFaOXA5OW1YQ2VWZ0Y5SnMrUzJuWVYxU2ZUYVJUUk9lK0tKZ0VuN3c
wWUtLb0d1MEpRbEVDZ1lCZApZdXJnYm5Ca1NoZmFCQjU0cllMa3JUOWM4UzM2M2tmeC9CWVdIVjRiQXY3VjVNMmpXUWc5SXhsczNsVmp4cEpYCk94QXA4SDhaVXVmT0kvT2M1ajdzS0t4eFBxUzBiNTFyN04zL2FsaURrNlpQeldNeUlmdVpOVWl5d1NnWWt5U20KMU1BRm5mdXBlL0tkVVZJamF5amNIcFhsNjNFcExRNFh2SzV3TU9iNXlRS0JnR0kzSTAwSTlnbURzS1JrOFkxdQpId1l0dVdrNjFvWEhUTHorR3d6RUNCQ0VnNkZxMjZVeDZmVzBySlVwV3pOVURCNkRRRGxCTGx3S1M4Z1R3eGtGCkRkY3VrbzFHekdlQWYvazEwWktTZmFXNVcwVlloVGVjSDhyZXpReWxwUk5YT3ZNZkFwWUplcnhBZ09yK3hFajUKK2wwalU0MDBTMUx0cWhLVzZMK3kxRVd5Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: tls -data: - gencert_ray.sh: | - #!/bin/sh - ## Create tls.key - openssl genrsa -out /etc/ray/tls/tls.key 2048 - - ## Write CSR Config - cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl - - ## Generate tls.cert - openssl x509 -req \ - -in /etc/ray/tls/ca.csr \ - -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ - -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ - -days 36500 \ - -sha256 -extfile /etc/ray/tls/cert.conf - diff --git a/config/overlays/odh/ray_tls_resources.yaml b/config/overlays/odh/ray_tls_resources.yaml new file mode 100644 index 00000000..02b6c261 --- /dev/null +++ b/config/overlays/odh/ray_tls_resources.yaml @@ -0,0 +1,83 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ray-ca-cert + labels: + opendatahub.io/managed: 'true' +data: + # output from cat ca.crt | base64 + ca.crt: | + 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pX
NmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + # output from cat ca.key | base64 + ca.key: | + LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUpRd0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQ1Mwd2dna3BBZ0VBQW9JQ0FRQy95SW1GbVpodUFqWS8KZjkrY3hBV3NqSnY0SE1ZU0J6L29FdkNqZEw2RjFvdnJ6Q2NGdzBIY0RnVGJSV1pHTGMyU0xLQXg3aHM3WU93MQplSzNyemVEU1BMU1RZT3FHMzVDbEtYZThjS09MQjdyTk1COCtYcVhaTHRQREZ1YkhWVFQwa05oNGxjVzQyVmlhClVHU0duOWdIU1pHWlQxcGs5R1NtVHJVQSs0OWlPSnQ3Z2NabkhQRXVZVytpV1dxREpYRkZNWGVOWmFaK3pwbTkKR2xlQUQzMjJQNkhQcVgzZ3JON3FaNm81V0kzaFFlTHhPY1dER0ZnRE0vaUE0Y1JPS2cza1lyQ3djV3UyUDZVMAphdXZYbWxMTXdmY3NRK3hnS3lOcFIvS3ovbE1ZeFdpTHZwQUZlWnArL1BjSHMvTFMrUmVBUG1VUGFYUFRFZWtLClozWFk4bVJNUDZXWGJ1OUZZKzJBT2RUcmlvOUt0YjdaZThEenFkRTdNcVQvR2Jwdkh4VWVKeVozRG9WeXpIc0oKV2pKUU1WV1VIVzlBTXJRT2pDTGJWbGlkUGY2US9YOHlSbkJiemY1L1lPbFlDZmV0MnAzWVN5OW0wUzdCQ1JaVQp6QTd0aDhwZ2VnUHFCWHFlM0hyS3ZTU0ExM1ZXb3hyS3RvZm5tVHVJS21KNEYzSHdHTW5pSVhQVitxb1RuZjFZCm1IK3NoWWZkNTBiVnkwRExnVU8ydVdOYkJFc0E4OEdHeWJTcC9ySktZZlJ5Z2RlMUJCRFVxVE9LMGVNS3N3ejgKS1NYMk1wTDVHOXNKZURRZ2FJM0lqbTZldEZud2d2a0t0WmVDZkZNMGZDYUdXbXdnZC9aVkthU1pEbytVS202dApOQ1lnTEZ3andXY1BQb0FicmFHd0ZDdUJTSTlIandJREFRQUJBb0lDQUFyYzkwaG9ud3VIWGI3ZmNtU0IxU3JZClZPWWt1WDl6aHQvRWxIb1E5cDNFSSswNWhWaFdCTmpMNjBvYXRuRlhtenk3emZtTWMyRTcyemlPam1OdmpvOGcKY1l4eDlMYmQycG5RWUlBWEJ0eDV5UUxJWUFaSUwySys3NjloRUlLYksvVzQxZG9wN05vekFMQm9MMW1FenlSZgpWS0hFU0ZDMHptS3hNOUpMYllYeWowMm9QbUhBY0NHdGJHdjFrZGZ4RkdjNldrZy80c0tnY05ld3NueUdTb0lICm8zd21ZSnkvSjUxTDF5QlhPL2J2Y1hobHNMd3djamNCQ0FNUUU0aE42UjJKUUwrdDBEWGt2SjBQcnZzRE9wa0kKakdzTlEzMWVPcEpERmdwL21zNlFNWnpObHhwdXNGQTVnNUNkaUpRMHNkSGpOdUtqTXhyeUxKR
k1HY0l0OExEQwpRVzF2akxLR0l1UWtraGwxOWU1S1N2SDdjUjJja0pDME5vTzhnekpudzd0dTRGaHJaK0xQeXF2R3VSYU55a2RmCi9BKzNEOUE2RW1PNWRldFU2RzJkK0l2TmprdG91Z05UalZIUklDbk9oL01zRmlFQXdycHltVVNISzhKTjVpSjIKUm1rNFljNWlXUjhOUWs0Wkh6aVFGSHJSWkh0TW9DcEkvR2ZGcnYyRVE0bFpOOG5tZHdDWDR4a3JObUJ2ZnlIdgpLWW0yMU5VWDc5U3lRbHd5VS9lNUR6eTA3Si9zcTdoVU8xN3hVcXNzTVpZTGZCSFF0VFU0VVAwbnBOZmtxUFM1CjdJRUtIVWwyRlZudXR0THoxc0ZVVHhJTS90aE9lczRtWElrOExiYzI0Yk45VWlteXplVEN1bE83a0hZSDhTVkEKZDJqZFBTZXhZSTdMeWFVNnFHRzVBb0lCQVFEbHlVQk5CaTRNekdxVnh5NjNCY1dyZC9rdVYrYTFLQ3MyYVhzagpLbVlMT0xrSkhUSjI0YU9EWkJBVGxEMllwclZEOUM1UThTeDdPQVdlQ3FqWHd1MndsOTlabXNIVkxiQTZMRUZ6CnBoYTNQVHhkaWFpMElwZVY2ZFpIQnQrdjVDVGsvSnpxSVpjc1J2REFnNTFHYzgxbERxTzFNbnVqbldBcGpSMmMKd05ZVXd6a3hicHVTc1ZHYzFBZ09tVHBHN1MrdWVVQ2FGU0NVaGkyVWVoblM5dkNrU3Y0QTZMRlpiaXhFeWp6aApycU9mN1d1TTVUWkFoTGM2RTJUQnVOeWJlWW9DblFMdHF3dnUxaFhDOGU4TGlQWFRlMVJ4U2x5dXA5RDhiWEZBCjVPVmFUZjAzcFFweURiOXNKeGhLN3FMbUgrSjlUeU5JamhTTUZQT2pKNEJFRTlPOUFvSUJBUURWcVhDMGdCVzUKYlNUWmUzc3l1QVltRi9hVDg1ZFh1NGFTMFBJR09MakE1M2h0RVdLUkJxd1JlU1prSFdtR05uNUIyOXVXTHg2UgpPZjFNOFJkY2NYSnlxMnp1TlBiWkpabllwS0x5N0FjeDBpc1RvMjdpUy9xRS85YndsNUo3QVU5UmZ2K2ZMK2RPCmxqUndRTGUvQ1dSVHVlTlNOSWpPUC96NWRra2J5Z1kvWHZHbmI0RUJheDY4K3J2a0NYbStGdFpXV3VoblM2Uy8KZHh3Ulo2VGRMd09RZTZQSzNzN3F4c2xWNmQ2dmwrSUpwa1VVZmRvWDNyWFlTeGx2cFlQYWJpWEpaVzdQWkZwRQpVQXc0VTFpSzVLMUt5d1ZjaHlhN2tQSlpRNUplS1pUT3lPL1d5ODZLak0vcUd3NUhDR2NOL2VMbDJKUUViUkwvClJiR0pGSmhUalpjN0FvSUJBQXlyNG0zYzcyRXBUSjloMG9PcFA5TksxR1RuMkFNWmFmaWdMSGd0K0Y2YURDb2kKZ0F2cU9YZ2ZabnVONnkrbDBjMGpoQUpXcWx0SkpaWW5oRlFSbmNYbE9oM1kyT09HbDNjOXhZWTVISHVTVnVmWgpsWUlKZms1NERLYnlEQmZJL3ZmWnJsV0M4TEV5WUVoZGVhak83ZjZxcGdCeC9qdHhqRUgrVkNtMndKZDRoSWpqClRwVHlUa3ZWclhRUW94UVNORlRzdnRGQVpRR0x2S3U1Wi84b092RDBhYmxuRzVDUThNUUNXd1VlK2tyeGJzTGcKU1BPWjNmakg1UUNCenppTHBUNnJwZU94VVFFa3NTS0U4T2V6NzhwdnZLSmF0VzIwTjJRVUxQQ2xMcmlpSUZxWApNVkpFeTgrTkFGdnhlTzR6eCt1ZEY1Y0Nyc05pekdTczR2ZmVHQWtDZ2dFQkFMdFRnbWdPd0gxQlR3U0t1Ym4vCkZBMEVCNEV5R2FlbTExY1RjSTY1M21ucXgyL0F4VTFucnlibXRCMGttR2Mra2JYR1FDRE5rUnc4M25NK
0VZQlEKU3NwMHQ5MmxmQ05vVHhsZFJ5eDZlZGhaYnNFYUVsYS96SllkQk9NTjBUU2RNbUMrV3ZuRGN5WTRsU014NnFmSQpZVGp6Q25ZQmIweDlWNXVUOUljenVnU0hocEdKTm03Njd3azdQODZ2N0JnWVI3V1FvS0FuOXZxVFFIMldCRHFVClJLakJiaHFvL0h0azdCS3lLRGFGa0gxclZMZWhtN3cvMitrVjl1Z25FcEpJN2tKRDkwSkh0c2liOGdyVU1CWWUKWmp6a0FRQmQwanl5MlhnZndVMWpZWDluTnJoNUdjM3BwVVNZa2d6L05mTlRmRUtPZnovZUxjQzM1dTdMcXIzZQpydzhDZ2dFQkFMT2tsTkJNRVBmM20yTXBjaVRjRmNKb08vZzBMUUpHaTJtWkN6S1g3eDJFS0N2N1ZvVWVtRkk0CjRkRFVmSlBJWlBFTUpkTHRSUy9qUDEyZWkxek9lWHIrVGlUTklpUUVoemRtL0RZWUdjd2hyb0xLNDZVTFJKY0YKYzdxZ2xNQ1Z1MW9DTmtDdTJvZ08renczRm9makJzK1pqcE1BS2kyOTZ2ZDk2YVlYNThYR0RKekdmdjhuZEF1dwpEUmU1ZE5oQU5iaHZqSlM1VXJwNnhoMVMycTNYOHorTlFGWW9CNDM1Q2NXNW50WWMzemIxYVdzY0NxMWJsUGJGCjc0QTFLTHJNNlpvU0ZlcUVWZzhvajhpWjlDaitiTTJXYm9BREIvRTROM0kyNmFDK1dDRWxtdTd3ZDdQaExQT2IKN3RrTXh2Zm10dDE5T2dYbTRKZm9SZWlkMTNYbHFoZz0KLS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQo= +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-tls-scripts + labels: + opendatahub.io/managed: 'true' +data: + gencert_ray.sh: | + #!/bin/sh + ## Create tls.key + openssl genrsa -out /etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl + + ## Generate tls.cert + openssl x509 -req \ + -in /etc/ray/tls/ca.csr \ + -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ + -days 36500 \ + -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/config/runtimes/vllm-multinode-template.yaml b/config/runtimes/vllm-multinode-template.yaml index 0be788e1..7d6033b4 100644 --- a/config/runtimes/vllm-multinode-template.yaml +++ b/config/runtimes/vllm-multinode-template.yaml @@ -2,8 +2,8 @@ apiVersion: template.openshift.io/v1 kind: Template metadata: labels: - opendatahub.io/dashboard: "true" - opendatahub.io/ootb: "true" + opendatahub.io/dashboard: 'true' + opendatahub.io/ootb: 'true' annotations: description: vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs 
openshift.io/display-name: vLLM ServingRuntime Multi Node for KServe @@ -12,8 +12,8 @@ metadata: template.openshift.io/documentation-url: https://github.com/opendatahub-io/vllm template.openshift.io/long-description: This template defines resources needed to deploy vLLM ServingRuntime Multi-Node with KServe in Red Hat OpenShift AI opendatahub.io/modelServingSupport: '["single"]' - opendatahub.io/apiProtocol: "REST" - name: vllm-multinode-runtime-template-test + opendatahub.io/apiProtocol: 'REST' + name: vllm-multinode-runtime-template objects: - apiVersion: serving.kserve.io/v1alpha1 kind: ServingRuntime @@ -23,11 +23,11 @@ objects: openshift.io/display-name: vLLM ServingRuntime for KServe opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' labels: - opendatahub.io/dashboard: "true" + opendatahub.io/dashboard: 'true' spec: annotations: - prometheus.io/port: "8080" - prometheus.io/path: "/metrics" + prometheus.io/port: '8080' + prometheus.io/path: '/metrics' multiModel: false supportedModelFormats: - autoSelect: true @@ -36,9 +36,9 @@ objects: containers: - name: kserve-container image: $(vllm-image) - command: [ "bash", "-c" ] + command: ['bash', '-c'] args: - - | + - | # Generate self signed certificate if [[ $RAY_USE_TLS == "1" ]]; then /etc/gen/tls/gencert_ray.sh @@ -54,19 +54,19 @@ objects: export SERVED_MODEL_NAME=${MODEL_NAME} export MODEL_NAME=${MODEL_DIR} - + exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce env: - name: RAY_USE_TLS - value: "1" + value: '1' - name: RAY_TLS_SERVER_CERT - value: "/etc/ray/tls/tls.crt" + value: '/etc/ray/tls/tls.crt' - name: RAY_TLS_SERVER_KEY - value: "/etc/ray/tls/tls.key" + value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: "/etc/ca/tls/ca.crt" + value: 
'/etc/ca/tls/ca.crt' - name: RAY_PORT - value: "6379" + value: '6379' - name: RAY_ADDRESS value: 127.0.0.1:6379 - name: POD_NAMESPACE @@ -76,25 +76,25 @@ objects: - name: POD_IP valueFrom: fieldRef: - fieldPath: status.podIP + fieldPath: status.podIP - name: VLLM_NO_USAGE_STATS - value: "1" + value: '1' - name: HOME value: /tmp - name: HF_HOME value: /tmp/hf_home resources: limits: - cpu: "16" + cpu: '16' memory: 48Gi requests: - cpu: "8" + cpu: '8' memory: 24Gi volumeMounts: - name: shm mountPath: /dev/shm - mountPath: /etc/ca/tls - name: ca-tls + name: ray-ca-cert readOnly: true - mountPath: /etc/ray/tls name: ray-tls @@ -157,7 +157,7 @@ objects: echo "Unhealthy - Used: ${used_gpu}, Reserved: ${reserved_gpu}" exit 1 fi - + # Check model health health_check=$(curl -o /dev/null -s -w "%{http_code}\n" http://localhost:8080/health) if [[ ${health_check} != 200 ]]; then @@ -182,7 +182,7 @@ objects: echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." exit 1 fi - + # Double check to make sure Model is ready to serve. 
for i in 1 2; do # Check model health @@ -201,27 +201,27 @@ objects: emptyDir: medium: Memory sizeLimit: 12Gi - - name: ca-tls + - name: ray-ca-cert secret: - secretName: ca-tls + secretName: ray-ca-cert - name: ray-tls - emptyDir: {} + emptyDir: {} # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: gen-tls-script configMap: - name: tls + name: ray-tls-scripts defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: - - key: gencert_ray.sh - path: gencert_ray.sh + - key: gencert_ray.sh + path: gencert_ray.sh workerSpec: pipelineParallelSize: 2 tensorParallelSize: 1 containers: - name: worker-container image: $(vllm-image) - command: [ "bash", "-c" ] + command: ['bash', '-c'] args: - | # Generate self signed certificate @@ -245,7 +245,7 @@ objects: echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready." echo "For troubleshooting, refer to the FAQ at https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/troubleshooting.html#kuberay-troubleshootin-guides" fi - + sleep 5 done @@ -254,13 +254,13 @@ objects: ray start --address="${RAY_HEAD_ADDRESS}" --block env: - name: RAY_USE_TLS - value: "1" + value: '1' - name: RAY_TLS_SERVER_CERT - value: "/etc/ray/tls/tls.crt" + value: '/etc/ray/tls/tls.crt' - name: RAY_TLS_SERVER_KEY - value: "/etc/ray/tls/tls.key" + value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: "/etc/ca/tls/ca.crt" + value: '/etc/ca/tls/ca.crt' - name: POD_NAME valueFrom: fieldRef: @@ -268,26 +268,26 @@ objects: - name: POD_IP valueFrom: fieldRef: - fieldPath: status.podIP + fieldPath: status.podIP - name: POD_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace resources: limits: - cpu: "16" + cpu: '16' memory: 48Gi requests: - cpu: "8" + cpu: '8' memory: 24Gi volumeMounts: - name: shm mountPath: /dev/shm - mountPath: /etc/ca/tls - name: ca-tls + name: ray-ca-cert readOnly: true - mountPath: /etc/ray/tls - name: 
ray-tls + name: ray-tls - mountPath: /etc/gen/tls name: gen-tls-script livenessProbe: @@ -305,7 +305,7 @@ objects: if [[ ${registered_node_count} -ne "${PIPELINE_PARALLEL_SIZE}" ]]; then echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." exit 1 - fi + fi startupProbe: failureThreshold: 40 periodSeconds: 30 @@ -322,7 +322,7 @@ objects: echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." exit 1 fi - + # Double check to make sure Model is ready to serve. for i in 1 2; do # Check model health @@ -337,18 +337,18 @@ objects: - name: shm emptyDir: medium: Memory - sizeLimit: 12Gi + sizeLimit: 12Gi - name: ray-tls - emptyDir: {} - - name: ca-tls + emptyDir: {} + - name: ray-ca-cert secret: - secretName: ca-tls + secretName: ray-ca-cert # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: gen-tls-script configMap: - name: tls + name: ray-tls-scripts defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: - - key: gencert_ray.sh - path: gencert_ray.sh + - key: gencert_ray.sh + path: gencert_ray.sh diff --git a/controllers/constants/constants.go b/controllers/constants/constants.go index 0c5d07b9..369ccd8a 100644 --- a/controllers/constants/constants.go +++ b/controllers/constants/constants.go @@ -79,3 +79,9 @@ const ( NimRuntimeTemplateName = "nvidia-nim-serving-template" NimPullSecretName = "nvidia-nim-image-pull" ) + +// Ray +const ( + RayCATlsSecretName = "ray-ca-cert" + RayTlsScriptConfigMapName = "ray-tls-scripts" +) diff --git a/controllers/kserve_ray_tls_controller.go b/controllers/kserve_ray_tls_controller.go new file mode 100644 index 00000000..44b1dfa1 --- /dev/null +++ b/controllers/kserve_ray_tls_controller.go @@ -0,0 +1,427 @@ +package controllers + +import ( + "context" + "os" + "reflect" + + 
"sigs.k8s.io/controller-runtime/pkg/handler" + + "github.com/go-logr/logr" + kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" + "github.com/opendatahub-io/odh-model-controller/controllers/constants" + "github.com/opendatahub-io/odh-model-controller/controllers/utils" + corev1 "k8s.io/api/core/v1" + apierrs "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// KServeRayTlsReconciler holds the controller configuration. +type KServeRayTlsReconciler struct { + client client.Client + log logr.Logger +} + +func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRayTlsReconciler { + return &KServeRayTlsReconciler{ + client: client, + log: log, + } +} + +// The reconcile logic works as follows: +// ServingRuntime: +// - On creation: The ray-tls-script ConfigMap and ray-ca-cert Secret are created in the respective namespace. +// - On deletion: The ray-tls-script ConfigMap and ray-ca-cert Secret are deleted only when all ServingRuntimes are deleted from the namespace. + +// ConfigMap: +// - When the original ConfigMap is updated in the control namespace (ctrl ns): The ray-tls-script ConfigMap is deleted and recreated in the namespace where multinode ServingRuntimes exist. +// - When the ConfigMap is deleted in the target namespace (target ns): The ray-tls-script ConfigMap is regenerated. + +// Secret: +// - When the original Secret is updated in the control namespace (ctrl ns): The ray-ca-cert Secret is deleted and recreated in the namespace where multinode ServingRuntimes exist. +// - When the Secret is deleted in the target namespace (target ns): The ray-ca-cert Secret is regenerated. 
+func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := r.log + controllerNs := os.Getenv("POD_NAMESPACE") + areAllServingRuntimesDeletedInNs := false + + var servingRuntimeList kservev1alpha1.ServingRuntimeList + listOptions := &client.ListOptions{ + Namespace: req.Namespace, + } + if err := r.client.List(ctx, &servingRuntimeList, listOptions); err == nil && len(servingRuntimeList.Items) == 0 { + areAllServingRuntimesDeletedInNs = true + } else if err != nil { + return ctrl.Result{}, err + } + + if req.Name == constants.RayTlsScriptConfigMapName { + if req.Namespace == controllerNs { + if !areAllServingRuntimesDeletedInNs { + log.Info("Original Ray TLS scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) + for _, sr := range servingRuntimeList.Items { + if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "ConfigMap"); err != nil { + return ctrl.Result{}, err + } + } + } + } + + if err := r.reconcileRayTlsScriptConfigMap(ctx, log, controllerNs, req.Namespace, areAllServingRuntimesDeletedInNs); err != nil { + return ctrl.Result{}, err + } + } else if req.Name == constants.RayCATlsSecretName { + if req.Namespace == controllerNs { + if !areAllServingRuntimesDeletedInNs { + log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) + for _, sr := range servingRuntimeList.Items { + if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "Secret"); err != nil { + return ctrl.Result{}, err + } + } + } + } + + if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, areAllServingRuntimesDeletedInNs); err != nil { + return ctrl.Result{}, err + } + } else { + sr := &kservev1alpha1.ServingRuntime{} + err := r.client.Get(ctx, req.NamespacedName, sr) + if err != nil && apierrs.IsNotFound(err) { + } else if err != nil { + return ctrl.Result{}, err + } + + // Determine if 
ServingRuntime matches specific conditions + // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec + // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) + isMultiNodeServingRuntime := sr != nil && sr.Name == "vllm-multinode-runtime" + isRemoved := areAllServingRuntimesDeletedInNs || !isMultiNodeServingRuntime + + // Log and reconcile Ray TLS scripts ConfigMap + err = r.reconcileRayTlsScriptConfigMap(ctx, log, controllerNs, req.Namespace, isRemoved) + if err != nil { + return ctrl.Result{}, err + } + + // Log and reconcile Ray CA Cert Secret + err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, isRemoved) + if err != nil { + return ctrl.Result{}, err + } + + } + + return ctrl.Result{}, nil + +} + +func checkRayTLSResource(objectName string) bool { + return objectName == constants.RayCATlsSecretName || objectName == constants.RayTlsScriptConfigMapName +} + +// reconcileRayTLSResource filters out ConfigMaps and Secrets that do not match the predefined constants: RayCATlsSecretName or RayTlsScriptConfigMapName. +// This ensures that only the relevant ConfigMaps and Secrets for Ray TLS configuration are captured and processed for the servingRuntime. 
+func reconcileRayTLSResource() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + if _, ok := e.Object.(*kservev1alpha1.ServingRuntime); ok { + return true + } + return false + }, + DeleteFunc: func(e event.DeleteEvent) bool { + if _, ok := e.Object.(*kservev1alpha1.ServingRuntime); ok { + return true + } + objectName := e.Object.GetName() + return checkRayTLSResource(objectName) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + if _, ok := e.ObjectNew.(*kservev1alpha1.ServingRuntime); ok { + return true + } + objectName := e.ObjectNew.GetName() + return checkRayTLSResource(objectName) + }, + } +} + +// SetupWithManager sets up the controller with the Manager. +func (r *KServeRayTlsReconciler) SetupWithManager(mgr ctrl.Manager) error { + builder := ctrl.NewControllerManagedBy(mgr). + For(&kservev1alpha1.ServingRuntime{}). + Owns(&corev1.ConfigMap{}). + Owns(&corev1.Secret{}). + Watches(&corev1.ConfigMap{}, &handler.EnqueueRequestForObject{}). + Watches(&corev1.Secret{}, &handler.EnqueueRequestForObject{}). 
+ WithEventFilter(reconcileRayTLSResource()) + + return builder.Complete(r) +} + +// reconcileRayTlsScriptConfigMap watch ray-tls-scripts configmap in the controller namespace +// and it will create/update/delete ray-tls-scripts configmap in the namespace where multi-node ServingRuntime created +func (r *KServeRayTlsReconciler) reconcileRayTlsScriptConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { + // When original configmap is updated, it does not need to reconcile + if ctrlNs == targetNs { + return nil + } + + log.Info("Reconciling Ray TLS scripts ConfigMap", "name", constants.RayTlsScriptConfigMapName, "namespace", targetNs) + srcConfigMap := &corev1.ConfigMap{} + err := r.client.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: ctrlNs}, srcConfigMap) + if err != nil { + return err + } + + // Create Desired resource + desiredConfigMapResource, err := r.createDesiredConfigMapResource(targetNs, srcConfigMap) + if err != nil { + return err + } + + // Get Existing resource + existingConfigMapResource := &corev1.ConfigMap{} + err = r.client.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: targetNs}, existingConfigMapResource) + if err != nil { + if apierrs.IsNotFound(err) { + existingConfigMapResource = nil + } else { + return err + } + } + + // Process Delta + if err = r.processDeltaConfigMap(ctx, log, desiredConfigMapResource, existingConfigMapResource, srRemoved); err != nil { + return err + } + + return nil +} + +func (r *KServeRayTlsReconciler) createDesiredConfigMapResource(destNs string, srcConfigmap *corev1.ConfigMap) (*corev1.ConfigMap, error) { + desiredConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: srcConfigmap.Name, + Namespace: destNs, + Labels: map[string]string{ + "opendatahub.io/managed": "true", + "app.kubernetes.io/name": "odh-model-controller", + "app.kubernetes.io/component": "kserve", + 
"app.kubernetes.io/part-of": "odh-model-serving", + "app.kubernetes.io/managed-by": "odh-model-controller", + }, + }, + Data: srcConfigmap.Data, + } + + return desiredConfigMap, nil +} + +func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log logr.Logger, desiredConfigMapResource *corev1.ConfigMap, existingConfigMapResource *corev1.ConfigMap, srRemoved bool) (err error) { + hasChanged := false + + if shouldAddRayConfigMap(existingConfigMapResource, srRemoved) { + hasChanged = true + log.V(1).Info("Delta found", "create", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) + if err = r.client.Create(ctx, desiredConfigMapResource); err != nil { + return err + } + } + + if isUpdatedRayConfigMap(desiredConfigMapResource, existingConfigMapResource) { + hasChanged = true + log.V(1).Info("Delta found", "update", existingConfigMapResource.GetName(), "namespace", existingConfigMapResource.Namespace) + rp := desiredConfigMapResource.DeepCopy() + rp.Labels = existingConfigMapResource.Labels + + if err = r.client.Update(ctx, rp); err != nil { + return err + } + } + + if shouldDeleteRayConfigMap(existingConfigMapResource, srRemoved) { + hasChanged = true + log.V(1).Info("Delta found", "remove", existingConfigMapResource.GetName(), "namespace", existingConfigMapResource.Namespace) + if err = r.client.Delete(ctx, existingConfigMapResource); err != nil { + return err + } + } + + if !hasChanged && !srRemoved { + log.V(1).Info("No delta found", "name", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) + return nil + } + + return nil +} + +func shouldAddRayConfigMap(existingConfigMap *corev1.ConfigMap, srRemoved bool) bool { + return !srRemoved && utils.IsNil(existingConfigMap) +} + +func isUpdatedRayConfigMap(desiredConfigMap *corev1.ConfigMap, existingConfigMap *corev1.ConfigMap) bool { + return utils.IsNotNil(existingConfigMap) && !reflect.DeepEqual(desiredConfigMap.Data, 
existingConfigMap.Data) +} + +func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, srRemoved bool) bool { + return utils.IsNotNil(existingConfigMap) && srRemoved +} + +// reconcileRayCACertSecret watch ray-ca-cert secret in the controller namespaces +// and it will create/update/delete ray-ca-cert secret in the namespace where multi-node ServingRuntime created +func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { + // When original secret is updated, it does not need to reconcile + if ctrlNs == targetNs { + return nil + } + log.Info("Reconciling Ray CA Cert Secret", "name", constants.RayCATlsSecretName, "namespace", targetNs) + srcSecret := &corev1.Secret{} + err := r.client.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: ctrlNs}, srcSecret) + if err != nil { + return err + } + + // Create Desired resource + desiredSecretResource, err := r.createDesiredSecretResource(targetNs, srcSecret) + if err != nil { + return err + } + + // Get Existing resource + existingSecretResource := &corev1.Secret{} + err = r.client.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: targetNs}, existingSecretResource) + if err != nil { + if apierrs.IsNotFound(err) { + existingSecretResource = nil + } else { + return err + } + } + + // Process Delta + if err = r.processDeltaSecret(ctx, log, desiredSecretResource, existingSecretResource, srRemoved); err != nil { + return err + } + return nil +} + +func (r *KServeRayTlsReconciler) createDesiredSecretResource(destNs string, sourceSecret *corev1.Secret) (*corev1.Secret, error) { + desiredSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: sourceSecret.Name, + Namespace: destNs, + Labels: map[string]string{ + "opendatahub.io/managed": "true", + "app.kubernetes.io/name": "odh-model-controller", + "app.kubernetes.io/component": "kserve", + 
"app.kubernetes.io/part-of": "odh-model-serving", + "app.kubernetes.io/managed-by": "odh-model-controller", + }, + }, + Data: sourceSecret.Data, + Type: sourceSecret.Type, + } + + return desiredSecret, nil +} + +func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log logr.Logger, desiredSecretResource *corev1.Secret, existingSecretResource *corev1.Secret, srRemoved bool) (err error) { + hasChanged := false + + if shouldAddRaySecret(existingSecretResource, srRemoved) { + hasChanged = true + log.V(1).Info("Delta found", "create", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) + if err = r.client.Create(ctx, desiredSecretResource); err != nil { + return err + } + } + + if isUpdatedRaySecret(desiredSecretResource, existingSecretResource) { + hasChanged = true + log.V(1).Info("Delta found", "update", existingSecretResource.GetName(), "namespace", existingSecretResource.Namespace) + rp := desiredSecretResource.DeepCopy() + rp.Labels = existingSecretResource.Labels + + if err = r.client.Update(ctx, rp); err != nil { + return err + } + } + + if shouldDeletedRaySecret(existingSecretResource, srRemoved) { + hasChanged = true + log.V(1).Info("Delta found", "remove", existingSecretResource.GetName(), "namespace", existingSecretResource.Namespace) + if err = r.client.Delete(ctx, existingSecretResource); err != nil { + return err + } + } + if !hasChanged && !srRemoved { + log.V(1).Info("No delta found", "name", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) + return nil + } + + return nil +} + +func shouldAddRaySecret(existingSecret *corev1.Secret, srRemoved bool) bool { + return !srRemoved && utils.IsNil(existingSecret) +} + +func isUpdatedRaySecret(desiredSecret *corev1.Secret, existingSecret *corev1.Secret) bool { + return utils.IsNotNil(existingSecret) && !reflect.DeepEqual(desiredSecret.Data, existingSecret.Data) +} + +func shouldDeletedRaySecret(existingSecret *corev1.Secret, srRemoved 
bool) bool { + return utils.IsNotNil(existingSecret) && srRemoved +} + +func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, log logr.Logger, targetNs string, kind string) error { + if kind == "ConfigMap" { + configmap := &corev1.ConfigMap{} + err := r.client.Get(ctx, types.NamespacedName{ + Name: constants.RayTlsScriptConfigMapName, + Namespace: targetNs, + }, configmap) + if err != nil { + if apierrs.IsNotFound(err) { + log.Info("ConfigMap not found, skipping", "name", constants.RayTlsScriptConfigMapName, "namespace", targetNs) + } + return err + } + + log.Info("Deleting ConfigMap", "name", constants.RayTlsScriptConfigMapName, "namespace", targetNs) + err = r.client.Delete(ctx, configmap) + if err != nil { + return err + } + } + + if kind == "Secret" { + secret := &corev1.Secret{} + err := r.client.Get(ctx, types.NamespacedName{ + Name: constants.RayCATlsSecretName, + Namespace: targetNs, + }, secret) + if err != nil { + if apierrs.IsNotFound(err) { + log.Info("Secret not found, skipping", "name", constants.RayCATlsSecretName, "namespace", targetNs) + } + return err + } + + log.Info("Deleting Secret", "name", constants.RayCATlsSecretName, "namespace", targetNs) + err = r.client.Delete(ctx, secret) + if err != nil { + return err + } + } + return nil +} diff --git a/main.go b/main.go index 846bf9a8..5f25776b 100644 --- a/main.go +++ b/main.go @@ -181,6 +181,14 @@ func main() { os.Exit(1) } + if err = (controllers.NewKServeRayTlsReconciler( + mgr.GetClient(), + ctrl.Log.WithName("controllers").WithName("KServeRayTls"))). 
+ SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "KServeRayTls") + os.Exit(1) + } + if monitoringNS != "" { setupLog.Info("Monitoring namespace provided, setting up monitoring controller.") if err = (&controllers.MonitoringReconciler{ From 8c974e3d0203965ed5fa911defdccc3b68523730 Mon Sep 17 00:00:00 2001 From: jooho lee Date: Mon, 25 Nov 2024 17:54:13 -0500 Subject: [PATCH 3/8] add unit test for ray tls Signed-off-by: jooho lee --- controllers/kserve_ray_tls_controller.go | 59 ++-- controllers/kserve_ray_tls_controller_test.go | 238 +++++++++++++ controllers/storageconfig_controller_test.go | 28 -- controllers/suite_test.go | 42 ++- .../configmaps/ray-tls-scripts-updated.yaml | 70 ++++ .../testdata/configmaps/ray-tls-scripts.yaml | 69 ++++ .../deploy/vllm-multinode-servingruntime.yaml | 320 ++++++++++++++++++ .../testdata/secrets/ray-ca-cert-updated.yaml | 13 + controllers/testdata/secrets/ray-ca-cert.yaml | 13 + 9 files changed, 797 insertions(+), 55 deletions(-) create mode 100644 controllers/kserve_ray_tls_controller_test.go create mode 100644 controllers/testdata/configmaps/ray-tls-scripts-updated.yaml create mode 100644 controllers/testdata/configmaps/ray-tls-scripts.yaml create mode 100644 controllers/testdata/deploy/vllm-multinode-servingruntime.yaml create mode 100644 controllers/testdata/secrets/ray-ca-cert-updated.yaml create mode 100644 controllers/testdata/secrets/ray-ca-cert.yaml diff --git a/controllers/kserve_ray_tls_controller.go b/controllers/kserve_ray_tls_controller.go index 44b1dfa1..247741a3 100644 --- a/controllers/kserve_ray_tls_controller.go +++ b/controllers/kserve_ray_tls_controller.go @@ -49,46 +49,46 @@ func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRay func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := r.log controllerNs := os.Getenv("POD_NAMESPACE") - areAllServingRuntimesDeletedInNs := 
false + removeRayResources := false var servingRuntimeList kservev1alpha1.ServingRuntimeList - listOptions := &client.ListOptions{ - Namespace: req.Namespace, - } - if err := r.client.List(ctx, &servingRuntimeList, listOptions); err == nil && len(servingRuntimeList.Items) == 0 { - areAllServingRuntimesDeletedInNs = true - } else if err != nil { + + if err := r.client.List(ctx, &servingRuntimeList); err != nil { return ctrl.Result{}, err } if req.Name == constants.RayTlsScriptConfigMapName { if req.Namespace == controllerNs { - if !areAllServingRuntimesDeletedInNs { - log.Info("Original Ray TLS scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) - for _, sr := range servingRuntimeList.Items { + log.Info("Original Ray TLS scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) + for _, sr := range servingRuntimeList.Items { + if sr.Name == "vllm-multinode-runtime" { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "ConfigMap"); err != nil { return ctrl.Result{}, err } } } + } else { + removeRayResources = !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) } - if err := r.reconcileRayTlsScriptConfigMap(ctx, log, controllerNs, req.Namespace, areAllServingRuntimesDeletedInNs); err != nil { + if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, removeRayResources); err != nil { return ctrl.Result{}, err } } else if req.Name == constants.RayCATlsSecretName { if req.Namespace == controllerNs { - if !areAllServingRuntimesDeletedInNs { - log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) - for _, sr := range servingRuntimeList.Items { + log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) + for _, sr := range servingRuntimeList.Items { + if sr.Name == "vllm-multinode-runtime" { if err := 
r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "Secret"); err != nil { return ctrl.Result{}, err } } } + } else { + removeRayResources = !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) } - if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, areAllServingRuntimesDeletedInNs); err != nil { + if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, removeRayResources); err != nil { return ctrl.Result{}, err } } else { @@ -103,24 +103,21 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) isMultiNodeServingRuntime := sr != nil && sr.Name == "vllm-multinode-runtime" - isRemoved := areAllServingRuntimesDeletedInNs || !isMultiNodeServingRuntime + removeRayResources := !isMultiNodeServingRuntime // Log and reconcile Ray TLS scripts ConfigMap - err = r.reconcileRayTlsScriptConfigMap(ctx, log, controllerNs, req.Namespace, isRemoved) + err = r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, removeRayResources) if err != nil { return ctrl.Result{}, err } // Log and reconcile Ray CA Cert Secret - err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, isRemoved) + err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, removeRayResources) if err != nil { return ctrl.Result{}, err } - } - return ctrl.Result{}, nil - } func checkRayTLSResource(objectName string) bool { @@ -167,9 +164,9 @@ func (r *KServeRayTlsReconciler) SetupWithManager(mgr ctrl.Manager) error { return builder.Complete(r) } -// reconcileRayTlsScriptConfigMap watch ray-tls-scripts configmap in the controller namespace -// and it will create/update/delete ray-tls-scripts configmap in the namespace where multi-node ServingRuntime created -func (r 
*KServeRayTlsReconciler) reconcileRayTlsScriptConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { +// reconcileRayTlsScriptsConfigMap watch ray-tls-scripts configmap in the controller namespace +// and it will create/update/delete ray-tls-scripts configmap in the namespace where multinode ServingRuntime created +func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { // When original configmap is updated, it does not need to reconcile if ctrlNs == targetNs { return nil @@ -277,7 +274,7 @@ func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, srRemoved boo } // reconcileRayCACertSecret watch ray-ca-cert secret in the controller namespaces -// and it will create/update/delete ray-ca-cert secret in the namespace where multi-node ServingRuntime created +// and it will create/update/delete ray-ca-cert secret in the namespace where multinode ServingRuntime created func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { // When original secret is updated, it does not need to reconcile if ctrlNs == targetNs { @@ -425,3 +422,15 @@ func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, } return nil } + +// Determine if ServingRuntime matches specific conditions +// TO-DO upstream Kserve 0.15 will have a new API WorkerSpec +// So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) +func existMultiNodeServingRuntimeInNs(targetNs string, srList kservev1alpha1.ServingRuntimeList) bool { + for _, sr := range srList.Items { + if sr.Namespace == targetNs && sr.Name == "vllm-multinode-runtime" { + return true + } + } + return false +} diff --git a/controllers/kserve_ray_tls_controller_test.go 
b/controllers/kserve_ray_tls_controller_test.go new file mode 100644 index 00000000..d0d0a0c8 --- /dev/null +++ b/controllers/kserve_ray_tls_controller_test.go @@ -0,0 +1,238 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "time" + + kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "github.com/opendatahub-io/odh-model-controller/controllers/constants" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" +) + +const ( + multinodeServingRuntimePath = "./testdata/deploy/vllm-multinode-servingruntime.yaml" + rayTlsScriptsPath = "./testdata/configmaps/ray-tls-scripts.yaml" + rayTlsScriptsUpdatedPath = "./testdata/configmaps/ray-tls-scripts-updated.yaml" + rayCaCertPath = "./testdata/secrets/ray-ca-cert.yaml" + rayCaCertUpdatedPath = "./testdata/secrets/ray-ca-cert-updated.yaml" +) + +var _ = Describe("KServe Ray TLS controller", func() { + ctx := context.Background() + + Context("when a multinode ServingRuntime created", func() { + It("should create a 'ray-ca-cert' secret and 'ray-tls-scripts' configmap in the namespace where the SR exist", func() { + testNamespace := Namespaces.Create(cli) + testNs := testNamespace.Name + + // Create ray tls resource + rayTlsScriptsConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) + Expect(err).NotTo(HaveOccurred()) + 
rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) + + rayCaCertSecret := &corev1.Secret{} + err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) + Expect(err).NotTo(HaveOccurred()) + rayCaCertSecret.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) + + By("creating multinode ServingRuntime") + multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + multinodeServingRuntime.SetNamespace(testNs) + Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + + _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Context("when a multinode ServingRuntime exists", func() { + var testNs string + + BeforeEach(func() { + testNamespace := Namespaces.Create(cli) + testNs = testNamespace.Name + + // Create ray tls resources + rayTlsScriptsConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) + Expect(err).NotTo(HaveOccurred()) + rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) + + rayCaCertSecret := &corev1.Secret{} + err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) + Expect(err).NotTo(HaveOccurred()) + rayCaCertSecret.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) + + // Create a multinode servingruntime + multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + 
multinodeServingRuntime.SetNamespace(testNs) + Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + + _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should create a 'ray-ca-cert' secret when it is removed manually", func() { + secret := &corev1.Secret{} + err := cli.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: testNs}, secret) + Expect(err).NotTo(HaveOccurred()) + + By("deleting a 'ray-ca-cert' secret in the namespace") + Expect(cli.Delete(ctx, secret)).To(Succeed()) + + // Check if 'ray-ca-cert' secret is recreated + _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + It("should create a 'ray-tls-scripts' configmap when it is removed manually", func() { + configMap := &corev1.ConfigMap{} + err := cli.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: testNs}, configMap) + Expect(err).NotTo(HaveOccurred()) + + By("deleting a 'ray-tls-scripts' configMap in the namespace") + Expect(cli.Delete(ctx, configMap)).To(Succeed()) + + // Check if 'ray-tls-scripts' configmap is recreated + _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + It("should 'ray-tls-scripts' configmap in the namespace when original one updated", func() { + By("updating 'ray-tls-scripts configmap") + rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) + Expect(err).NotTo(HaveOccurred()) + rayTlsScriptsUpdatedConfigMap.SetNamespace(WorkingNamespace) + Expect(cli.Update(ctx, rayTlsScriptsUpdatedConfigMap)).Should(Succeed()) + + _, err = 
waitForConfigMap(cli, WorkingNamespace, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + + // Check if 'ray-tls-scripts' configmap is updated. + Eventually(func() bool { + updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + + return compareConfigMap(rayTlsScriptsUpdatedConfigMap, updatedConfigMapFromTestNs) + + }).WithTimeout(30 * time.Second).Should(BeTrue()) + + }) + It("should update a 'ray-ca-cert' secret in the namespace when original one updated", func() { + By("updating 'ray-ca-cert secret") + rayCaCertUpdatedSecret := &corev1.Secret{} + err := convertToStructuredResource(rayCaCertUpdatedPath, rayCaCertUpdatedSecret) + Expect(err).NotTo(HaveOccurred()) + rayCaCertUpdatedSecret.SetNamespace(WorkingNamespace) + Expect(cli.Update(ctx, rayCaCertUpdatedSecret)).Should(Succeed()) + + _, err = waitForSecret(cli, WorkingNamespace, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + + // Check if 'ray-ca-cert' secert is updated. 
+ Eventually(func() bool { + updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + return compareSecrets(rayCaCertUpdatedSecret, updatedSecretFromTestNs) + }).WithTimeout(30 * time.Second).Should(BeTrue()) + }) + }) + Context("when a multinode ServingRuntime removed", func() { + var testNs string + BeforeEach(func() { + testNamespace := Namespaces.Create(cli) + testNs = testNamespace.Name + + // Create ray tls resources + rayTlsScriptsConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) + Expect(err).NotTo(HaveOccurred()) + rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) + + rayCaCertSecret := &corev1.Secret{} + err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) + Expect(err).NotTo(HaveOccurred()) + rayCaCertSecret.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) + + // Create a multinode servingruntime + multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + multinodeServingRuntime.SetNamespace(testNs) + Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + + _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + It("ray resources should not be removed if there is a multinode ServingRuntime in the namespace", func() { + By("creating another multinode servingruntime for test") + // Create another multinode servingruntime + multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err := 
convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + multinodeServingRuntime.SetNamespace(testNs) + multinodeServingRuntime.SetName("another-multinode-servingruntime") + Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + + By("deleting one multinode servingruntime") + Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) + + // Check if all ray resources are NOT removed + _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + }) + It("ray resources should be removed if there is no multinode ServingRuntime in the namespace", func() { + By("deleting a multinode servingruntime") + multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err := convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + multinodeServingRuntime.SetNamespace(testNs) + Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) + + // Check if all ray resources are removed + configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).To(HaveOccurred()) + Expect(configmap).To(BeNil()) + + secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).To(HaveOccurred()) + Expect(secret).To(BeNil()) + }) + }) +}) diff --git a/controllers/storageconfig_controller_test.go b/controllers/storageconfig_controller_test.go index 8ab1ea68..d78033a5 100644 --- a/controllers/storageconfig_controller_test.go +++ b/controllers/storageconfig_controller_test.go @@ -19,7 +19,6 @@ import ( "context" "fmt" "log" - "reflect" "time" corev1 "k8s.io/api/core/v1" @@ -266,30 +265,3 @@ func updateSecretLabel(cli client.Client, namespace, secretName 
string, labelKey return nil } -func waitForSecret(cli client.Client, namespace, secretName string, maxTries int, delay time.Duration) (*corev1.Secret, error) { - time.Sleep(delay) - - ctx := context.Background() - secret := &corev1.Secret{} - for try := 1; try <= maxTries; try++ { - err := cli.Get(ctx, client.ObjectKey{Namespace: namespace, Name: secretName}, secret) - if err == nil { - return secret, nil - } - if !apierrs.IsNotFound(err) { - return nil, fmt.Errorf("failed to get secret %s/%s: %v", namespace, secretName, err) - } - - if try < maxTries { - time.Sleep(1 * time.Second) - return nil, err - } - } - return secret, nil -} - -// compareSecrets checks if two Secret data are equal, if not return false -func compareSecrets(s1 *corev1.Secret, s2 *corev1.Secret) bool { - // Two Secret will be equal if the data is identical - return reflect.DeepEqual(s1.Data, s2.Data) -} diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 037e2f2f..1d7567a6 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -21,6 +21,7 @@ import ( "math/rand" "os" "path/filepath" + "reflect" "testing" "time" @@ -163,6 +164,7 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) + // add reconcilers err = (NewOpenshiftInferenceServiceReconciler( mgr.GetClient(), mgr.GetAPIReader(), @@ -196,6 +198,12 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) + err = (&KServeRayTlsReconciler{ + client: cli, + log: ctrl.Log.WithName("controllers").WithName("KServe-Ray-TLS-Controller"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + kclient, _ := kubernetes.NewForConfig(cfg) err = (&NimAccountReconciler{ Client: cli, @@ -321,10 +329,40 @@ func waitForConfigMap(cli client.Client, namespace, configMapName string, maxTri return nil, fmt.Errorf("failed to get configmap %s/%s: %v", namespace, configMapName, err) } - if try > maxTries { + if try < maxTries { time.Sleep(1 * time.Second) - return nil, 
err + } else { + return nil, fmt.Errorf("namespace: %s, err: %v", namespace, err) } } return configMap, nil } + +func waitForSecret(cli client.Client, namespace, secretName string, maxTries int, delay time.Duration) (*corev1.Secret, error) { + time.Sleep(delay) + + ctx := context.Background() + secret := &corev1.Secret{} + for try := 1; try <= maxTries; try++ { + err := cli.Get(ctx, client.ObjectKey{Namespace: namespace, Name: secretName}, secret) + if err == nil { + return secret, nil + } + if !apierrs.IsNotFound(err) { + return nil, fmt.Errorf("failed to get secret %s/%s: %v", namespace, secretName, err) + } + + if try < maxTries { + time.Sleep(1 * time.Second) + } else { + return nil, err + } + } + return secret, nil +} + +// compareSecrets checks if two Secret data are equal, if not return false +func compareSecrets(s1 *corev1.Secret, s2 *corev1.Secret) bool { + // Two Secret will be equal if the data is identical + return reflect.DeepEqual(s1.Data, s2.Data) +} diff --git a/controllers/testdata/configmaps/ray-tls-scripts-updated.yaml b/controllers/testdata/configmaps/ray-tls-scripts-updated.yaml new file mode 100644 index 00000000..77491ceb --- /dev/null +++ b/controllers/testdata/configmaps/ray-tls-scripts-updated.yaml @@ -0,0 +1,70 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-tls-scripts + labels: + opendatahub.io/managed: 'true' +data: + gencert_ray.sh: | + UPDATED + #!/bin/sh + ## Create tls.key + openssl genrsa -out /etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl + + ## Generate tls.cert + openssl x509 -req \ + -in /etc/ray/tls/ca.csr \ + -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ + -days 36500 \ + -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/controllers/testdata/configmaps/ray-tls-scripts.yaml b/controllers/testdata/configmaps/ray-tls-scripts.yaml new file mode 100644 index 00000000..df4c00ce --- 
/dev/null +++ b/controllers/testdata/configmaps/ray-tls-scripts.yaml @@ -0,0 +1,69 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-tls-scripts + labels: + opendatahub.io/managed: 'true' +data: + gencert_ray.sh: | + #!/bin/sh + ## Create tls.key + openssl genrsa -out /etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl + + ## Generate tls.cert + openssl x509 -req \ + -in /etc/ray/tls/ca.csr \ + -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ + -days 36500 \ + -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml b/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml new file mode 100644 index 00000000..acfeb4fc --- /dev/null +++ b/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml @@ -0,0 +1,320 @@ + +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: vllm-multinode-runtime + namespace: default +spec: + annotations: + prometheus.io/port: '8080' + prometheus.io/path: '/metrics' + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM + priority: 2 + containers: + - name: kserve-container + image: quay.io/opendatahub/vllm:fast + command: ['bash', '-c'] + args: + - | + # Generate self signed certificate + if [[ $RAY_USE_TLS == "1" ]]; then + /etc/gen/tls/gencert_ray.sh + fi + ray start --head --disable-usage-stats --include-dashboard false + # Wait for other node to join + until [[ $(ray status --address ${RAY_ADDRESS} | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do + echo "Waiting..." 
+ sleep 1 + done + ray status --address ${RAY_ADDRESS + export SERVED_MODEL_NAME=${MODEL_NAME} + export MODEL_NAME=${MODEL_DIR} + exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce + env: + - name: RAY_USE_TLS + value: '1' + - name: RAY_TLS_SERVER_CERT + value: '/etc/ray/tls/tls.crt' + - name: RAY_TLS_SERVER_KEY + value: '/etc/ray/tls/tls.key' + - name: RAY_TLS_CA_CERT + value: '/etc/ca/tls/ca.crt' + - name: RAY_PORT + value: '6379' + - name: RAY_ADDRESS + value: 127.0.0.1:6379 + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NO_USAGE_STATS + value: '1' + - name: HOME + value: /tmp + - name: HF_HOME + value: /tmp/hf_home + resources: + limits: + cpu: '16' + memory: 48Gi + requests: + cpu: '8' + memory: 24Gi + volumeMounts: + - name: shm + mountPath: /dev/shm + - mountPath: /etc/ca/tls + name: ray-ca-cert + readOnly: true + - mountPath: /etc/ray/tls + name: ray-tls + - mountPath: /etc/gen/tls + name: gen-tls-script + livenessProbe: + failureThreshold: 2 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 15 + exec: + command: + - bash + - -c + - | + # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE + gpu_status=$(ray status --address ${RAY_ADDRESS} | grep GPU) + if [[ -z ${gpu_status} ]]; then + echo "Unhealthy - GPU does not exist" + exit 1 + f + used_gpu=$(echo "${gpu_status}" | awk '{print $1}' | cut -d'/' -f1) + reserved_gpu=$(echo "${gpu_status}" | awk '{print $1}' | cut -d'/' -f2 + # Determine health status based on GPU usage + if [[ "${used_gpu}" != "${reserved_gpu}" ]]; then + echo "Unhealthy - Used: ${used_gpu}, Reserved: ${reserved_gpu}" + exit 1 + fi + readinessProbe: + 
failureThreshold: 2 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 15 + exec: + command: + - bash + - -c + - | + # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE + registered_node_count=$(ray status --address ${RAY_ADDRESS} | grep -c node_) + if [[ ${registered_node_count} -ne "${PIPELINE_PARALLEL_SIZE}" ]]; then + echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." + exit 1 + f + # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE + gpu_status=$(ray status --address ${RAY_ADDRESS} | grep GPU) + if [[ -z ${gpu_status} ]]; then + echo "Unhealthy - GPU does not exist" + exit 1 + f + used_gpu=$(echo "${gpu_status}" | awk '{print $1}' | cut -d'/' -f1) + reserved_gpu=$(echo "${gpu_status}" | awk '{print $1}' | cut -d'/' -f2 + # Determine health status based on GPU usage + if [[ "${used_gpu}" != "${reserved_gpu}" ]]; then + echo "Unhealthy - Used: ${used_gpu}, Reserved: ${reserved_gpu}" + exit 1 + f + # Check model health + health_check=$(curl -o /dev/null -s -w "%{http_code}\n" http://localhost:8080/health) + if [[ ${health_check} != 200 ]]; then + echo "Unhealthy - vLLM Runtime Health Check failed." + exit 1 + fi + startupProbe: + failureThreshold: 40 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 30 + initialDelaySeconds: 20 + exec: + command: + - bash + - -c + - | + # This need when head node have issues and restarted. + # It will wait for new worker node. + registered_node_count=$(ray status --address ${RAY_ADDRESS} | grep -c node_) + if [[ ${registered_node_count} -ne "${PIPELINE_PARALLEL_SIZE}" ]]; then + echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." + exit 1 + f + # Double check to make sure Model is ready to serve. 
+ for i in 1 2; do + # Check model health + health_check=$(curl -o /dev/null -s -w "%{http_code}\n" http://localhost:8080/health) + if [[ ${health_check} != 200 ]]; then + echo "Unhealthy - vLLM Runtime Health Check failed." + exit 1 + fi + done + ports: + - containerPort: 8080 + name: http + protocol: TCP + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: 12Gi + - name: ray-ca-cert + secret: + secretName: ray-ca-cert + - name: ray-tls + emptyDir: {} + # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional + - name: gen-tls-script + configMap: + name: ray-tls-scripts + defaultMode: 0777 + # An array of keys from the ConfigMap to create as files + items: + - key: gencert_ray.sh + path: gencert_ray.sh + workerSpec: + pipelineParallelSize: 2 + tensorParallelSize: 1 + containers: + - name: worker-container + image: quay.io/opendatahub/vllm:fast + command: ['bash', '-c'] + args: + - | + # Generate self signed certificate + if [[ $RAY_USE_TLS == "1" ]]; then + /etc/gen/tls/gencert_ray.sh + fi + SECONDS= + while true; do + if (( SECONDS <= 240 )); then + if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then + echo "Global Control Service(GCS) is ready." + break + fi + echo "$SECONDS seconds elapsed: Waiting for Global Control Service(GCS) to be ready." + else + if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then + echo "Global Control Service(GCS) is ready. Any error messages above can be safely ignored." + break + fi + echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready." 
+ echo "For troubleshooting, refer to the FAQ at https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/troubleshooting.html#kuberay-troubleshootin-guides" + f + sleep 5 + don + export RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" + echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..." + ray start --address="${RAY_HEAD_ADDRESS}" --block + env: + - name: RAY_USE_TLS + value: '1' + - name: RAY_TLS_SERVER_CERT + value: '/etc/ray/tls/tls.crt' + - name: RAY_TLS_SERVER_KEY + value: '/etc/ray/tls/tls.key' + - name: RAY_TLS_CA_CERT + value: '/etc/ca/tls/ca.crt' + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + limits: + cpu: '16' + memory: 48Gi + requests: + cpu: '8' + memory: 24Gi + volumeMounts: + - name: shm + mountPath: /dev/shm + - mountPath: /etc/ca/tls + name: ray-ca-cert + readOnly: true + - mountPath: /etc/ray/tls + name: ray-tls + - mountPath: /etc/gen/tls + name: gen-tls-script + livenessProbe: + failureThreshold: 2 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 15 + exec: + command: + - bash + - -c + - | + # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE + registered_node_count=$(ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 | grep -c node_) + if [[ ${registered_node_count} -ne "${PIPELINE_PARALLEL_SIZE}" ]]; then + echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." 
+ exit 1 + fi + startupProbe: + failureThreshold: 40 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 30 + initialDelaySeconds: 20 + exec: + command: + - /bin/sh + - -c + - | + registered_node_count=$(ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 | grep -c node_) + if [[ ${registered_node_count} -ne "${PIPELINE_PARALLEL_SIZE}" ]]; then + echo "Unhealthy - Registered nodes count (${registered_node_count}) does not match PIPELINE_PARALLEL_SIZE (${PIPELINE_PARALLEL_SIZE})." + exit 1 + fi + # Double check to make sure Model is ready to serve. + for i in 1 2; do + # Check model health + model_health_check=$(curl -s ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:8080/v1/models|grep -o ${ISVC_NAME}) + if [[ ${model_health_check} != "${ISVC_NAME}" ]]; then + echo "Unhealthy - vLLM Runtime Health Check failed." + exit 1 + fi + sleep 10 + done + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: 12Gi + - name: ray-tls + emptyDir: {} + - name: ray-ca-cert + secret: + secretName: ray-ca-cert + # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional + - name: gen-tls-script + configMap: + name: ray-tls-scripts + defaultMode: 0777 + # An array of keys from the ConfigMap to create as files + items: + - key: gencert_ray.sh + path: gencert_ray.s diff --git a/controllers/testdata/secrets/ray-ca-cert-updated.yaml b/controllers/testdata/secrets/ray-ca-cert-updated.yaml new file mode 100644 index 00000000..b590c941 --- /dev/null +++ b/controllers/testdata/secrets/ray-ca-cert-updated.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ray-ca-cert + labels: + opendatahub.io/managed: 'true' +data: + # output from cat ca.crt | base64 + ca.crt: | + 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pX
NmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT1URVNUCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + # output from cat ca.key | base64 + ca.key: | + LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM
2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pXNmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT1URVNUCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K diff --git a/controllers/testdata/secrets/ray-ca-cert.yaml b/controllers/testdata/secrets/ray-ca-cert.yaml new file mode 100644 index 00000000..77335ebf --- /dev/null +++ b/controllers/testdata/secrets/ray-ca-cert.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ray-ca-cert + labels: + opendatahub.io/managed: 'true' +data: + # output from cat ca.crt | base64 + ca.crt: | + 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pX
NmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + # output from cat ca.key | base64 + ca.key: | + LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUpRd0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQ1Mwd2dna3BBZ0VBQW9JQ0FRQy95SW1GbVpodUFqWS8KZjkrY3hBV3NqSnY0SE1ZU0J6L29FdkNqZEw2RjFvdnJ6Q2NGdzBIY0RnVGJSV1pHTGMyU0xLQXg3aHM3WU93MQplSzNyemVEU1BMU1RZT3FHMzVDbEtYZThjS09MQjdyTk1COCtYcVhaTHRQREZ1YkhWVFQwa05oNGxjVzQyVmlhClVHU0duOWdIU1pHWlQxcGs5R1NtVHJVQSs0OWlPSnQ3Z2NabkhQRXVZVytpV1dxREpYRkZNWGVOWmFaK3pwbTkKR2xlQUQzMjJQNkhQcVgzZ3JON3FaNm81V0kzaFFlTHhPY1dER0ZnRE0vaUE0Y1JPS2cza1lyQ3djV3UyUDZVMAphdXZYbWxMTXdmY3NRK3hnS3lOcFIvS3ovbE1ZeFdpTHZwQUZlWnArL1BjSHMvTFMrUmVBUG1VUGFYUFRFZWtLClozWFk4bVJNUDZXWGJ1OUZZKzJBT2RUcmlvOUt0YjdaZThEenFkRTdNcVQvR2Jwdkh4VWVKeVozRG9WeXpIc0oKV2pKUU1WV1VIVzlBTXJRT2pDTGJWbGlkUGY2US9YOHlSbkJiemY1L1lPbFlDZmV0MnAzWVN5OW0wUzdCQ1JaVQp6QTd0aDhwZ2VnUHFCWHFlM0hyS3ZTU0ExM1ZXb3hyS3RvZm5tVHVJS21KNEYzSHdHTW5pSVhQVitxb1RuZjFZCm1IK3NoWWZkNTBiVnkwRExnVU8ydVdOYkJFc0E4OEdHeWJTcC9ySktZZlJ5Z2RlMUJCRFVxVE9LMGVNS3N3ejgKS1NYMk1wTDVHOXNKZURRZ2FJM0lqbTZldEZud2d2a0t0WmVDZkZNMGZDYUdXbXdnZC9aVkthU1pEbytVS202dApOQ1lnTEZ3andXY1BQb0FicmFHd0ZDdUJTSTlIandJREFRQUJBb0lDQUFyYzkwaG9ud3VIWGI3ZmNtU0IxU3JZClZPWWt1WDl6aHQvRWxIb1E5cDNFSSswNWhWaFdCTmpMNjBvYXRuRlhtenk3emZtTWMyRTcyemlPam1OdmpvOGcKY1l4eDlMYmQycG5RWUlBWEJ0eDV5UUxJWUFaSUwySys3NjloRUlLYksvVzQxZG9wN05vekFMQm9MMW1FenlSZgpWS0hFU0ZDMHptS3hNOUpMYllYeWowMm9QbUhBY0NHdGJHdjFrZGZ4RkdjNldrZy80c0tnY05ld3NueUdTb0lICm8zd21ZSnkvSjUxTDF5QlhPL2J2Y1hobHNMd3djamNCQ0FNUUU0aE42UjJKUUwrdDBEWGt2SjBQcnZzRE9wa0kKakdzTlEzMWVPcEpERmdwL21zNlFNWnpObHhwdXNGQTVnNUNkaUpRMHNkSGpOdUtqTXhyeUxKR
k1HY0l0OExEQwpRVzF2akxLR0l1UWtraGwxOWU1S1N2SDdjUjJja0pDME5vTzhnekpudzd0dTRGaHJaK0xQeXF2R3VSYU55a2RmCi9BKzNEOUE2RW1PNWRldFU2RzJkK0l2TmprdG91Z05UalZIUklDbk9oL01zRmlFQXdycHltVVNISzhKTjVpSjIKUm1rNFljNWlXUjhOUWs0Wkh6aVFGSHJSWkh0TW9DcEkvR2ZGcnYyRVE0bFpOOG5tZHdDWDR4a3JObUJ2ZnlIdgpLWW0yMU5VWDc5U3lRbHd5VS9lNUR6eTA3Si9zcTdoVU8xN3hVcXNzTVpZTGZCSFF0VFU0VVAwbnBOZmtxUFM1CjdJRUtIVWwyRlZudXR0THoxc0ZVVHhJTS90aE9lczRtWElrOExiYzI0Yk45VWlteXplVEN1bE83a0hZSDhTVkEKZDJqZFBTZXhZSTdMeWFVNnFHRzVBb0lCQVFEbHlVQk5CaTRNekdxVnh5NjNCY1dyZC9rdVYrYTFLQ3MyYVhzagpLbVlMT0xrSkhUSjI0YU9EWkJBVGxEMllwclZEOUM1UThTeDdPQVdlQ3FqWHd1MndsOTlabXNIVkxiQTZMRUZ6CnBoYTNQVHhkaWFpMElwZVY2ZFpIQnQrdjVDVGsvSnpxSVpjc1J2REFnNTFHYzgxbERxTzFNbnVqbldBcGpSMmMKd05ZVXd6a3hicHVTc1ZHYzFBZ09tVHBHN1MrdWVVQ2FGU0NVaGkyVWVoblM5dkNrU3Y0QTZMRlpiaXhFeWp6aApycU9mN1d1TTVUWkFoTGM2RTJUQnVOeWJlWW9DblFMdHF3dnUxaFhDOGU4TGlQWFRlMVJ4U2x5dXA5RDhiWEZBCjVPVmFUZjAzcFFweURiOXNKeGhLN3FMbUgrSjlUeU5JamhTTUZQT2pKNEJFRTlPOUFvSUJBUURWcVhDMGdCVzUKYlNUWmUzc3l1QVltRi9hVDg1ZFh1NGFTMFBJR09MakE1M2h0RVdLUkJxd1JlU1prSFdtR05uNUIyOXVXTHg2UgpPZjFNOFJkY2NYSnlxMnp1TlBiWkpabllwS0x5N0FjeDBpc1RvMjdpUy9xRS85YndsNUo3QVU5UmZ2K2ZMK2RPCmxqUndRTGUvQ1dSVHVlTlNOSWpPUC96NWRra2J5Z1kvWHZHbmI0RUJheDY4K3J2a0NYbStGdFpXV3VoblM2Uy8KZHh3Ulo2VGRMd09RZTZQSzNzN3F4c2xWNmQ2dmwrSUpwa1VVZmRvWDNyWFlTeGx2cFlQYWJpWEpaVzdQWkZwRQpVQXc0VTFpSzVLMUt5d1ZjaHlhN2tQSlpRNUplS1pUT3lPL1d5ODZLak0vcUd3NUhDR2NOL2VMbDJKUUViUkwvClJiR0pGSmhUalpjN0FvSUJBQXlyNG0zYzcyRXBUSjloMG9PcFA5TksxR1RuMkFNWmFmaWdMSGd0K0Y2YURDb2kKZ0F2cU9YZ2ZabnVONnkrbDBjMGpoQUpXcWx0SkpaWW5oRlFSbmNYbE9oM1kyT09HbDNjOXhZWTVISHVTVnVmWgpsWUlKZms1NERLYnlEQmZJL3ZmWnJsV0M4TEV5WUVoZGVhak83ZjZxcGdCeC9qdHhqRUgrVkNtMndKZDRoSWpqClRwVHlUa3ZWclhRUW94UVNORlRzdnRGQVpRR0x2S3U1Wi84b092RDBhYmxuRzVDUThNUUNXd1VlK2tyeGJzTGcKU1BPWjNmakg1UUNCenppTHBUNnJwZU94VVFFa3NTS0U4T2V6NzhwdnZLSmF0VzIwTjJRVUxQQ2xMcmlpSUZxWApNVkpFeTgrTkFGdnhlTzR6eCt1ZEY1Y0Nyc05pekdTczR2ZmVHQWtDZ2dFQkFMdFRnbWdPd0gxQlR3U0t1Ym4vCkZBMEVCNEV5R2FlbTExY1RjSTY1M21ucXgyL0F4VTFucnlibXRCMGttR2Mra2JYR1FDRE5rUnc4M25NK
0VZQlEKU3NwMHQ5MmxmQ05vVHhsZFJ5eDZlZGhaYnNFYUVsYS96SllkQk9NTjBUU2RNbUMrV3ZuRGN5WTRsU014NnFmSQpZVGp6Q25ZQmIweDlWNXVUOUljenVnU0hocEdKTm03Njd3azdQODZ2N0JnWVI3V1FvS0FuOXZxVFFIMldCRHFVClJLakJiaHFvL0h0azdCS3lLRGFGa0gxclZMZWhtN3cvMitrVjl1Z25FcEpJN2tKRDkwSkh0c2liOGdyVU1CWWUKWmp6a0FRQmQwanl5MlhnZndVMWpZWDluTnJoNUdjM3BwVVNZa2d6L05mTlRmRUtPZnovZUxjQzM1dTdMcXIzZQpydzhDZ2dFQkFMT2tsTkJNRVBmM20yTXBjaVRjRmNKb08vZzBMUUpHaTJtWkN6S1g3eDJFS0N2N1ZvVWVtRkk0CjRkRFVmSlBJWlBFTUpkTHRSUy9qUDEyZWkxek9lWHIrVGlUTklpUUVoemRtL0RZWUdjd2hyb0xLNDZVTFJKY0YKYzdxZ2xNQ1Z1MW9DTmtDdTJvZ08renczRm9makJzK1pqcE1BS2kyOTZ2ZDk2YVlYNThYR0RKekdmdjhuZEF1dwpEUmU1ZE5oQU5iaHZqSlM1VXJwNnhoMVMycTNYOHorTlFGWW9CNDM1Q2NXNW50WWMzemIxYVdzY0NxMWJsUGJGCjc0QTFLTHJNNlpvU0ZlcUVWZzhvajhpWjlDaitiTTJXYm9BREIvRTROM0kyNmFDK1dDRWxtdTd3ZDdQaExQT2IKN3RrTXh2Zm10dDE5T2dYbTRKZm9SZWlkMTNYbHFoZz0KLS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQo= From d3e769d5d315fcc8afc8eff81222269165e1883c Mon Sep 17 00:00:00 2001 From: jooho lee Date: Mon, 25 Nov 2024 18:41:19 -0500 Subject: [PATCH 4/8] enhance logic Signed-off-by: jooho lee --- controllers/kserve_ray_tls_controller.go | 82 +++++++++--------------- 1 file changed, 31 insertions(+), 51 deletions(-) diff --git a/controllers/kserve_ray_tls_controller.go b/controllers/kserve_ray_tls_controller.go index 247741a3..0e5f81ad 100644 --- a/controllers/kserve_ray_tls_controller.go +++ b/controllers/kserve_ray_tls_controller.go @@ -49,70 +49,57 @@ func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRay func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := r.log controllerNs := os.Getenv("POD_NAMESPACE") - removeRayResources := false - var servingRuntimeList kservev1alpha1.ServingRuntimeList - if err := r.client.List(ctx, &servingRuntimeList); err != nil { return ctrl.Result{}, err } + noMultiNodeSrExists := !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) if req.Name == constants.RayTlsScriptConfigMapName { if 
req.Namespace == controllerNs { log.Info("Original Ray TLS scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) for _, sr := range servingRuntimeList.Items { + // Determine if ServingRuntime matches specific conditions + // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec + // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) if sr.Name == "vllm-multinode-runtime" { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "ConfigMap"); err != nil { return ctrl.Result{}, err } } } - } else { - removeRayResources = !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) } - if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, removeRayResources); err != nil { + if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExists); err != nil { return ctrl.Result{}, err } } else if req.Name == constants.RayCATlsSecretName { if req.Namespace == controllerNs { log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) for _, sr := range servingRuntimeList.Items { + // Determine if ServingRuntime matches specific conditions + // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec + // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) if sr.Name == "vllm-multinode-runtime" { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "Secret"); err != nil { return ctrl.Result{}, err } } } - } else { - removeRayResources = !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) } - if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, removeRayResources); err != nil { + if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, 
noMultiNodeSrExists); err != nil { return ctrl.Result{}, err } } else { - sr := &kservev1alpha1.ServingRuntime{} - err := r.client.Get(ctx, req.NamespacedName, sr) - if err != nil && apierrs.IsNotFound(err) { - } else if err != nil { - return ctrl.Result{}, err - } - - // Determine if ServingRuntime matches specific conditions - // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec - // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) - isMultiNodeServingRuntime := sr != nil && sr.Name == "vllm-multinode-runtime" - removeRayResources := !isMultiNodeServingRuntime - // Log and reconcile Ray TLS scripts ConfigMap - err = r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, removeRayResources) + err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) if err != nil { return ctrl.Result{}, err } // Log and reconcile Ray CA Cert Secret - err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, removeRayResources) + err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) if err != nil { return ctrl.Result{}, err } @@ -166,7 +153,7 @@ func (r *KServeRayTlsReconciler) SetupWithManager(mgr ctrl.Manager) error { // reconcileRayTlsScriptsConfigMap watch ray-tls-scripts configmap in the controller namespace // and it will create/update/delete ray-tls-scripts configmap in the namespace where multinode ServingRuntime created -func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { +func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultinodeSRExistInNs bool) error { // When original 
configmap is updated, it does not need to reconcile if ctrlNs == targetNs { return nil @@ -197,7 +184,7 @@ func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Con } // Process Delta - if err = r.processDeltaConfigMap(ctx, log, desiredConfigMapResource, existingConfigMapResource, srRemoved); err != nil { + if err = r.processDeltaConfigMap(ctx, log, desiredConfigMapResource, existingConfigMapResource, noMultinodeSRExistInNs); err != nil { return err } @@ -223,10 +210,10 @@ func (r *KServeRayTlsReconciler) createDesiredConfigMapResource(destNs string, s return desiredConfigMap, nil } -func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log logr.Logger, desiredConfigMapResource *corev1.ConfigMap, existingConfigMapResource *corev1.ConfigMap, srRemoved bool) (err error) { +func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log logr.Logger, desiredConfigMapResource *corev1.ConfigMap, existingConfigMapResource *corev1.ConfigMap, noMultinodeSRExistInNs bool) (err error) { hasChanged := false - if shouldAddRayConfigMap(existingConfigMapResource, srRemoved) { + if shouldAddRayConfigMap(existingConfigMapResource, noMultinodeSRExistInNs) { hasChanged = true log.V(1).Info("Delta found", "create", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) if err = r.client.Create(ctx, desiredConfigMapResource); err != nil { @@ -238,14 +225,13 @@ func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log hasChanged = true log.V(1).Info("Delta found", "update", existingConfigMapResource.GetName(), "namespace", existingConfigMapResource.Namespace) rp := desiredConfigMapResource.DeepCopy() - rp.Labels = existingConfigMapResource.Labels if err = r.client.Update(ctx, rp); err != nil { return err } } - if shouldDeleteRayConfigMap(existingConfigMapResource, srRemoved) { + if shouldDeleteRayConfigMap(existingConfigMapResource, noMultinodeSRExistInNs) { hasChanged = true 
log.V(1).Info("Delta found", "remove", existingConfigMapResource.GetName(), "namespace", existingConfigMapResource.Namespace) if err = r.client.Delete(ctx, existingConfigMapResource); err != nil { @@ -253,7 +239,7 @@ func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log } } - if !hasChanged && !srRemoved { + if !hasChanged && !noMultinodeSRExistInNs { log.V(1).Info("No delta found", "name", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) return nil } @@ -261,21 +247,19 @@ func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log return nil } -func shouldAddRayConfigMap(existingConfigMap *corev1.ConfigMap, srRemoved bool) bool { - return !srRemoved && utils.IsNil(existingConfigMap) +func shouldAddRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultinodeSRExistInNs bool) bool { + return !noMultinodeSRExistInNs && utils.IsNil(existingConfigMap) } - func isUpdatedRayConfigMap(desiredConfigMap *corev1.ConfigMap, existingConfigMap *corev1.ConfigMap) bool { return utils.IsNotNil(existingConfigMap) && !reflect.DeepEqual(desiredConfigMap.Data, existingConfigMap.Data) } - -func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, srRemoved bool) bool { - return utils.IsNotNil(existingConfigMap) && srRemoved +func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultinodeSRExistInNs bool) bool { + return utils.IsNotNil(existingConfigMap) && noMultinodeSRExistInNs } // reconcileRayCACertSecret watch ray-ca-cert secret in the controller namespaces // and it will create/update/delete ray-ca-cert secret in the namespace where multinode ServingRuntime created -func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, srRemoved bool) error { +func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultinodeSRExistInNs bool) error 
{ // When original secret is updated, it does not need to reconcile if ctrlNs == targetNs { return nil @@ -305,7 +289,7 @@ func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, l } // Process Delta - if err = r.processDeltaSecret(ctx, log, desiredSecretResource, existingSecretResource, srRemoved); err != nil { + if err = r.processDeltaSecret(ctx, log, desiredSecretResource, existingSecretResource, noMultinodeSRExistInNs); err != nil { return err } return nil @@ -331,10 +315,10 @@ func (r *KServeRayTlsReconciler) createDesiredSecretResource(destNs string, sour return desiredSecret, nil } -func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log logr.Logger, desiredSecretResource *corev1.Secret, existingSecretResource *corev1.Secret, srRemoved bool) (err error) { +func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log logr.Logger, desiredSecretResource *corev1.Secret, existingSecretResource *corev1.Secret, noMultinodeSRExistInNs bool) (err error) { hasChanged := false - if shouldAddRaySecret(existingSecretResource, srRemoved) { + if shouldAddRaySecret(existingSecretResource, noMultinodeSRExistInNs) { hasChanged = true log.V(1).Info("Delta found", "create", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) if err = r.client.Create(ctx, desiredSecretResource); err != nil { @@ -346,38 +330,34 @@ func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log log hasChanged = true log.V(1).Info("Delta found", "update", existingSecretResource.GetName(), "namespace", existingSecretResource.Namespace) rp := desiredSecretResource.DeepCopy() - rp.Labels = existingSecretResource.Labels if err = r.client.Update(ctx, rp); err != nil { return err } } - if shouldDeletedRaySecret(existingSecretResource, srRemoved) { + if shouldDeletedRaySecret(existingSecretResource, noMultinodeSRExistInNs) { hasChanged = true log.V(1).Info("Delta found", "remove", 
existingSecretResource.GetName(), "namespace", existingSecretResource.Namespace) if err = r.client.Delete(ctx, existingSecretResource); err != nil { return err } } - if !hasChanged && !srRemoved { + if !hasChanged && !noMultinodeSRExistInNs { log.V(1).Info("No delta found", "name", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) return nil } - return nil } -func shouldAddRaySecret(existingSecret *corev1.Secret, srRemoved bool) bool { - return !srRemoved && utils.IsNil(existingSecret) +func shouldAddRaySecret(existingSecret *corev1.Secret, noMultinodeSRExistInNs bool) bool { + return !noMultinodeSRExistInNs && utils.IsNil(existingSecret) } - func isUpdatedRaySecret(desiredSecret *corev1.Secret, existingSecret *corev1.Secret) bool { return utils.IsNotNil(existingSecret) && !reflect.DeepEqual(desiredSecret.Data, existingSecret.Data) } - -func shouldDeletedRaySecret(existingSecret *corev1.Secret, srRemoved bool) bool { - return utils.IsNotNil(existingSecret) && srRemoved +func shouldDeletedRaySecret(existingSecret *corev1.Secret, noMultinodeSRExistInNs bool) bool { + return utils.IsNotNil(existingSecret) && noMultinodeSRExistInNs } func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, log logr.Logger, targetNs string, kind string) error { From 63c9e3050ee777345dbf365bc321a00ee2fb0185 Mon Sep 17 00:00:00 2001 From: jooho lee Date: Mon, 25 Nov 2024 19:05:27 -0500 Subject: [PATCH 5/8] add more unit tests Signed-off-by: jooho lee --- controllers/kserve_ray_tls_controller_test.go | 88 +++++++++++++------ 1 file changed, 60 insertions(+), 28 deletions(-) diff --git a/controllers/kserve_ray_tls_controller_test.go b/controllers/kserve_ray_tls_controller_test.go index d0d0a0c8..1dc996a0 100644 --- a/controllers/kserve_ray_tls_controller_test.go +++ b/controllers/kserve_ray_tls_controller_test.go @@ -39,23 +39,23 @@ var _ = Describe("KServe Ray TLS controller", func() { ctx := context.Background() Context("when a multinode 
ServingRuntime created", func() { - It("should create a 'ray-ca-cert' secret and 'ray-tls-scripts' configmap in the namespace where the SR exist", func() { + It("should create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the namespace where the SR exist", func() { testNamespace := Namespaces.Create(cli) testNs := testNamespace.Name - - // Create ray tls resource + + // Create ray tls resources rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) - + rayCaCertSecret := &corev1.Secret{} err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) Expect(err).NotTo(HaveOccurred()) rayCaCertSecret.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - + By("creating multinode ServingRuntime") multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) @@ -103,19 +103,36 @@ var _ = Describe("KServe Ray TLS controller", func() { Expect(err).NotTo(HaveOccurred()) }) - It("should create a 'ray-ca-cert' secret when it is removed manually", func() { + It("should create a 'ray-ca-cert' Secret when it is removed manually", func() { secret := &corev1.Secret{} err := cli.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: testNs}, secret) Expect(err).NotTo(HaveOccurred()) - By("deleting a 'ray-ca-cert' secret in the namespace") + By("deleting a 'ray-ca-cert' Secret in the namespace") Expect(cli.Delete(ctx, secret)).To(Succeed()) - // Check if 'ray-ca-cert' secret is recreated + // Check if 'ray-ca-cert' Secret is recreated _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("should create a 'ray-tls-scripts' configmap 
when it is removed manually", func() { + It("should rollback 'ray-ca-cert' Secret in the target ns when it is changed", func() { + By("updating existing 'ray-ca-cert' Secret in the namespace") + rayCACertUpdatedSecret := &corev1.Secret{} + err := convertToStructuredResource(rayCaCertUpdatedPath, rayCACertUpdatedSecret) + Expect(err).NotTo(HaveOccurred()) + rayCACertUpdatedSecret.SetNamespace(testNs) + Expect(cli.Update(ctx, rayCACertUpdatedSecret)).Should(Succeed()) + + // Check if 'ray-ca-cert' Secret is rollback + originalRayCaCertSecret, err := waitForSecret(cli, WorkingNamespace, constants.RayCATlsSecretName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + Eventually(func() bool { + updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 1, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + return compareSecrets(originalRayCaCertSecret, updatedSecretFromTestNs) + }, timeout, interval).Should(BeTrue()) + }) + It("should create a 'ray-tls-scripts' ConfigMap when it is removed manually", func() { configMap := &corev1.ConfigMap{} err := cli.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: testNs}, configMap) Expect(err).NotTo(HaveOccurred()) @@ -123,12 +140,30 @@ var _ = Describe("KServe Ray TLS controller", func() { By("deleting a 'ray-tls-scripts' configMap in the namespace") Expect(cli.Delete(ctx, configMap)).To(Succeed()) - // Check if 'ray-tls-scripts' configmap is recreated + // Check if 'ray-tls-scripts' ConfigMap is recreated _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("should 'ray-tls-scripts' configmap in the namespace when original one updated", func() { - By("updating 'ray-tls-scripts configmap") + It("should rollback 'ray-tls-scripts' ConfigMap in the target ns when it is changed", func() { + By("updating existing 'ray-tls-scripts' ConfigMap in the namespace") + 
rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) + Expect(err).NotTo(HaveOccurred()) + rayTlsScriptsUpdatedConfigMap.SetNamespace(testNs) + Expect(cli.Update(ctx, rayTlsScriptsUpdatedConfigMap)).Should(Succeed()) + + // Check if 'ray-tls-scripts' ConfigMap is rollback + originalRayTlsScriptsConfigMap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() bool { + updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 1, 1*time.Second) + Expect(err).NotTo(HaveOccurred()) + return compareConfigMap(originalRayTlsScriptsConfigMap, updatedConfigMapFromTestNs) + }, timeout, interval).Should(BeTrue()) + }) + It("should 'ray-tls-scripts' ConfigMap in the namespace when original one updated", func() { + By("updating original 'ray-tls-scripts' ConfigMap") rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -138,18 +173,15 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForConfigMap(cli, WorkingNamespace, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - // Check if 'ray-tls-scripts' configmap is updated. + // Check if 'ray-tls-scripts' ConfigMap is updated. 
Eventually(func() bool { - updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - return compareConfigMap(rayTlsScriptsUpdatedConfigMap, updatedConfigMapFromTestNs) - - }).WithTimeout(30 * time.Second).Should(BeTrue()) - + }, timeout, interval).Should(BeTrue()) }) - It("should update a 'ray-ca-cert' secret in the namespace when original one updated", func() { - By("updating 'ray-ca-cert secret") + It("should update a 'ray-ca-cert' Secret in the namespace when original one updated", func() { + By("updating original 'ray-ca-cert Secret") rayCaCertUpdatedSecret := &corev1.Secret{} err := convertToStructuredResource(rayCaCertUpdatedPath, rayCaCertUpdatedSecret) Expect(err).NotTo(HaveOccurred()) @@ -161,10 +193,10 @@ var _ = Describe("KServe Ray TLS controller", func() { // Check if 'ray-ca-cert' secert is updated. 
Eventually(func() bool { - updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) return compareSecrets(rayCaCertUpdatedSecret, updatedSecretFromTestNs) - }).WithTimeout(30 * time.Second).Should(BeTrue()) + }, timeout, interval).Should(BeTrue()) }) }) Context("when a multinode ServingRuntime removed", func() { @@ -198,7 +230,7 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray resources should not be removed if there is a multinode ServingRuntime in the namespace", func() { + It("ray tls resources should not be removed if there is a multinode ServingRuntime in the namespace", func() { By("creating another multinode servingruntime for test") // Create another multinode servingruntime multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} @@ -211,21 +243,21 @@ var _ = Describe("KServe Ray TLS controller", func() { By("deleting one multinode servingruntime") Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray resources are NOT removed + // Check if all ray tls resources are NOT removed _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray resources should be removed if there is no multinode ServingRuntime in the namespace", func() { + It("ray tls resources should be removed if there is no multinode ServingRuntime in the namespace", func() { By("deleting a multinode servingruntime") multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err := 
convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) Expect(err).NotTo(HaveOccurred()) - multinodeServingRuntime.SetNamespace(testNs) + multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray resources are removed + // Check if all ray tls resources are removed configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(configmap).To(BeNil()) @@ -233,6 +265,6 @@ var _ = Describe("KServe Ray TLS controller", func() { secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(secret).To(BeNil()) - }) + }) }) }) From d2085fbe9ffaa37787a822f853352d6b18f8363e Mon Sep 17 00:00:00 2001 From: jooho lee Date: Tue, 26 Nov 2024 10:05:40 -0500 Subject: [PATCH 6/8] add more unit tests and cleanup Signed-off-by: jooho lee --- controllers/kserve_ray_tls_controller.go | 102 ++++++++---------- controllers/kserve_ray_tls_controller_test.go | 97 ++++++++++++----- 2 files changed, 113 insertions(+), 86 deletions(-) diff --git a/controllers/kserve_ray_tls_controller.go b/controllers/kserve_ray_tls_controller.go index 0e5f81ad..f8b4bade 100644 --- a/controllers/kserve_ray_tls_controller.go +++ b/controllers/kserve_ray_tls_controller.go @@ -35,17 +35,17 @@ func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRay } // The reconcile logic works as follows: -// ServingRuntime: -// - On creation: The ray-tls-script ConfigMap and ray-ca-cert Secret are created in the respective namespace. -// - On deletion: The ray-tls-script ConfigMap and ray-ca-cert Secret are deleted only when all ServingRuntimes are deleted from the namespace. +// ServingRuntime(multinode): +// - On creation: The ray-tls-script ConfigMap and ray-ca-cert Secret are created in the target namespace. 
+// - On deletion: The ray-tls-script ConfigMap and ray-ca-cert Secret are deleted only when multinode ServingRuntimes are deleted from the target namespace. // ConfigMap: -// - When the original ConfigMap is updated in the control namespace (ctrl ns): The ray-tls-script ConfigMap is deleted and recreated in the namespace where multinode ServingRuntimes exist. -// - When the ConfigMap is deleted in the target namespace (target ns): The ray-tls-script ConfigMap is regenerated. +// - When the original ConfigMap is updated in the control namespace: The ray-tls-scripts ConfigMap is deleted and recreated in the namespace where multinode ServingRuntimes exist. +// - When the ConfigMap is deleted in the target namespace: The ray-tls-scripts ConfigMap will be recreated. // Secret: -// - When the original Secret is updated in the control namespace (ctrl ns): The ray-ca-cert Secret is deleted and recreated in the namespace where multinode ServingRuntimes exist. -// - When the Secret is deleted in the target namespace (target ns): The ray-ca-cert Secret is regenerated. +// - When the original Secret is updated in the control namespace: The ray-ca-cert Secret is deleted and recreated in the namespace where multinode ServingRuntimes exist. +// - When the Secret is deleted in the target namespace: The ray-ca-cert Secret will be recreated. 
func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := r.log controllerNs := os.Getenv("POD_NAMESPACE") @@ -53,52 +53,41 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.client.List(ctx, &servingRuntimeList); err != nil { return ctrl.Result{}, err } - noMultiNodeSrExists := !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) + noMultiNodeSrExistInNs := !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList) if req.Name == constants.RayTlsScriptConfigMapName { if req.Namespace == controllerNs { - log.Info("Original Ray TLS scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) + log.Info("Original Ray TLS Scripts ConfigMap is updated", "name", constants.RayTlsScriptConfigMapName, "namespace", req.Namespace) for _, sr := range servingRuntimeList.Items { - // Determine if ServingRuntime matches specific conditions - // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec - // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) - if sr.Name == "vllm-multinode-runtime" { + if isMultiNodeServingRuntime(sr) { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "ConfigMap"); err != nil { return ctrl.Result{}, err } } } } - - if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExists); err != nil { + if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExistInNs); err != nil { return ctrl.Result{}, err } } else if req.Name == constants.RayCATlsSecretName { if req.Namespace == controllerNs { log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) for _, sr := range servingRuntimeList.Items { - // Determine if ServingRuntime matches specific 
conditions - // TO-DO upstream Kserve 0.15 will have a new API WorkerSpec - // So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) - if sr.Name == "vllm-multinode-runtime" { + if isMultiNodeServingRuntime(sr) { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "Secret"); err != nil { return ctrl.Result{}, err } } } } - - if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExists); err != nil { + if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExistInNs); err != nil { return ctrl.Result{}, err } } else { - // Log and reconcile Ray TLS scripts ConfigMap err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) if err != nil { return ctrl.Result{}, err } - - // Log and reconcile Ray CA Cert Secret err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) if err != nil { return ctrl.Result{}, err @@ -151,15 +140,15 @@ func (r *KServeRayTlsReconciler) SetupWithManager(mgr ctrl.Manager) error { return builder.Complete(r) } -// reconcileRayTlsScriptsConfigMap watch ray-tls-scripts configmap in the controller namespace +// reconcileRayTlsScriptsConfigMap watch ray-tls-scripts configmap in the cluster // and it will create/update/delete ray-tls-scripts configmap in the namespace where multinode ServingRuntime created -func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultinodeSRExistInNs bool) error { +func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultiNodeSrExistInNs bool) error { // When original configmap is updated, it does not need to reconcile if 
ctrlNs == targetNs { return nil } - log.Info("Reconciling Ray TLS scripts ConfigMap", "name", constants.RayTlsScriptConfigMapName, "namespace", targetNs) + log.Info("Reconciling Ray TLS Scripts ConfigMap", "name", constants.RayTlsScriptConfigMapName, "namespace", targetNs) srcConfigMap := &corev1.ConfigMap{} err := r.client.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: ctrlNs}, srcConfigMap) if err != nil { @@ -184,10 +173,9 @@ func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Con } // Process Delta - if err = r.processDeltaConfigMap(ctx, log, desiredConfigMapResource, existingConfigMapResource, noMultinodeSRExistInNs); err != nil { + if err = r.processDeltaConfigMap(ctx, log, desiredConfigMapResource, existingConfigMapResource, noMultiNodeSrExistInNs); err != nil { return err } - return nil } @@ -206,14 +194,13 @@ func (r *KServeRayTlsReconciler) createDesiredConfigMapResource(destNs string, s }, Data: srcConfigmap.Data, } - return desiredConfigMap, nil } -func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log logr.Logger, desiredConfigMapResource *corev1.ConfigMap, existingConfigMapResource *corev1.ConfigMap, noMultinodeSRExistInNs bool) (err error) { +func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log logr.Logger, desiredConfigMapResource *corev1.ConfigMap, existingConfigMapResource *corev1.ConfigMap, noMultiNodeSrExistInNs bool) (err error) { hasChanged := false - if shouldAddRayConfigMap(existingConfigMapResource, noMultinodeSRExistInNs) { + if shouldAddRayConfigMap(existingConfigMapResource, noMultiNodeSrExistInNs) { hasChanged = true log.V(1).Info("Delta found", "create", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) if err = r.client.Create(ctx, desiredConfigMapResource); err != nil { @@ -231,7 +218,7 @@ func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log } } - if 
shouldDeleteRayConfigMap(existingConfigMapResource, noMultinodeSRExistInNs) { + if shouldDeleteRayConfigMap(existingConfigMapResource, noMultiNodeSrExistInNs) { hasChanged = true log.V(1).Info("Delta found", "remove", existingConfigMapResource.GetName(), "namespace", existingConfigMapResource.Namespace) if err = r.client.Delete(ctx, existingConfigMapResource); err != nil { @@ -239,27 +226,26 @@ func (r *KServeRayTlsReconciler) processDeltaConfigMap(ctx context.Context, log } } - if !hasChanged && !noMultinodeSRExistInNs { + if !hasChanged && !noMultiNodeSrExistInNs { log.V(1).Info("No delta found", "name", desiredConfigMapResource.GetName(), "namespace", desiredConfigMapResource.Namespace) - return nil } return nil } -func shouldAddRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultinodeSRExistInNs bool) bool { - return !noMultinodeSRExistInNs && utils.IsNil(existingConfigMap) +func shouldAddRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultiNodeSrExistInNs bool) bool { + return !noMultiNodeSrExistInNs && utils.IsNil(existingConfigMap) } func isUpdatedRayConfigMap(desiredConfigMap *corev1.ConfigMap, existingConfigMap *corev1.ConfigMap) bool { return utils.IsNotNil(existingConfigMap) && !reflect.DeepEqual(desiredConfigMap.Data, existingConfigMap.Data) } -func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultinodeSRExistInNs bool) bool { - return utils.IsNotNil(existingConfigMap) && noMultinodeSRExistInNs +func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultiNodeSrExistInNs bool) bool { + return utils.IsNotNil(existingConfigMap) && noMultiNodeSrExistInNs } -// reconcileRayCACertSecret watch ray-ca-cert secret in the controller namespaces +// reconcileRayCACertSecret watch ray-ca-cert secret in the cluster // and it will create/update/delete ray-ca-cert secret in the namespace where multinode ServingRuntime created -func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs 
string, targetNs string, noMultinodeSRExistInNs bool) error { +func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultiNodeSrExistInNs bool) error { // When original secret is updated, it does not need to reconcile if ctrlNs == targetNs { return nil @@ -289,7 +275,7 @@ func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, l } // Process Delta - if err = r.processDeltaSecret(ctx, log, desiredSecretResource, existingSecretResource, noMultinodeSRExistInNs); err != nil { + if err = r.processDeltaSecret(ctx, log, desiredSecretResource, existingSecretResource, noMultiNodeSrExistInNs); err != nil { return err } return nil @@ -311,14 +297,13 @@ func (r *KServeRayTlsReconciler) createDesiredSecretResource(destNs string, sour Data: sourceSecret.Data, Type: sourceSecret.Type, } - return desiredSecret, nil } -func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log logr.Logger, desiredSecretResource *corev1.Secret, existingSecretResource *corev1.Secret, noMultinodeSRExistInNs bool) (err error) { +func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log logr.Logger, desiredSecretResource *corev1.Secret, existingSecretResource *corev1.Secret, noMultiNodeSrExistInNs bool) (err error) { hasChanged := false - if shouldAddRaySecret(existingSecretResource, noMultinodeSRExistInNs) { + if shouldAddRaySecret(existingSecretResource, noMultiNodeSrExistInNs) { hasChanged = true log.V(1).Info("Delta found", "create", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) if err = r.client.Create(ctx, desiredSecretResource); err != nil { @@ -336,28 +321,27 @@ func (r *KServeRayTlsReconciler) processDeltaSecret(ctx context.Context, log log } } - if shouldDeletedRaySecret(existingSecretResource, noMultinodeSRExistInNs) { + if shouldDeletedRaySecret(existingSecretResource, noMultiNodeSrExistInNs) { hasChanged = true 
log.V(1).Info("Delta found", "remove", existingSecretResource.GetName(), "namespace", existingSecretResource.Namespace) if err = r.client.Delete(ctx, existingSecretResource); err != nil { return err } } - if !hasChanged && !noMultinodeSRExistInNs { + if !hasChanged && !noMultiNodeSrExistInNs { log.V(1).Info("No delta found", "name", desiredSecretResource.GetName(), "namespace", desiredSecretResource.Namespace) - return nil } return nil } -func shouldAddRaySecret(existingSecret *corev1.Secret, noMultinodeSRExistInNs bool) bool { - return !noMultinodeSRExistInNs && utils.IsNil(existingSecret) +func shouldAddRaySecret(existingSecret *corev1.Secret, noMultiNodeSrExistInNs bool) bool { + return !noMultiNodeSrExistInNs && utils.IsNil(existingSecret) } func isUpdatedRaySecret(desiredSecret *corev1.Secret, existingSecret *corev1.Secret) bool { return utils.IsNotNil(existingSecret) && !reflect.DeepEqual(desiredSecret.Data, existingSecret.Data) } -func shouldDeletedRaySecret(existingSecret *corev1.Secret, noMultinodeSRExistInNs bool) bool { - return utils.IsNotNil(existingSecret) && noMultinodeSRExistInNs +func shouldDeletedRaySecret(existingSecret *corev1.Secret, noMultiNodeSrExistInNs bool) bool { + return utils.IsNotNil(existingSecret) && noMultiNodeSrExistInNs } func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, log logr.Logger, targetNs string, kind string) error { @@ -403,14 +387,18 @@ func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, return nil } -// Determine if ServingRuntime matches specific conditions -// TO-DO upstream Kserve 0.15 will have a new API WorkerSpec -// So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) func existMultiNodeServingRuntimeInNs(targetNs string, srList kservev1alpha1.ServingRuntimeList) bool { for _, sr := range srList.Items { - if sr.Namespace == targetNs && sr.Name == "vllm-multinode-runtime" 
{ - return true + if sr.Namespace == targetNs { + return isMultiNodeServingRuntime(sr) } } return false } + +// Determine if ServingRuntime matches specific conditions +// TO-DO upstream Kserve 0.15 will have a new API WorkerSpec +// So for now, it will check servingRuntime name, but after we move to 0.15, it needs to check workerSpec is specified or not.(RHOAIENG-16147) +func isMultiNodeServingRuntime(servingRuntime kservev1alpha1.ServingRuntime) bool { + return servingRuntime.Name == "vllm-multinode-runtime" +} diff --git a/controllers/kserve_ray_tls_controller_test.go b/controllers/kserve_ray_tls_controller_test.go index 1dc996a0..78a44f58 100644 --- a/controllers/kserve_ray_tls_controller_test.go +++ b/controllers/kserve_ray_tls_controller_test.go @@ -17,6 +17,7 @@ package controllers import ( "context" + "k8s.io/apimachinery/pkg/types" "time" kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" @@ -24,7 +25,6 @@ import ( . "github.com/onsi/gomega" "github.com/opendatahub-io/odh-model-controller/controllers/constants" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" ) const ( @@ -38,12 +38,48 @@ const ( var _ = Describe("KServe Ray TLS controller", func() { ctx := context.Background() + Context("when a non-multinode ServingRuntime created", func() { + It("should not create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the testNs", func() { + testNamespace := Namespaces.Create(cli) + testNs := testNamespace.Name + + // Create ray tls resources in the ctrl namespace + rayTlsScriptsConfigMap := &corev1.ConfigMap{} + err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) + Expect(err).NotTo(HaveOccurred()) + rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) + + rayCaCertSecret := &corev1.Secret{} + err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) + Expect(err).NotTo(HaveOccurred()) + 
rayCaCertSecret.SetNamespace(WorkingNamespace) + Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) + + By("creating non-multinode ServingRuntime") + nonMultinodeServingRuntime := &kservev1alpha1.ServingRuntime{} + err = convertToStructuredResource(ServingRuntimePath1, nonMultinodeServingRuntime) + Expect(err).NotTo(HaveOccurred()) + nonMultinodeServingRuntime.SetNamespace(testNs) + Expect(cli.Create(ctx, nonMultinodeServingRuntime)).Should(Succeed()) + + // Check if all ray tls resources are NOT created in the testNs + configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 3, 1*time.Second) + Expect(err).To(HaveOccurred()) + Expect(configmap).To(BeNil()) + + secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 3, 1*time.Second) + Expect(err).To(HaveOccurred()) + Expect(secret).To(BeNil()) + }) + }) + Context("when a multinode ServingRuntime created", func() { - It("should create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the namespace where the SR exist", func() { + It("should create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the testNs where the SR exist", func() { testNamespace := Namespaces.Create(cli) testNs := testNamespace.Name - // Create ray tls resources + // Create ray tls resources in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -63,6 +99,7 @@ var _ = Describe("KServe Ray TLS controller", func() { multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + // Check if all ray tls resources are created in the testNs _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) @@ -74,10 +111,11 @@ var _ = Describe("KServe Ray TLS 
controller", func() { var testNs string BeforeEach(func() { + // Create a test namespace testNamespace := Namespaces.Create(cli) testNs = testNamespace.Name - // Create ray tls resources + // Create ray tls resources in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -90,25 +128,26 @@ var _ = Describe("KServe Ray TLS controller", func() { rayCaCertSecret.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - // Create a multinode servingruntime + // Create a multinode ServingRuntime multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) Expect(err).NotTo(HaveOccurred()) multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) + // Check if all ray tls resources are created in the testNs _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("should create a 'ray-ca-cert' Secret when it is removed manually", func() { + It("should recreate a 'ray-ca-cert' Secret when it is removed manually", func() { secret := &corev1.Secret{} err := cli.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: testNs}, secret) Expect(err).NotTo(HaveOccurred()) - By("deleting a 'ray-ca-cert' Secret in the namespace") + By("deleting a 'ray-ca-cert' Secret in the testNs") Expect(cli.Delete(ctx, secret)).To(Succeed()) // Check if 'ray-ca-cert' Secret is recreated @@ -116,7 +155,7 @@ var _ = Describe("KServe Ray TLS controller", func() { Expect(err).NotTo(HaveOccurred()) }) It("should rollback 'ray-ca-cert' Secret in the target ns when it is changed", func() { 
- By("updating existing 'ray-ca-cert' Secret in the namespace") + By("updating existing 'ray-ca-cert' Secret in the testNs") rayCACertUpdatedSecret := &corev1.Secret{} err := convertToStructuredResource(rayCaCertUpdatedPath, rayCACertUpdatedSecret) Expect(err).NotTo(HaveOccurred()) @@ -137,7 +176,7 @@ var _ = Describe("KServe Ray TLS controller", func() { err := cli.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: testNs}, configMap) Expect(err).NotTo(HaveOccurred()) - By("deleting a 'ray-tls-scripts' configMap in the namespace") + By("deleting a 'ray-tls-scripts' configMap in the testNs") Expect(cli.Delete(ctx, configMap)).To(Succeed()) // Check if 'ray-tls-scripts' ConfigMap is recreated @@ -145,7 +184,7 @@ var _ = Describe("KServe Ray TLS controller", func() { Expect(err).NotTo(HaveOccurred()) }) It("should rollback 'ray-tls-scripts' ConfigMap in the target ns when it is changed", func() { - By("updating existing 'ray-tls-scripts' ConfigMap in the namespace") + By("updating existing 'ray-tls-scripts' ConfigMap in the testNs") rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -162,8 +201,8 @@ var _ = Describe("KServe Ray TLS controller", func() { return compareConfigMap(originalRayTlsScriptsConfigMap, updatedConfigMapFromTestNs) }, timeout, interval).Should(BeTrue()) }) - It("should 'ray-tls-scripts' ConfigMap in the namespace when original one updated", func() { - By("updating original 'ray-tls-scripts' ConfigMap") + It("should 'ray-tls-scripts' ConfigMap in the testNs when original one in the ctrlNs updated", func() { + By("updating original 'ray-tls-scripts' ConfigMap in the ctrlNs") rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -173,15 +212,15 @@ var _ = 
Describe("KServe Ray TLS controller", func() { _, err = waitForConfigMap(cli, WorkingNamespace, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - // Check if 'ray-tls-scripts' ConfigMap is updated. + // Check if 'ray-tls-scripts' ConfigMap in the testNs is updated. Eventually(func() bool { updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) return compareConfigMap(rayTlsScriptsUpdatedConfigMap, updatedConfigMapFromTestNs) - }, timeout, interval).Should(BeTrue()) + }, timeout, interval).Should(BeTrue()) }) - It("should update a 'ray-ca-cert' Secret in the namespace when original one updated", func() { - By("updating original 'ray-ca-cert Secret") + It("should update a 'ray-ca-cert' Secret in the testNs when original one in the ctrlNs updated", func() { + By("updating original 'ray-ca-cert Secret in the ctrlNs") rayCaCertUpdatedSecret := &corev1.Secret{} err := convertToStructuredResource(rayCaCertUpdatedPath, rayCaCertUpdatedSecret) Expect(err).NotTo(HaveOccurred()) @@ -191,21 +230,21 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForSecret(cli, WorkingNamespace, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - // Check if 'ray-ca-cert' secert is updated. + // Check if 'ray-ca-cert' Secret in the testNs is updated. 
Eventually(func() bool { updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) return compareSecrets(rayCaCertUpdatedSecret, updatedSecretFromTestNs) - }, timeout, interval).Should(BeTrue()) + }, timeout, interval).Should(BeTrue()) }) }) - Context("when a multinode ServingRuntime removed", func() { + Context("when a multinode ServingRuntime removed from the testNs", func() { var testNs string BeforeEach(func() { testNamespace := Namespaces.Create(cli) testNs = testNamespace.Name - // Create ray tls resources + // Create ray tls resources in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -218,7 +257,7 @@ var _ = Describe("KServe Ray TLS controller", func() { rayCaCertSecret.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - // Create a multinode servingruntime + // Create a multinode ServingRuntime in the testNs multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) Expect(err).NotTo(HaveOccurred()) @@ -230,9 +269,9 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray tls resources should not be removed if there is a multinode ServingRuntime in the namespace", func() { - By("creating another multinode servingruntime for test") - // Create another multinode servingruntime + It("ray tls resources should not be removed if there is a multinode ServingRuntime in the testNs", func() { + By("creating another multinode ServingRuntime in the testNs") + // Create another multinode ServingRuntime multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err := 
convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) Expect(err).NotTo(HaveOccurred()) @@ -240,7 +279,7 @@ var _ = Describe("KServe Ray TLS controller", func() { multinodeServingRuntime.SetName("another-multinode-servingruntime") Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) - By("deleting one multinode servingruntime") + By("deleting one multinode ServingRuntime in the testNs") Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) // Check if all ray tls resources are NOT removed @@ -249,8 +288,8 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray tls resources should be removed if there is no multinode ServingRuntime in the namespace", func() { - By("deleting a multinode servingruntime") + It("ray tls resources should be removed if there is no multinode ServingRuntime in the testNs", func() { + By("deleting a multinode ServingRuntime") multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err := convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) Expect(err).NotTo(HaveOccurred()) @@ -258,11 +297,11 @@ var _ = Describe("KServe Ray TLS controller", func() { Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) // Check if all ray tls resources are removed - configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) + configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(configmap).To(BeNil()) - secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(secret).To(BeNil()) }) From 797907de98a1efdb97c54bb3b4524b664db8c41f Mon Sep 17 
00:00:00 2001 From: jooho lee Date: Mon, 2 Dec 2024 15:32:21 -0500 Subject: [PATCH 7/8] remove public ca and generate ca by operator Signed-off-by: jooho lee --- config/overlays/odh/kustomization.yaml | 2 +- config/overlays/odh/ray_tls_resources.yaml | 83 --------- config/overlays/odh/ray_tls_script.yaml | 69 +++++++ config/runtimes/vllm-multinode-template.yaml | 14 +- controllers/constants/constants.go | 4 +- controllers/kserve_ray_tls_controller.go | 50 ++--- controllers/kserve_ray_tls_controller_test.go | 102 ++++------- ...dated.yaml => ray-tls-script-updated.yaml} | 4 +- ...y-tls-scripts.yaml => ray-tls-script.yaml} | 4 +- .../deploy/vllm-multinode-servingruntime.yaml | 8 +- .../testdata/secrets/ray-ca-cert-updated.yaml | 9 +- controllers/testdata/secrets/ray-ca-cert.yaml | 13 -- controllers/utils/cert.go | 171 ++++++++++++++++++ 13 files changed, 332 insertions(+), 201 deletions(-) delete mode 100644 config/overlays/odh/ray_tls_resources.yaml create mode 100644 config/overlays/odh/ray_tls_script.yaml rename controllers/testdata/configmaps/{ray-tls-scripts-updated.yaml => ray-tls-script-updated.yaml} (94%) rename controllers/testdata/configmaps/{ray-tls-scripts.yaml => ray-tls-script.yaml} (94%) delete mode 100644 controllers/testdata/secrets/ray-ca-cert.yaml create mode 100644 controllers/utils/cert.go diff --git a/config/overlays/odh/kustomization.yaml b/config/overlays/odh/kustomization.yaml index a7ee7169..20826ef7 100644 --- a/config/overlays/odh/kustomization.yaml +++ b/config/overlays/odh/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ../../default - - ./ray_tls_resources.yaml + - ./ray_tls_script.yaml patches: - path: odh_model_controller_manager_patch.yaml diff --git a/config/overlays/odh/ray_tls_resources.yaml b/config/overlays/odh/ray_tls_resources.yaml deleted file mode 100644 index 02b6c261..00000000 --- a/config/overlays/odh/ray_tls_resources.yaml +++ /dev/null @@ -1,83 +0,0 @@ 
-apiVersion: v1 -kind: Secret -metadata: - name: ray-ca-cert - labels: - opendatahub.io/managed: 'true' -data: - # output from cat ca.crt | base64 - ca.crt: | - LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzB
hV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pXNmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - # output from cat ca.key | base64 - ca.key: | - LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUpRd0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQ1Mwd2dna3BBZ0VBQW9JQ0FRQy95SW1GbVpodUFqWS8KZjkrY3hBV3NqSnY0SE1ZU0J6L29FdkNqZEw2RjFvdnJ6Q2NGdzBIY0RnVGJSV1pHTGMyU0xLQXg3aHM3WU93MQplSzNyemVEU1BMU1RZT3FHMzVDbEtYZThjS09MQjdyTk1COCtYcVhaTHRQREZ1YkhWVFQwa05oNGxjVzQyVmlhClVHU0duOWdIU1pHWlQxcGs5R1NtVHJVQSs0OWlPSnQ3Z2NabkhQRXVZVytpV1dxREpYRkZNWGVOWmFaK3pwbTkKR2xlQUQzMjJQNkhQcVgzZ3JON3FaNm81V0kzaFFlTHhPY1dER0ZnRE0vaUE0Y1JPS2cza1lyQ3djV3UyUDZVMAphdXZYbWxMTXdmY3NRK3hnS3lOcFIvS3ovbE1ZeFdpTHZwQUZlWnArL1BjSHMvTFMrUmVBUG1VUGFYUFRFZWtLClozWFk4bVJNUDZXWGJ1OUZZKzJBT2RUcmlvOUt0YjdaZThEenFkRTdNcVQvR2Jwdkh4VWVKeVozRG9WeXpIc0oKV2pKUU1WV1VIVzlBTXJRT2pDTGJWbGlkUGY2US9YOHlSbkJiemY1L1lPbFlDZmV0MnAzWVN5OW0wUzdCQ1JaVQp6QTd0aDhwZ2VnUHFCWHFlM0hyS3ZTU0ExM1ZXb3hyS3RvZm5tVHVJS21KNEYzSHdHTW5pSVhQVitxb1RuZjFZCm1IK3NoWWZkNTBiVnkwRExnVU8ydVdOYkJFc0E4OEdHeWJTcC9ySktZZlJ5Z2RlMUJCRFVxVE9LMGVNS3N3ejgKS1NYMk1wTDVHOXNKZURRZ2FJM0lqbTZldEZud2d2a0t0WmVDZkZNMGZDYUdXbXdnZC9aVkthU1pEbytVS202dApOQ1lnTEZ3andXY1BQb0FicmFHd0ZDdUJTSTlIandJREFRQUJBb0lDQUFyYzkwaG9ud3VIWGI3ZmNtU0IxU3JZClZPWWt1WDl6aHQvRWxIb1E5cDNFSSswNWhWaFdCTmpMNjBvYXRuRlhtenk3emZtTWMyRTcyemlPam1OdmpvOGcKY1l4eDlMYmQycG5RWUlBWEJ0eDV5UUxJWUFaSUwySys3NjloRUlLYksvVzQxZG9wN05vekFMQm9MMW1FenlSZgpWS0hFU0ZDMHptS3hNOUpMYllYeWowMm9QbUhBY0NHdGJHdjFrZGZ4RkdjNldrZy80c0tnY05ld3NueUdTb0lI
Cm8zd21ZSnkvSjUxTDF5QlhPL2J2Y1hobHNMd3djamNCQ0FNUUU0aE42UjJKUUwrdDBEWGt2SjBQcnZzRE9wa0kKakdzTlEzMWVPcEpERmdwL21zNlFNWnpObHhwdXNGQTVnNUNkaUpRMHNkSGpOdUtqTXhyeUxKRk1HY0l0OExEQwpRVzF2akxLR0l1UWtraGwxOWU1S1N2SDdjUjJja0pDME5vTzhnekpudzd0dTRGaHJaK0xQeXF2R3VSYU55a2RmCi9BKzNEOUE2RW1PNWRldFU2RzJkK0l2TmprdG91Z05UalZIUklDbk9oL01zRmlFQXdycHltVVNISzhKTjVpSjIKUm1rNFljNWlXUjhOUWs0Wkh6aVFGSHJSWkh0TW9DcEkvR2ZGcnYyRVE0bFpOOG5tZHdDWDR4a3JObUJ2ZnlIdgpLWW0yMU5VWDc5U3lRbHd5VS9lNUR6eTA3Si9zcTdoVU8xN3hVcXNzTVpZTGZCSFF0VFU0VVAwbnBOZmtxUFM1CjdJRUtIVWwyRlZudXR0THoxc0ZVVHhJTS90aE9lczRtWElrOExiYzI0Yk45VWlteXplVEN1bE83a0hZSDhTVkEKZDJqZFBTZXhZSTdMeWFVNnFHRzVBb0lCQVFEbHlVQk5CaTRNekdxVnh5NjNCY1dyZC9rdVYrYTFLQ3MyYVhzagpLbVlMT0xrSkhUSjI0YU9EWkJBVGxEMllwclZEOUM1UThTeDdPQVdlQ3FqWHd1MndsOTlabXNIVkxiQTZMRUZ6CnBoYTNQVHhkaWFpMElwZVY2ZFpIQnQrdjVDVGsvSnpxSVpjc1J2REFnNTFHYzgxbERxTzFNbnVqbldBcGpSMmMKd05ZVXd6a3hicHVTc1ZHYzFBZ09tVHBHN1MrdWVVQ2FGU0NVaGkyVWVoblM5dkNrU3Y0QTZMRlpiaXhFeWp6aApycU9mN1d1TTVUWkFoTGM2RTJUQnVOeWJlWW9DblFMdHF3dnUxaFhDOGU4TGlQWFRlMVJ4U2x5dXA5RDhiWEZBCjVPVmFUZjAzcFFweURiOXNKeGhLN3FMbUgrSjlUeU5JamhTTUZQT2pKNEJFRTlPOUFvSUJBUURWcVhDMGdCVzUKYlNUWmUzc3l1QVltRi9hVDg1ZFh1NGFTMFBJR09MakE1M2h0RVdLUkJxd1JlU1prSFdtR05uNUIyOXVXTHg2UgpPZjFNOFJkY2NYSnlxMnp1TlBiWkpabllwS0x5N0FjeDBpc1RvMjdpUy9xRS85YndsNUo3QVU5UmZ2K2ZMK2RPCmxqUndRTGUvQ1dSVHVlTlNOSWpPUC96NWRra2J5Z1kvWHZHbmI0RUJheDY4K3J2a0NYbStGdFpXV3VoblM2Uy8KZHh3Ulo2VGRMd09RZTZQSzNzN3F4c2xWNmQ2dmwrSUpwa1VVZmRvWDNyWFlTeGx2cFlQYWJpWEpaVzdQWkZwRQpVQXc0VTFpSzVLMUt5d1ZjaHlhN2tQSlpRNUplS1pUT3lPL1d5ODZLak0vcUd3NUhDR2NOL2VMbDJKUUViUkwvClJiR0pGSmhUalpjN0FvSUJBQXlyNG0zYzcyRXBUSjloMG9PcFA5TksxR1RuMkFNWmFmaWdMSGd0K0Y2YURDb2kKZ0F2cU9YZ2ZabnVONnkrbDBjMGpoQUpXcWx0SkpaWW5oRlFSbmNYbE9oM1kyT09HbDNjOXhZWTVISHVTVnVmWgpsWUlKZms1NERLYnlEQmZJL3ZmWnJsV0M4TEV5WUVoZGVhak83ZjZxcGdCeC9qdHhqRUgrVkNtMndKZDRoSWpqClRwVHlUa3ZWclhRUW94UVNORlRzdnRGQVpRR0x2S3U1Wi84b092RDBhYmxuRzVDUThNUUNXd1VlK2tyeGJzTGcKU1BPWjNmakg1UUNCenppTHBUNnJwZU94VVFFa3NTS0U4T2V6NzhwdnZLSmF0VzIwTjJRVUxQQ2xMcmlpSUZxWApNVkpF
eTgrTkFGdnhlTzR6eCt1ZEY1Y0Nyc05pekdTczR2ZmVHQWtDZ2dFQkFMdFRnbWdPd0gxQlR3U0t1Ym4vCkZBMEVCNEV5R2FlbTExY1RjSTY1M21ucXgyL0F4VTFucnlibXRCMGttR2Mra2JYR1FDRE5rUnc4M25NK0VZQlEKU3NwMHQ5MmxmQ05vVHhsZFJ5eDZlZGhaYnNFYUVsYS96SllkQk9NTjBUU2RNbUMrV3ZuRGN5WTRsU014NnFmSQpZVGp6Q25ZQmIweDlWNXVUOUljenVnU0hocEdKTm03Njd3azdQODZ2N0JnWVI3V1FvS0FuOXZxVFFIMldCRHFVClJLakJiaHFvL0h0azdCS3lLRGFGa0gxclZMZWhtN3cvMitrVjl1Z25FcEpJN2tKRDkwSkh0c2liOGdyVU1CWWUKWmp6a0FRQmQwanl5MlhnZndVMWpZWDluTnJoNUdjM3BwVVNZa2d6L05mTlRmRUtPZnovZUxjQzM1dTdMcXIzZQpydzhDZ2dFQkFMT2tsTkJNRVBmM20yTXBjaVRjRmNKb08vZzBMUUpHaTJtWkN6S1g3eDJFS0N2N1ZvVWVtRkk0CjRkRFVmSlBJWlBFTUpkTHRSUy9qUDEyZWkxek9lWHIrVGlUTklpUUVoemRtL0RZWUdjd2hyb0xLNDZVTFJKY0YKYzdxZ2xNQ1Z1MW9DTmtDdTJvZ08renczRm9makJzK1pqcE1BS2kyOTZ2ZDk2YVlYNThYR0RKekdmdjhuZEF1dwpEUmU1ZE5oQU5iaHZqSlM1VXJwNnhoMVMycTNYOHorTlFGWW9CNDM1Q2NXNW50WWMzemIxYVdzY0NxMWJsUGJGCjc0QTFLTHJNNlpvU0ZlcUVWZzhvajhpWjlDaitiTTJXYm9BREIvRTROM0kyNmFDK1dDRWxtdTd3ZDdQaExQT2IKN3RrTXh2Zm10dDE5T2dYbTRKZm9SZWlkMTNYbHFoZz0KLS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQo= ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: ray-tls-scripts - labels: - opendatahub.io/managed: 'true' -data: - gencert_ray.sh: | - #!/bin/sh - ## Create tls.key - openssl genrsa -out /etc/ray/tls/tls.key 2048 - - ## Write CSR Config - cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl - - ## Generate tls.cert - openssl x509 -req \ - -in /etc/ray/tls/ca.csr \ - -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ - -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ - -days 36500 \ - -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/config/overlays/odh/ray_tls_script.yaml b/config/overlays/odh/ray_tls_script.yaml new file mode 100644 index 00000000..e439097b --- /dev/null +++ b/config/overlays/odh/ray_tls_script.yaml @@ -0,0 +1,69 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ray-tls-script + labels: + opendatahub.io/managed: 'true' +data: + gencert_ray.sh: | + #!/bin/sh + ## Create tls.key + openssl genrsa -out 
/etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /tmp/ca.srl + + ## Generate tls.cert + openssl x509 -req \ + -in /etc/ray/tls/ca.csr \ + -CA /etc/ca/tls/tls.crt -CAkey /etc/ca/tls/tls.key \ + -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ + -days 3650 \ + -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/config/runtimes/vllm-multinode-template.yaml b/config/runtimes/vllm-multinode-template.yaml index eb744173..ac269006 100644 --- a/config/runtimes/vllm-multinode-template.yaml +++ b/config/runtimes/vllm-multinode-template.yaml @@ -41,7 +41,8 @@ objects: - | # Generate self signed certificate if [[ $RAY_USE_TLS == "1" ]]; then - /etc/gen/tls/gencert_ray.sh + echo "Generating Self Signed Certificate for Ray nodes" + /etc/gen/tls/gencert_ray.sh > /dev/null 2>&1 fi ray start --head --disable-usage-stats --include-dashboard false @@ -64,7 +65,7 @@ objects: - name: RAY_TLS_SERVER_KEY value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: '/etc/ca/tls/ca.crt' + value: '/etc/ca/tls/tls.crt' - name: RAY_PORT value: '6379' - name: RAY_ADDRESS @@ -209,7 +210,7 @@ objects: # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: gen-tls-script configMap: - name: ray-tls-scripts + name: ray-tls-script defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: @@ -226,7 +227,8 @@ objects: - | # Generate self signed certificate if [[ $RAY_USE_TLS == "1" ]]; then - /etc/gen/tls/gencert_ray.sh + echo "Generating Self Signed Certificate for Ray nodes" + /etc/gen/tls/gencert_ray.sh > /dev/null 2>&1 fi SECONDS=0 @@ -260,7 +262,7 @@ objects: - name: RAY_TLS_SERVER_KEY value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: '/etc/ca/tls/ca.crt' + value: '/etc/ca/tls/tls.crt' - name: POD_NAME valueFrom: fieldRef: @@ -346,7 +348,7 @@ objects: # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: 
gen-tls-script configMap: - name: ray-tls-scripts + name: ray-tls-script defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: diff --git a/controllers/constants/constants.go b/controllers/constants/constants.go index 46c2c324..c454dbe1 100644 --- a/controllers/constants/constants.go +++ b/controllers/constants/constants.go @@ -82,6 +82,6 @@ const ( // Ray const ( - RayCATlsSecretName = "ray-ca-cert" - RayTlsScriptConfigMapName = "ray-tls-scripts" + RayCASecretName = "ray-ca-cert" + RayTlsScriptConfigMapName = "ray-tls-script" ) diff --git a/controllers/kserve_ray_tls_controller.go b/controllers/kserve_ray_tls_controller.go index f8b4bade..a06b5226 100644 --- a/controllers/kserve_ray_tls_controller.go +++ b/controllers/kserve_ray_tls_controller.go @@ -40,8 +40,8 @@ func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRay // - On deletion: The ray-tls-script ConfigMap and ray-ca-cert Secret are deleted only when multinode ServingRuntimes are deleted from the target namespace. // ConfigMap: -// - When the original ConfigMap is updated in the control namespace: The ray-tls-scripts ConfigMap is deleted and recreated in the namespace where multinode ServingRuntimes exist. -// - When the ConfigMap is deleted in the target namespace: The ray-tls-scripts ConfigMap will be recreated. +// - When the original ConfigMap is updated in the control namespace: The ray-tls-script ConfigMap is deleted and recreated in the namespace where multinode ServingRuntimes exist. +// - When the ConfigMap is deleted in the target namespace: The ray-tls-script ConfigMap will be recreated. // Secret: // - When the original Secret is updated in the control namespace: The ray-ca-cert Secret is deleted and recreated in the namespace where multinode ServingRuntimes exist. 
@@ -49,6 +49,19 @@ func NewKServeRayTlsReconciler(client client.Client, log logr.Logger) *KServeRay func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := r.log controllerNs := os.Getenv("POD_NAMESPACE") + + srcSecret := &corev1.Secret{} + err := r.client.Get(ctx, types.NamespacedName{Name: constants.RayCASecretName, Namespace: controllerNs}, srcSecret) + if err != nil { + if apierrs.IsNotFound(err) { + createErr := utils.CreateSelfSignedCertificate(ctx, r.client, constants.RayCASecretName, "Ray Self Signed Certs", controllerNs) + if createErr != nil { + return ctrl.Result{}, createErr + } + } else { + return ctrl.Result{}, err + } + } var servingRuntimeList kservev1alpha1.ServingRuntimeList if err := r.client.List(ctx, &servingRuntimeList); err != nil { return ctrl.Result{}, err @@ -69,9 +82,9 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.reconcileRayTlsScriptsConfigMap(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExistInNs); err != nil { return ctrl.Result{}, err } - } else if req.Name == constants.RayCATlsSecretName { + } else if req.Name == constants.RayCASecretName { if req.Namespace == controllerNs { - log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCATlsSecretName, "namespace", req.Namespace) + log.Info("Original Ray CA Cert Secret is updated", "name", constants.RayCASecretName, "namespace", req.Namespace) for _, sr := range servingRuntimeList.Items { if isMultiNodeServingRuntime(sr) { if err := r.cleanupRayResourcesByKind(ctx, log, sr.Namespace, "Secret"); err != nil { @@ -80,7 +93,7 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request } } } - if err := r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, noMultiNodeSrExistInNs); err != nil { + if err := r.reconcileRayCACertSecret(ctx, log, srcSecret, controllerNs, req.Namespace, noMultiNodeSrExistInNs); err != nil { return 
ctrl.Result{}, err } } else { @@ -88,7 +101,7 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request if err != nil { return ctrl.Result{}, err } - err = r.reconcileRayCACertSecret(ctx, log, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) + err = r.reconcileRayCACertSecret(ctx, log, srcSecret, controllerNs, req.Namespace, !existMultiNodeServingRuntimeInNs(req.Namespace, servingRuntimeList)) if err != nil { return ctrl.Result{}, err } @@ -97,10 +110,10 @@ func (r *KServeRayTlsReconciler) Reconcile(ctx context.Context, req ctrl.Request } func checkRayTLSResource(objectName string) bool { - return objectName == constants.RayCATlsSecretName || objectName == constants.RayTlsScriptConfigMapName + return objectName == constants.RayCASecretName || objectName == constants.RayTlsScriptConfigMapName } -// reconcileRayTLSResource filters out ConfigMaps and Secrets that do not match the predefined constants: RayCATlsSecretName or RayTlsScriptConfigMapName. +// reconcileRayTLSResource filters out ConfigMaps and Secrets that do not match the predefined constants: RayCASecretName or RayTlsScriptConfigMapName. // This ensures that only the relevant ConfigMaps and Secrets for Ray TLS configuration are captured and processed for the servingRuntime. 
func reconcileRayTLSResource() predicate.Predicate { return predicate.Funcs{ @@ -140,8 +153,8 @@ func (r *KServeRayTlsReconciler) SetupWithManager(mgr ctrl.Manager) error { return builder.Complete(r) } -// reconcileRayTlsScriptsConfigMap watch ray-tls-scripts configmap in the cluster -// and it will create/update/delete ray-tls-scripts configmap in the namespace where multinode ServingRuntime created +// reconcileRayTlsScriptsConfigMap watch ray-tls-script configmap in the cluster +// and it will create/update/delete ray-tls-script configmap in the namespace where multinode ServingRuntime created func (r *KServeRayTlsReconciler) reconcileRayTlsScriptsConfigMap(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultiNodeSrExistInNs bool) error { // When original configmap is updated, it does not need to reconcile if ctrlNs == targetNs { @@ -245,18 +258,13 @@ func shouldDeleteRayConfigMap(existingConfigMap *corev1.ConfigMap, noMultiNodeSr // reconcileRayCACertSecret watch ray-ca-cert secret in the cluster // and it will create/update/delete ray-ca-cert secret in the namespace where multinode ServingRuntime created -func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, ctrlNs string, targetNs string, noMultiNodeSrExistInNs bool) error { +func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, log logr.Logger, srcSecret *corev1.Secret, ctrlNs string, targetNs string, noMultiNodeSrExistInNs bool) error { // When original secret is updated, it does not need to reconcile if ctrlNs == targetNs { return nil } - log.Info("Reconciling Ray CA Cert Secret", "name", constants.RayCATlsSecretName, "namespace", targetNs) - srcSecret := &corev1.Secret{} - err := r.client.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: ctrlNs}, srcSecret) - if err != nil { - return err - } + log.Info("Reconciling Ray CA Cert Secret", "name", constants.RayCASecretName, "namespace", 
targetNs) // Create Desired resource desiredSecretResource, err := r.createDesiredSecretResource(targetNs, srcSecret) if err != nil { @@ -265,7 +273,7 @@ func (r *KServeRayTlsReconciler) reconcileRayCACertSecret(ctx context.Context, l // Get Existing resource existingSecretResource := &corev1.Secret{} - err = r.client.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: targetNs}, existingSecretResource) + err = r.client.Get(ctx, types.NamespacedName{Name: constants.RayCASecretName, Namespace: targetNs}, existingSecretResource) if err != nil { if apierrs.IsNotFound(err) { existingSecretResource = nil @@ -368,17 +376,17 @@ func (r *KServeRayTlsReconciler) cleanupRayResourcesByKind(ctx context.Context, if kind == "Secret" { secret := &corev1.Secret{} err := r.client.Get(ctx, types.NamespacedName{ - Name: constants.RayCATlsSecretName, + Name: constants.RayCASecretName, Namespace: targetNs, }, secret) if err != nil { if apierrs.IsNotFound(err) { - log.Info("Secret not found, skipping", "name", constants.RayCATlsSecretName, "namespace", targetNs) + log.Info("Secret not found, skipping", "name", constants.RayCASecretName, "namespace", targetNs) } return err } - log.Info("Deleting Secret", "name", constants.RayCATlsSecretName, "namespace", targetNs) + log.Info("Deleting Secret", "name", constants.RayCASecretName, "namespace", targetNs) err = r.client.Delete(ctx, secret) if err != nil { return err diff --git a/controllers/kserve_ray_tls_controller_test.go b/controllers/kserve_ray_tls_controller_test.go index 78a44f58..14950254 100644 --- a/controllers/kserve_ray_tls_controller_test.go +++ b/controllers/kserve_ray_tls_controller_test.go @@ -17,9 +17,10 @@ package controllers import ( "context" - "k8s.io/apimachinery/pkg/types" "time" + "k8s.io/apimachinery/pkg/types" + kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" @@ -29,9 +30,8 @@ import ( const ( multinodeServingRuntimePath = "./testdata/deploy/vllm-multinode-servingruntime.yaml" - rayTlsScriptsPath = "./testdata/configmaps/ray-tls-scripts.yaml" - rayTlsScriptsUpdatedPath = "./testdata/configmaps/ray-tls-scripts-updated.yaml" - rayCaCertPath = "./testdata/secrets/ray-ca-cert.yaml" + rayTlsScriptsPath = "./testdata/configmaps/ray-tls-script.yaml" + rayTlsScriptsUpdatedPath = "./testdata/configmaps/ray-tls-script-updated.yaml" rayCaCertUpdatedPath = "./testdata/secrets/ray-ca-cert-updated.yaml" ) @@ -39,23 +39,17 @@ var _ = Describe("KServe Ray TLS controller", func() { ctx := context.Background() Context("when a non-multinode ServingRuntime created", func() { - It("should not create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the testNs", func() { + It("should not create a 'ray-ca-cert' Secret and 'ray-tls-script' ConfigMap in the testNs", func() { testNamespace := Namespaces.Create(cli) testNs := testNamespace.Name - // Create ray tls resources in the ctrl namespace + // Create ray tls script in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) - rayCaCertSecret := &corev1.Secret{} - err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) - Expect(err).NotTo(HaveOccurred()) - rayCaCertSecret.SetNamespace(WorkingNamespace) - Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - By("creating non-multinode ServingRuntime") nonMultinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(ServingRuntimePath1, nonMultinodeServingRuntime) @@ -63,35 +57,29 @@ var _ = Describe("KServe Ray TLS controller", func() { nonMultinodeServingRuntime.SetNamespace(testNs) Expect(cli.Create(ctx, 
nonMultinodeServingRuntime)).Should(Succeed()) - // Check if all ray tls resources are NOT created in the testNs + // Check if all ray tls script are NOT created in the testNs configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(configmap).To(BeNil()) - secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 3, 1*time.Second) + secret, err := waitForSecret(cli, testNs, constants.RayCASecretName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(secret).To(BeNil()) }) }) Context("when a multinode ServingRuntime created", func() { - It("should create a 'ray-ca-cert' Secret and 'ray-tls-scripts' ConfigMap in the testNs where the SR exist", func() { + It("should create a 'ray-ca-cert' Secret and 'ray-tls-script' ConfigMap in the testNs where the SR exist", func() { testNamespace := Namespaces.Create(cli) testNs := testNamespace.Name - // Create ray tls resources in the ctrl namespace + // Create ray tls script in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) - rayCaCertSecret := &corev1.Secret{} - err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) - Expect(err).NotTo(HaveOccurred()) - rayCaCertSecret.SetNamespace(WorkingNamespace) - Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - By("creating multinode ServingRuntime") multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) @@ -99,10 +87,10 @@ var _ = Describe("KServe Ray TLS controller", func() { multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray tls resources are 
created in the testNs + // Check if all ray tls script are created in the testNs _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, testNs, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) }) @@ -115,19 +103,13 @@ var _ = Describe("KServe Ray TLS controller", func() { testNamespace := Namespaces.Create(cli) testNs = testNamespace.Name - // Create ray tls resources in the ctrl namespace + // Create ray tls script in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) - rayCaCertSecret := &corev1.Secret{} - err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) - Expect(err).NotTo(HaveOccurred()) - rayCaCertSecret.SetNamespace(WorkingNamespace) - Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - // Create a multinode ServingRuntime multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) @@ -135,23 +117,23 @@ var _ = Describe("KServe Ray TLS controller", func() { multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Create(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray tls resources are created in the testNs + // Check if all ray tls script are created in the testNs _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, testNs, constants.RayCASecretName, 30, 1*time.Second) 
Expect(err).NotTo(HaveOccurred()) }) It("should recreate a 'ray-ca-cert' Secret when it is removed manually", func() { secret := &corev1.Secret{} - err := cli.Get(ctx, types.NamespacedName{Name: constants.RayCATlsSecretName, Namespace: testNs}, secret) + err := cli.Get(ctx, types.NamespacedName{Name: constants.RayCASecretName, Namespace: testNs}, secret) Expect(err).NotTo(HaveOccurred()) By("deleting a 'ray-ca-cert' Secret in the testNs") Expect(cli.Delete(ctx, secret)).To(Succeed()) // Check if 'ray-ca-cert' Secret is recreated - _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, testNs, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) It("should rollback 'ray-ca-cert' Secret in the target ns when it is changed", func() { @@ -163,35 +145,35 @@ var _ = Describe("KServe Ray TLS controller", func() { Expect(cli.Update(ctx, rayCACertUpdatedSecret)).Should(Succeed()) // Check if 'ray-ca-cert' Secret is rollback - originalRayCaCertSecret, err := waitForSecret(cli, WorkingNamespace, constants.RayCATlsSecretName, 30, 1*time.Second) + originalRayCaCertSecret, err := waitForSecret(cli, WorkingNamespace, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) Eventually(func() bool { - updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 1, 1*time.Second) + updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCASecretName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) return compareSecrets(originalRayCaCertSecret, updatedSecretFromTestNs) }, timeout, interval).Should(BeTrue()) }) - It("should create a 'ray-tls-scripts' ConfigMap when it is removed manually", func() { + It("should create a 'ray-tls-script' ConfigMap when it is removed manually", func() { configMap := &corev1.ConfigMap{} err := cli.Get(ctx, types.NamespacedName{Name: constants.RayTlsScriptConfigMapName, Namespace: testNs}, 
configMap) Expect(err).NotTo(HaveOccurred()) - By("deleting a 'ray-tls-scripts' configMap in the testNs") + By("deleting a 'ray-tls-script' configMap in the testNs") Expect(cli.Delete(ctx, configMap)).To(Succeed()) - // Check if 'ray-tls-scripts' ConfigMap is recreated + // Check if 'ray-tls-script' ConfigMap is recreated _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("should rollback 'ray-tls-scripts' ConfigMap in the target ns when it is changed", func() { - By("updating existing 'ray-tls-scripts' ConfigMap in the testNs") + It("should rollback 'ray-tls-script' ConfigMap in the target ns when it is changed", func() { + By("updating existing 'ray-tls-script' ConfigMap in the testNs") rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsUpdatedConfigMap.SetNamespace(testNs) Expect(cli.Update(ctx, rayTlsScriptsUpdatedConfigMap)).Should(Succeed()) - // Check if 'ray-tls-scripts' ConfigMap is rollback + // Check if 'ray-tls-script' ConfigMap is rollback originalRayTlsScriptsConfigMap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) @@ -201,8 +183,8 @@ var _ = Describe("KServe Ray TLS controller", func() { return compareConfigMap(originalRayTlsScriptsConfigMap, updatedConfigMapFromTestNs) }, timeout, interval).Should(BeTrue()) }) - It("should 'ray-tls-scripts' ConfigMap in the testNs when original one in the ctrlNs updated", func() { - By("updating original 'ray-tls-scripts' ConfigMap in the ctrlNs") + It("should 'ray-tls-script' ConfigMap in the testNs when original one in the ctrlNs updated", func() { + By("updating original 'ray-tls-script' ConfigMap in the ctrlNs") rayTlsScriptsUpdatedConfigMap := &corev1.ConfigMap{} err := 
convertToStructuredResource(rayTlsScriptsUpdatedPath, rayTlsScriptsUpdatedConfigMap) Expect(err).NotTo(HaveOccurred()) @@ -212,7 +194,7 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForConfigMap(cli, WorkingNamespace, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - // Check if 'ray-tls-scripts' ConfigMap in the testNs is updated. + // Check if 'ray-tls-script' ConfigMap in the testNs is updated. Eventually(func() bool { updatedConfigMapFromTestNs, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) @@ -220,19 +202,19 @@ var _ = Describe("KServe Ray TLS controller", func() { }, timeout, interval).Should(BeTrue()) }) It("should update a 'ray-ca-cert' Secret in the testNs when original one in the ctrlNs updated", func() { - By("updating original 'ray-ca-cert Secret in the ctrlNs") + By("updating original 'ray-ca-cert' Secret in the ctrlNs") rayCaCertUpdatedSecret := &corev1.Secret{} err := convertToStructuredResource(rayCaCertUpdatedPath, rayCaCertUpdatedSecret) Expect(err).NotTo(HaveOccurred()) rayCaCertUpdatedSecret.SetNamespace(WorkingNamespace) Expect(cli.Update(ctx, rayCaCertUpdatedSecret)).Should(Succeed()) - _, err = waitForSecret(cli, WorkingNamespace, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, WorkingNamespace, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) // Check if 'ray-ca-cert' Secret in the testNs is updated. 
Eventually(func() bool { - updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 1, 1*time.Second) + updatedSecretFromTestNs, err := waitForSecret(cli, testNs, constants.RayCASecretName, 1, 1*time.Second) Expect(err).NotTo(HaveOccurred()) return compareSecrets(rayCaCertUpdatedSecret, updatedSecretFromTestNs) }, timeout, interval).Should(BeTrue()) @@ -244,19 +226,13 @@ var _ = Describe("KServe Ray TLS controller", func() { testNamespace := Namespaces.Create(cli) testNs = testNamespace.Name - // Create ray tls resources in the ctrl namespace + // Create ray tls script in the ctrl namespace rayTlsScriptsConfigMap := &corev1.ConfigMap{} err := convertToStructuredResource(rayTlsScriptsPath, rayTlsScriptsConfigMap) Expect(err).NotTo(HaveOccurred()) rayTlsScriptsConfigMap.SetNamespace(WorkingNamespace) Expect(cli.Create(ctx, rayTlsScriptsConfigMap)).Should(Succeed()) - rayCaCertSecret := &corev1.Secret{} - err = convertToStructuredResource(rayCaCertPath, rayCaCertSecret) - Expect(err).NotTo(HaveOccurred()) - rayCaCertSecret.SetNamespace(WorkingNamespace) - Expect(cli.Create(ctx, rayCaCertSecret)).Should(Succeed()) - // Create a multinode ServingRuntime in the testNs multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err = convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) @@ -266,10 +242,10 @@ var _ = Describe("KServe Ray TLS controller", func() { _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, testNs, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray tls resources should not be removed if there is a multinode ServingRuntime in the testNs", func() { + It("ray tls script should not be removed if there is a multinode ServingRuntime in the testNs", func() { 
By("creating another multinode ServingRuntime in the testNs") // Create another multinode ServingRuntime multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} @@ -282,13 +258,13 @@ var _ = Describe("KServe Ray TLS controller", func() { By("deleting one multinode ServingRuntime in the testNs") Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray tls resources are NOT removed + // Check if all ray tls script are NOT removed _, err = waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) - _, err = waitForSecret(cli, testNs, constants.RayCATlsSecretName, 30, 1*time.Second) + _, err = waitForSecret(cli, testNs, constants.RayCASecretName, 30, 1*time.Second) Expect(err).NotTo(HaveOccurred()) }) - It("ray tls resources should be removed if there is no multinode ServingRuntime in the testNs", func() { + It("ray tls script should be removed if there is no multinode ServingRuntime in the testNs", func() { By("deleting a multinode ServingRuntime") multinodeServingRuntime := &kservev1alpha1.ServingRuntime{} err := convertToStructuredResource(multinodeServingRuntimePath, multinodeServingRuntime) @@ -296,12 +272,12 @@ var _ = Describe("KServe Ray TLS controller", func() { multinodeServingRuntime.SetNamespace(testNs) Expect(cli.Delete(ctx, multinodeServingRuntime)).Should(Succeed()) - // Check if all ray tls resources are removed + // Check if all ray tls script are removed configmap, err := waitForConfigMap(cli, testNs, constants.RayTlsScriptConfigMapName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(configmap).To(BeNil()) - secret, err := waitForSecret(cli, testNs, constants.RayCATlsSecretName, 3, 1*time.Second) + secret, err := waitForSecret(cli, testNs, constants.RayCASecretName, 3, 1*time.Second) Expect(err).To(HaveOccurred()) Expect(secret).To(BeNil()) }) diff --git a/controllers/testdata/configmaps/ray-tls-scripts-updated.yaml 
b/controllers/testdata/configmaps/ray-tls-script-updated.yaml similarity index 94% rename from controllers/testdata/configmaps/ray-tls-scripts-updated.yaml rename to controllers/testdata/configmaps/ray-tls-script-updated.yaml index 77491ceb..a6271639 100644 --- a/controllers/testdata/configmaps/ray-tls-scripts-updated.yaml +++ b/controllers/testdata/configmaps/ray-tls-script-updated.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: ray-tls-scripts + name: ray-tls-script labels: opendatahub.io/managed: 'true' data: @@ -64,7 +64,7 @@ data: ## Generate tls.cert openssl x509 -req \ -in /etc/ray/tls/ca.csr \ - -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CA /etc/ca/tls/tls.crt -CAkey /etc/ca/tls/tls.key \ -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ -days 36500 \ -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/controllers/testdata/configmaps/ray-tls-scripts.yaml b/controllers/testdata/configmaps/ray-tls-script.yaml similarity index 94% rename from controllers/testdata/configmaps/ray-tls-scripts.yaml rename to controllers/testdata/configmaps/ray-tls-script.yaml index df4c00ce..905d0ead 100644 --- a/controllers/testdata/configmaps/ray-tls-scripts.yaml +++ b/controllers/testdata/configmaps/ray-tls-script.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: ray-tls-scripts + name: ray-tls-script labels: opendatahub.io/managed: 'true' data: @@ -63,7 +63,7 @@ data: ## Generate tls.cert openssl x509 -req \ -in /etc/ray/tls/ca.csr \ - -CA /etc/ca/tls/ca.crt -CAkey /etc/ca/tls/ca.key \ + -CA /etc/ca/tls/tls.crt -CAkey /etc/ca/tls/tls.key \ -CAserial /tmp/ca.srl -out /etc/ray/tls/tls.crt \ -days 36500 \ -sha256 -extfile /etc/ray/tls/cert.conf diff --git a/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml b/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml index acfeb4fc..cc95baab 100644 --- a/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml +++ 
b/controllers/testdata/deploy/vllm-multinode-servingruntime.yaml @@ -41,7 +41,7 @@ spec: - name: RAY_TLS_SERVER_KEY value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: '/etc/ca/tls/ca.crt' + value: '/etc/ca/tls/tls.crt' - name: RAY_PORT value: '6379' - name: RAY_ADDRESS @@ -179,7 +179,7 @@ spec: # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: gen-tls-script configMap: - name: ray-tls-scripts + name: ray-tls-script defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: @@ -227,7 +227,7 @@ spec: - name: RAY_TLS_SERVER_KEY value: '/etc/ray/tls/tls.key' - name: RAY_TLS_CA_CERT - value: '/etc/ca/tls/ca.crt' + value: '/etc/ca/tls/tls.crt' - name: POD_NAME valueFrom: fieldRef: @@ -312,7 +312,7 @@ spec: # The gencert_ray.sh can be prebaked into the docker container so the configMap is optional - name: gen-tls-script configMap: - name: ray-tls-scripts + name: ray-tls-script defaultMode: 0777 # An array of keys from the ConfigMap to create as files items: diff --git a/controllers/testdata/secrets/ray-ca-cert-updated.yaml b/controllers/testdata/secrets/ray-ca-cert-updated.yaml index b590c941..ef71b289 100644 --- a/controllers/testdata/secrets/ray-ca-cert-updated.yaml +++ b/controllers/testdata/secrets/ray-ca-cert-updated.yaml @@ -1,13 +1,14 @@ apiVersion: v1 kind: Secret +type: kubernetes.io/tls metadata: name: ray-ca-cert labels: opendatahub.io/managed: 'true' data: - # output from cat ca.crt | base64 - ca.crt: | + # output from cat tls.crt | base64 + tls.crt: | 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pX
NmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT1URVNUCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K - # output from cat ca.key | base64 - ca.key: | + # output from cat tls.key | base64 + tls.key: | LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBR
EFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pXNmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT1URVNUCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K diff --git a/controllers/testdata/secrets/ray-ca-cert.yaml b/controllers/testdata/secrets/ray-ca-cert.yaml deleted file mode 100644 index 77335ebf..00000000 --- a/controllers/testdata/secrets/ray-ca-cert.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: ray-ca-cert - labels: - opendatahub.io/managed: 'true' -data: - # output from cat ca.crt | base64 - ca.crt: | - 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZIekNDQXdlZ0F3SUJBZ0lVSlAwL1FCY0xTMFFFV1ZiRDE4NndyUnZjLzdFd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RjMlZzWmkxemFXZHVaV1F0WTJFdFkyVnlkREFnRncweU5ERXhNak13TXpReApOVEphR0E4eU1USTBNVEF6TURBek5ERTFNbG93SGpFY01Cb0dBMVVFQXd3VGMyVnNaaTF6YVdkdVpXUXRZMkV0ClkyVnlkRENDQWlJd0RRWUpLb1pJaHZjTkFRRUJCUUFEZ2dJUEFEQ0NBZ29DZ2dJQkFML0lpWVdabUc0Q05qOS8KMzV6RUJheU1tL2djeGhJSFArZ1M4S04wdm9YV2krdk1Kd1hEUWR3T0JOdEZaa1l0elpJc29ESHVHenRnN0RWNApyZXZONE5JOHRKTmc2b2Jma0tVcGQ3eHdvNHNIdXMwd0h6NWVwZGt1MDhNVzVzZFZOUFNRMkhpVnhialpXSnBRClpJYWYyQWRKa1psUFdtVDBaS1pPdFFEN2oySTRtM3VCeG1jYzhTNWhiNkpaYW9NbGNVVXhkNDFscG43T21iMGEKVjRBUGZiWS9vYytwZmVDczN1cG5xamxZamVGQjR2RTV4WU1ZV0FNeitJRGh4RTRxRGVSaXNMQnhhN1kvcFRScQo2OWVhVXN6Qjl5eEQ3R0FySTJsSDhyUCtVeGpGYUl1K2tBVjVtbjc4OXdlejh0TDVGNEErWlE5cGM5TVI2UXBuCmRkanlaRXcvcFpkdTcwVmo3WUE1MU91S2owcTF2dGw3d1BPcDBUc3lwUDhadW04ZkZSNG5KbmNPaFhMTWV3bGEKTWxBeFZaUWRiMEF5dEE2TUl0dFdXSjA5L3BEOWZ6SkdjRnZOL245ZzZWZ0o5NjNhbmRoTEwyYlJMc0VKRmxUTQpEdTJIeW1CNkErb0ZlcDdjZXNxOUpJRFhkVmFqR3NxMmgrZVpPNGdxWW5nWGNmQVl5ZUloYzlYNnFoT2QvVmlZCmY2eUZoOTNuUnRYTFFNdUJRN2E1WTFzRVN3RHp3WWJKdEtuK3NrcGg5SEtCMTdVRUVOU3BNNHJSNHdxekRQd3AKSmZZeWt2a2Iyd2w0TkNCb2pjaU9icDYwV2ZDQytRcTFsNEo4VXpSOEpvWmFiQ0IzOWxVcHBKa09qNVFxYnEwMApKaUFzWENQQlp3OCtnQnV0b2JBVUs0RklqMGVQQWdNQkFBR2pVekJSTUIwR0ExVWREZ1FXQkJTcmNlRWNhMjNxCjBUQm14VmtZTWtMQTNSeHpKekFmQmdOVkhTTUVHREFXZ0JTcmNlRWNhMjNxMFRCbXhWa1lNa0xBM1J4ekp6QVAKQmdOVkhSTUJBZjhFQlRBREFRSC9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUNBUUJseW5IM2doVG9ObDQrTlQ1MgpNTTA1V1A3UCszVXJkQ0tGNEJCa0VzN0VueldSUjZ4bkVoVUY5VWhGZ0ZhTFBiQ1pacnlCS2krT1hrUHUva3JCCk12aE1LVGl0WnNWbVBzRktEWDYyVG9zMEJZV0VzanZ0VDM4WFhSZXA3T3BWR0lPQi85V09YVGl3VkpaT2tSZ2MKbHd3U2U1dnBQQXRpMzhUZ3BhM0FVSk5haG00bDhHNWF5WktRQWFnUDg1NHBFTjhPOW54Nk9odytWN1hzSGlNdQpwUmpvc0VTN0JJY1lXVGJxR05yNFR3eXo1cVUwOE9LOEUySFNFSnE5THA4YzZ6UTZnZzBhV1dLYWJyTUpNeSt2CkpIbjM5TEI0U3dONzJjVXJkRWYvQVVrWktNYTRKVFRjMTJnaGpKN1JvUENYUFdPOUJGZ09aZEdoUlpBYkZRMXgKcnB6b3BLZllkT0hWZ0tncG9MOVJIRm40TzZRaTBjbnBML0NZZEFGd0pX
NmZYcGEyekhobEJqWXlWdHk5T1Y4TQppV01IVUNXZnl4anVaSno5NFFWZGxLRGVrY2YzUFJzU0RBRGZ4TXlBYVdJQ1NnYXNHTVNPSnRoRTlGM0JhaXNvCnNYM1NLYzRFSEc4Sk1VM1hoeWYwbkhDY2hQdWVRblU5akFBVVBDMHRrVlZtOWhmMXVkdjlOTUk4bktncWRFMkMKK2ExNnR3RVBpZzhkS1pkaFRMOFdXMGwxS3FRcCs4SnBGdTdNWUM1SGNaa0F0NEE3QXlXOHRsYmlQQ1B4RVYwZwpsYkc0eXFyV2lIOG5rWE9tNFBKb0FEMDhzNjA5Y3lFY2Iyblgvck92KzBSdFdOOWFqZGNxYlM1Z0JJaEhRSVUwCko1N0cyTVFYZ0hHbU9QL1ZZTi8vMTlsaXd3PT0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - # output from cat ca.key | base64 - ca.key: | - LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUpRd0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQ1Mwd2dna3BBZ0VBQW9JQ0FRQy95SW1GbVpodUFqWS8KZjkrY3hBV3NqSnY0SE1ZU0J6L29FdkNqZEw2RjFvdnJ6Q2NGdzBIY0RnVGJSV1pHTGMyU0xLQXg3aHM3WU93MQplSzNyemVEU1BMU1RZT3FHMzVDbEtYZThjS09MQjdyTk1COCtYcVhaTHRQREZ1YkhWVFQwa05oNGxjVzQyVmlhClVHU0duOWdIU1pHWlQxcGs5R1NtVHJVQSs0OWlPSnQ3Z2NabkhQRXVZVytpV1dxREpYRkZNWGVOWmFaK3pwbTkKR2xlQUQzMjJQNkhQcVgzZ3JON3FaNm81V0kzaFFlTHhPY1dER0ZnRE0vaUE0Y1JPS2cza1lyQ3djV3UyUDZVMAphdXZYbWxMTXdmY3NRK3hnS3lOcFIvS3ovbE1ZeFdpTHZwQUZlWnArL1BjSHMvTFMrUmVBUG1VUGFYUFRFZWtLClozWFk4bVJNUDZXWGJ1OUZZKzJBT2RUcmlvOUt0YjdaZThEenFkRTdNcVQvR2Jwdkh4VWVKeVozRG9WeXpIc0oKV2pKUU1WV1VIVzlBTXJRT2pDTGJWbGlkUGY2US9YOHlSbkJiemY1L1lPbFlDZmV0MnAzWVN5OW0wUzdCQ1JaVQp6QTd0aDhwZ2VnUHFCWHFlM0hyS3ZTU0ExM1ZXb3hyS3RvZm5tVHVJS21KNEYzSHdHTW5pSVhQVitxb1RuZjFZCm1IK3NoWWZkNTBiVnkwRExnVU8ydVdOYkJFc0E4OEdHeWJTcC9ySktZZlJ5Z2RlMUJCRFVxVE9LMGVNS3N3ejgKS1NYMk1wTDVHOXNKZURRZ2FJM0lqbTZldEZud2d2a0t0WmVDZkZNMGZDYUdXbXdnZC9aVkthU1pEbytVS202dApOQ1lnTEZ3andXY1BQb0FicmFHd0ZDdUJTSTlIandJREFRQUJBb0lDQUFyYzkwaG9ud3VIWGI3ZmNtU0IxU3JZClZPWWt1WDl6aHQvRWxIb1E5cDNFSSswNWhWaFdCTmpMNjBvYXRuRlhtenk3emZtTWMyRTcyemlPam1OdmpvOGcKY1l4eDlMYmQycG5RWUlBWEJ0eDV5UUxJWUFaSUwySys3NjloRUlLYksvVzQxZG9wN05vekFMQm9MMW1FenlSZgpWS0hFU0ZDMHptS3hNOUpMYllYeWowMm9QbUhBY0NHdGJHdjFrZGZ4RkdjNldrZy80c0tnY05ld3NueUdTb0lICm8zd21ZSnkvSjUxTDF5QlhPL2J2Y1hobHNMd3djamNCQ0FNUUU0aE42UjJKUUwrdDBEWGt2SjBQcnZzRE9wa0kKakdzTlEzMWVPcEpERmdwL21zNlFNWnpObHhwdXNGQTVnNUNkaUpRMHNkSGpOdUtqTXhyeUxKR
k1HY0l0OExEQwpRVzF2akxLR0l1UWtraGwxOWU1S1N2SDdjUjJja0pDME5vTzhnekpudzd0dTRGaHJaK0xQeXF2R3VSYU55a2RmCi9BKzNEOUE2RW1PNWRldFU2RzJkK0l2TmprdG91Z05UalZIUklDbk9oL01zRmlFQXdycHltVVNISzhKTjVpSjIKUm1rNFljNWlXUjhOUWs0Wkh6aVFGSHJSWkh0TW9DcEkvR2ZGcnYyRVE0bFpOOG5tZHdDWDR4a3JObUJ2ZnlIdgpLWW0yMU5VWDc5U3lRbHd5VS9lNUR6eTA3Si9zcTdoVU8xN3hVcXNzTVpZTGZCSFF0VFU0VVAwbnBOZmtxUFM1CjdJRUtIVWwyRlZudXR0THoxc0ZVVHhJTS90aE9lczRtWElrOExiYzI0Yk45VWlteXplVEN1bE83a0hZSDhTVkEKZDJqZFBTZXhZSTdMeWFVNnFHRzVBb0lCQVFEbHlVQk5CaTRNekdxVnh5NjNCY1dyZC9rdVYrYTFLQ3MyYVhzagpLbVlMT0xrSkhUSjI0YU9EWkJBVGxEMllwclZEOUM1UThTeDdPQVdlQ3FqWHd1MndsOTlabXNIVkxiQTZMRUZ6CnBoYTNQVHhkaWFpMElwZVY2ZFpIQnQrdjVDVGsvSnpxSVpjc1J2REFnNTFHYzgxbERxTzFNbnVqbldBcGpSMmMKd05ZVXd6a3hicHVTc1ZHYzFBZ09tVHBHN1MrdWVVQ2FGU0NVaGkyVWVoblM5dkNrU3Y0QTZMRlpiaXhFeWp6aApycU9mN1d1TTVUWkFoTGM2RTJUQnVOeWJlWW9DblFMdHF3dnUxaFhDOGU4TGlQWFRlMVJ4U2x5dXA5RDhiWEZBCjVPVmFUZjAzcFFweURiOXNKeGhLN3FMbUgrSjlUeU5JamhTTUZQT2pKNEJFRTlPOUFvSUJBUURWcVhDMGdCVzUKYlNUWmUzc3l1QVltRi9hVDg1ZFh1NGFTMFBJR09MakE1M2h0RVdLUkJxd1JlU1prSFdtR05uNUIyOXVXTHg2UgpPZjFNOFJkY2NYSnlxMnp1TlBiWkpabllwS0x5N0FjeDBpc1RvMjdpUy9xRS85YndsNUo3QVU5UmZ2K2ZMK2RPCmxqUndRTGUvQ1dSVHVlTlNOSWpPUC96NWRra2J5Z1kvWHZHbmI0RUJheDY4K3J2a0NYbStGdFpXV3VoblM2Uy8KZHh3Ulo2VGRMd09RZTZQSzNzN3F4c2xWNmQ2dmwrSUpwa1VVZmRvWDNyWFlTeGx2cFlQYWJpWEpaVzdQWkZwRQpVQXc0VTFpSzVLMUt5d1ZjaHlhN2tQSlpRNUplS1pUT3lPL1d5ODZLak0vcUd3NUhDR2NOL2VMbDJKUUViUkwvClJiR0pGSmhUalpjN0FvSUJBQXlyNG0zYzcyRXBUSjloMG9PcFA5TksxR1RuMkFNWmFmaWdMSGd0K0Y2YURDb2kKZ0F2cU9YZ2ZabnVONnkrbDBjMGpoQUpXcWx0SkpaWW5oRlFSbmNYbE9oM1kyT09HbDNjOXhZWTVISHVTVnVmWgpsWUlKZms1NERLYnlEQmZJL3ZmWnJsV0M4TEV5WUVoZGVhak83ZjZxcGdCeC9qdHhqRUgrVkNtMndKZDRoSWpqClRwVHlUa3ZWclhRUW94UVNORlRzdnRGQVpRR0x2S3U1Wi84b092RDBhYmxuRzVDUThNUUNXd1VlK2tyeGJzTGcKU1BPWjNmakg1UUNCenppTHBUNnJwZU94VVFFa3NTS0U4T2V6NzhwdnZLSmF0VzIwTjJRVUxQQ2xMcmlpSUZxWApNVkpFeTgrTkFGdnhlTzR6eCt1ZEY1Y0Nyc05pekdTczR2ZmVHQWtDZ2dFQkFMdFRnbWdPd0gxQlR3U0t1Ym4vCkZBMEVCNEV5R2FlbTExY1RjSTY1M21ucXgyL0F4VTFucnlibXRCMGttR2Mra2JYR1FDRE5rUnc4M25NK
0VZQlEKU3NwMHQ5MmxmQ05vVHhsZFJ5eDZlZGhaYnNFYUVsYS96SllkQk9NTjBUU2RNbUMrV3ZuRGN5WTRsU014NnFmSQpZVGp6Q25ZQmIweDlWNXVUOUljenVnU0hocEdKTm03Njd3azdQODZ2N0JnWVI3V1FvS0FuOXZxVFFIMldCRHFVClJLakJiaHFvL0h0azdCS3lLRGFGa0gxclZMZWhtN3cvMitrVjl1Z25FcEpJN2tKRDkwSkh0c2liOGdyVU1CWWUKWmp6a0FRQmQwanl5MlhnZndVMWpZWDluTnJoNUdjM3BwVVNZa2d6L05mTlRmRUtPZnovZUxjQzM1dTdMcXIzZQpydzhDZ2dFQkFMT2tsTkJNRVBmM20yTXBjaVRjRmNKb08vZzBMUUpHaTJtWkN6S1g3eDJFS0N2N1ZvVWVtRkk0CjRkRFVmSlBJWlBFTUpkTHRSUy9qUDEyZWkxek9lWHIrVGlUTklpUUVoemRtL0RZWUdjd2hyb0xLNDZVTFJKY0YKYzdxZ2xNQ1Z1MW9DTmtDdTJvZ08renczRm9makJzK1pqcE1BS2kyOTZ2ZDk2YVlYNThYR0RKekdmdjhuZEF1dwpEUmU1ZE5oQU5iaHZqSlM1VXJwNnhoMVMycTNYOHorTlFGWW9CNDM1Q2NXNW50WWMzemIxYVdzY0NxMWJsUGJGCjc0QTFLTHJNNlpvU0ZlcUVWZzhvajhpWjlDaitiTTJXYm9BREIvRTROM0kyNmFDK1dDRWxtdTd3ZDdQaExQT2IKN3RrTXh2Zm10dDE5T2dYbTRKZm9SZWlkMTNYbHFoZz0KLS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQo= diff --git a/controllers/utils/cert.go b/controllers/utils/cert.go new file mode 100644 index 00000000..ee799ee2 --- /dev/null +++ b/controllers/utils/cert.go @@ -0,0 +1,171 @@ +package utils + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "time" + + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + k8serr "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func CreateSelfSignedCertificate(ctx context.Context, c client.Client, secretName, domain, namespace string) error { + certSecret, err := GenerateSelfSignedCertificateAsSecret(secretName, domain, namespace) + if err != nil { + return fmt.Errorf("failed generating self-signed certificate: %w", err) + } + + if errGen := generateCertSecret(ctx, c, certSecret); errGen != nil { + return fmt.Errorf("failed update self-signed certificate secret: %w", errGen) + } + + return nil +} + +func GenerateSelfSignedCertificateAsSecret(name, addr, namespace string) (*corev1.Secret, error) { + 
cert, key, err := generateCertificate(addr) + if err != nil { + return nil, errors.WithStack(err) + } + + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + "opendatahub.io/managed": "true", + "app.kubernetes.io/name": "odh-model-controller", + "app.kubernetes.io/component": "kserve", + "app.kubernetes.io/part-of": "odh-model-serving", + "app.kubernetes.io/managed-by": "odh-model-controller", + }, + }, + Data: map[string][]byte{ + corev1.TLSCertKey: cert, + corev1.TLSPrivateKeyKey: key, + }, + Type: corev1.SecretTypeTLS, + }, nil +} + +func generateCertificate(addr string) ([]byte, []byte, error) { + key, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + return nil, nil, errors.WithStack(err) + } + + seededRand, cryptErr := rand.Int(rand.Reader, big.NewInt(time.Now().UnixNano())) + if cryptErr != nil { + return nil, nil, errors.WithStack(cryptErr) + } + + now := time.Now() + tmpl := x509.Certificate{ + SerialNumber: seededRand, + Subject: pkix.Name{ + CommonName: addr, + Organization: []string{"serving-self-signed"}, + }, + NotBefore: now.UTC(), + NotAfter: now.Add(time.Second * 60 * 60 * 24 * 365 * 10).UTC(), + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IsCA: true, + } + + certDERBytes, err := x509.CreateCertificate(rand.Reader, &tmpl, &tmpl, key.Public(), key) + if err != nil { + return nil, nil, errors.WithStack(err) + } + certificate, err := x509.ParseCertificate(certDERBytes) + if err != nil { + return nil, nil, errors.WithStack(err) + } + + certBuffer := bytes.Buffer{} + if err := pem.Encode(&certBuffer, &pem.Block{ + Type: "CERTIFICATE", + Bytes: certificate.Raw, + }); err != nil { + return nil, nil, errors.WithStack(err) + } + + keyBuffer := bytes.Buffer{} + if err := pem.Encode(&keyBuffer, &pem.Block{ + Type: "RSA PRIVATE 
KEY", + Bytes: x509.MarshalPKCS1PrivateKey(key), + }); err != nil { + return nil, nil, errors.WithStack(err) + } + + return certBuffer.Bytes(), keyBuffer.Bytes(), nil +} + +// recreateSecret deletes the existing secret and creates a new one. +func recreateSecret(ctx context.Context, c client.Client, existingSecret, newSecret *corev1.Secret) error { + if err := c.Delete(ctx, existingSecret); err != nil { + return fmt.Errorf("failed to delete existing secret before recreating new one: %w", err) + } + if err := c.Create(ctx, newSecret); err != nil { + return fmt.Errorf("failed to create new secret after existing one has been deleted: %w", err) + } + return nil +} + +// generateCertSecret creates the secret if it does not exist, recreates it if the existing secret has a different type, and updates its data if the content is outdated. +func generateCertSecret(ctx context.Context, c client.Client, certSecret *corev1.Secret) error { + existingSecret := &corev1.Secret{} + errGet := c.Get(ctx, client.ObjectKeyFromObject(certSecret), existingSecret) + switch { + case errGet == nil: + // Secret exists but with a different type, delete and create it again + if existingSecret.Type != certSecret.Type { + return recreateSecret(ctx, c, existingSecret, certSecret) + } + // update data if found with same type but outdated content + if isSecretOutdated(existingSecret.Data, certSecret.Data) { + if errUpdate := c.Update(ctx, certSecret); errUpdate != nil { + return fmt.Errorf("failed to update existing secret: %w", errUpdate) + } + } + case k8serr.IsNotFound(errGet): + // Secret does not exist, create it + if errCreate := c.Create(ctx, certSecret); errCreate != nil { + return fmt.Errorf("failed creating new certificate secret: %w", errCreate) + } + default: + return fmt.Errorf("failed getting certificate secret: %w", errGet) + } + + return nil +} + +// isSecretOutdated compares two secret data maps (map[string][]byte) and returns true if they are not equal.
+func isSecretOutdated(existingSecretData, newSecretData map[string][]byte) bool { + if len(existingSecretData) != len(newSecretData) { + return true + } + + for key, value1 := range existingSecretData { + value2, ok := newSecretData[key] + if !ok { + return true + } + if !bytes.Equal(value1, value2) { + return true + } + } + + return false +} From 64fe75bdeaa6b4bf9f7ac27f9be313e6ef8873fb Mon Sep 17 00:00:00 2001 From: jooho lee Date: Thu, 5 Dec 2024 09:55:30 -0500 Subject: [PATCH 8/8] update label for cert Signed-off-by: jooho lee --- controllers/utils/cert.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/controllers/utils/cert.go b/controllers/utils/cert.go index ee799ee2..c4e34c81 100644 --- a/controllers/utils/cert.go +++ b/controllers/utils/cert.go @@ -44,9 +44,9 @@ func GenerateSelfSignedCertificateAsSecret(name, addr, namespace string) (*corev Namespace: namespace, Labels: map[string]string{ "opendatahub.io/managed": "true", - "app.kubernetes.io/name": "odh-model-controller", - "app.kubernetes.io/component": "kserve", - "app.kubernetes.io/part-of": "odh-model-serving", + "app.kubernetes.io/name": "self-signed-cert", + "app.kubernetes.io/component": "odh-model-serving", + "app.kubernetes.io/part-of": "kserve", "app.kubernetes.io/managed-by": "odh-model-controller", }, },