Add FineTuning example infra #122
base: main
Changes from all commits
d864d93
5c21352
bca0e95
17e5bda
0586c80
c1bdf0f
b3e58aa
8ceb3e5
@@ -0,0 +1,34 @@
# Deploy the KubeRay operator with the Helm chart

```bash
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update

# Install both CRDs and KubeRay operator v1.1.1.
helm install kuberay-operator kuberay/kuberay-operator --version 1.1.1

# Confirm that the operator is running in the namespace `default`.
kubectl get pods
# NAME                                READY   STATUS    RESTARTS   AGE
# kuberay-operator-7fbdbf8c89-pt8bk   1/1     Running   0          27s
```

# Deploy On Xeon

```bash
cd GenAIInfra/manifests/FineTuning/xeon
kubectl apply -f finetuning-ray-cluster-autoscaler.yaml
kubectl apply -f finetuning.yaml
```

# Deploy On Gaudi

TBD

# Verify LLM Fine-tuning Service

Make sure all the pods are running.

```bash
kubectl get pods
```
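Beyond `kubectl get pods`, a quick sanity check is to confirm that the RayCluster was created and that the fine-tuning Service answers. The sketch below assumes the `finetuning-ray-cluster-autoscaler.yaml` and `finetuning.yaml` manifests from this PR were applied; the `ray.io/cluster` pod label follows KubeRay's convention, and the probed URL path is only a placeholder, since the REST endpoint exposed by the `opea-finetune` image is not shown in this PR.

```bash
# The RayCluster custom resource and its pods (label per KubeRay convention).
kubectl get rayclusters
kubectl get pods -l ray.io/cluster=raycluster-autoscaler

# Forward the fine-tuning Service locally (port 8000 as defined in finetuning.yaml).
kubectl port-forward svc/opea-finetune-entrypoint 8000:8000 &

# Probe the service; replace "/" with the endpoint the opea-finetune image actually exposes.
curl -v http://localhost:8000/
```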
@@ -0,0 +1,140 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-autoscaler
spec:
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.23.0'
  # If `enableInTreeAutoscaling` is true, the Autoscaler sidecar will be added to the Ray head pod.
  # Ray Autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
  enableInTreeAutoscaling: true
  # `autoscalerOptions` is an OPTIONAL field specifying configuration overrides for the Ray Autoscaler.
  # The example configuration shown below represents the DEFAULT values.
  # (You may delete autoscalerOptions if the defaults are suitable.)
  autoscalerOptions:
    # `upscalingMode` is "Default" or "Aggressive."
    # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
    # Default: Upscaling is not rate-limited.
    # Aggressive: An alias for Default; upscaling is not rate-limited.
    upscalingMode: Default
    # `idleTimeoutSeconds` is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
    idleTimeoutSeconds: 60
    # `image` optionally overrides the Autoscaler's container image. The Autoscaler uses the same image as the Ray container by default.
    ## image: "my-repo/my-custom-autoscaler-image:tag"
    # `imagePullPolicy` optionally overrides the Autoscaler container's default image pull policy (IfNotPresent).
    image: opea-finetune:latest
    imagePullPolicy: IfNotPresent
    # Optionally specify the Autoscaler container's securityContext.
    securityContext: {}
    env: []
    envFrom: []
    # resources specifies optional resource request and limit overrides for the Autoscaler container.
    # The default Autoscaler resource limits and requests should be sufficient for production use-cases.
    # However, for large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
    resources:
      limits:
        cpu: "500m"
        memory: "512Mi"
      requests:
        cpu: "500m"
        memory: "512Mi"
  # Ray head pod template
  headGroupSpec:
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
    # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
    rayStartParams:
      # Setting "num-cpus: 0" to avoid any Ray actors or tasks being scheduled on the Ray head Pod.
      num-cpus: "0"
      # Use `resources` to optionally specify custom resource annotations for the Ray node.
      # The value of `resources` is a string-integer mapping.
      # Currently, `resources` must be provided in the specific format demonstrated below:
      # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
    # Pod template
    template:
      spec:
        containers:
          # The Ray head container
          - name: ray-head
            image: opea-finetune:latest
            imagePullPolicy: IfNotPresent
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            resources:
              limits:
                cpu: "1"
                memory: "2G"
              requests:
                cpu: "1"
                memory: "2G"
            volumeMounts:
              - name: k8s-data
                mountPath: /root/k8s-data
              - name: hf-cache
                mountPath: /root/.cache/huggingface/
        volumes:
          - name: k8s-data
            hostPath:
              path: /root/k8s-data
          - name: hf-cache
            hostPath:
              path: /root/.cache/huggingface/

  workerGroupSpecs:
    # The Pod replicas in this group are typed as workers.
    - replicas: 0
      minReplicas: 0
      maxReplicas: 10
      # Logical group name; here it is called small-group, but it can also be functional.
      groupName: small-group
      # If worker pods need to be added, Ray Autoscaler can increment the `replicas`.
      # If worker pods need to be removed, Ray Autoscaler decrements the replicas, and populates the `workersToDelete` list.
      # KubeRay operator will remove Pods from the list until the desired number of replicas is satisfied.
      # scaleStrategy:
      #   workersToDelete:
      #     - raycluster-complete-worker-small-group-bdtwh
      #     - raycluster-complete-worker-small-group-hv457
      #     - raycluster-complete-worker-small-group-k8tj7
      rayStartParams: {}
      # Pod template
      template:
        spec:
          containers:
            - name: ray-worker
              image: opea-finetune:latest
              imagePullPolicy: IfNotPresent
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              resources:
                limits:
                  cpu: "35"
                  memory: "32G"
                requests:
                  cpu: "35"
                  memory: "32G"
              volumeMounts:
                - name: k8s-data
                  mountPath: /root/k8s-data
                - name: hf-cache
                  mountPath: /root/.cache/huggingface/
          volumes:
            - name: k8s-data
              hostPath:
                path: /root/k8s-data
            - name: hf-cache
              hostPath:
                path: /root/.cache/huggingface/
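With `enableInTreeAutoscaling: true` and a worker group that scales between 0 and 10 replicas, one way to see this manifest in action is to tail the autoscaler sidecar on the head pod and then request CPUs from inside the cluster so a worker (which asks for 35 CPUs) gets scheduled. This is a sketch under the assumption that the KubeRay-generated pod labels (`ray.io/cluster`, `ray.io/node-type`) and the `autoscaler` sidecar container name follow the upstream KubeRay defaults.

```bash
# Locate the head pod created for raycluster-autoscaler (labels per KubeRay convention).
HEAD_POD=$(kubectl get pods -l ray.io/cluster=raycluster-autoscaler,ray.io/node-type=head -o name)

# Follow the autoscaler sidecar logs to watch scaling decisions.
kubectl logs -f "$HEAD_POD" -c autoscaler

# From inside the head pod, ask the autoscaler for 35 CPUs; this should bring up
# one worker pod in small-group (scaled back down after idleTimeoutSeconds).
kubectl exec -it "$HEAD_POD" -- python -c \
  'import ray; from ray.autoscaler.sdk import request_resources; ray.init(address="auto"); request_resources(num_cpus=35)'
```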
@@ -0,0 +1,36 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: apps/v1
kind: Deployment
metadata:
  name: opea-finetune
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      service: opea-finetune
  template:
    metadata:
      labels:
        service: opea-finetune
    spec:
      containers:
        - name: opea-finetune
          image: opea-finetune:latest
          imagePullPolicy: IfNotPresent
---
apiVersion: v1
kind: Service
metadata:
  name: opea-finetune-entrypoint
  namespace: default
spec:
  type: NodePort
  selector:
    service: opea-finetune
  ports:
    - port: 8000
      targetPort: 8000
      nodePort: 30001

Review thread on `type: NodePort`:

Reviewer: If NodePort is not a must, please remove it. NodePort will cause conflicts on the CI test machine.

Author: Thanks, I will remove that. Do you have a port assignment rule for each new microservice?

Reviewer: Not many rules. It's better to use the same port as the mega service, but with ClusterIP, to avoid a hard-coded port number. For your reference: https://github.com/opea-project/GenAIInfra/blob/main/manifests/DocSum/xeon/tgi_service.yaml#L17
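Following the review suggestion, a ClusterIP variant of this Service would drop the `nodePort` field and change the type; everything else stays the same. This is a sketch of the proposed change, not what this commit currently contains:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: opea-finetune-entrypoint
  namespace: default
spec:
  type: ClusterIP        # no cluster-wide node port is reserved
  selector:
    service: opea-finetune
  ports:
    - port: 8000
      targetPort: 8000
```

Because ClusterIP does not claim a node port, parallel CI deployments of different examples cannot collide on a hard-coded port such as 30001.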
Review thread on the image name:

Reviewer: Is the image name correct? Do you mean opea/finetune?

Author: Thanks for the comment. It's a draft PR for now; I will revise the image name.
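If the image is renamed as the reviewer proposes, the container spec in both manifests would read as below; `opea/finetune:latest` is the reviewer's suggested name and is still pending the author's revision:

```yaml
containers:
  - name: opea-finetune
    image: opea/finetune:latest   # proposed name, not yet in this PR
    imagePullPolicy: IfNotPresent
```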