From df96ffd84fbd9c2b767ade44b944be8b3ded036f Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Fri, 6 Sep 2024 12:07:56 +0100 Subject: [PATCH 1/5] Add devflag printout to GH Action comment (#289) --- .github/workflows/build-and-push.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build-and-push.yaml b/.github/workflows/build-and-push.yaml index 835fa146..85158a22 100644 --- a/.github/workflows/build-and-push.yaml +++ b/.github/workflows/build-and-push.yaml @@ -126,4 +126,12 @@ jobs: 📦 [PR image](https://quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}` 🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }}) + + ``` + devFlags: + manifests: + - contextDir: config + sourcePath: '' + uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }} + ``` From f366b6eabd56b5d635f3aaa157c1c4ca4d0bd4a8 Mon Sep 17 00:00:00 2001 From: Rob Geada Date: Mon, 30 Sep 2024 12:31:15 +0100 Subject: [PATCH 2/5] Add timeout loop to DSC install (#305) --- tests/scripts/install.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/scripts/install.sh b/tests/scripts/install.sh index cc47a549..06eb6ccf 100755 --- a/tests/scripts/install.sh +++ b/tests/scripts/install.sh @@ -128,13 +128,23 @@ else echo "Creating the following DSC" cat ./${DSC_FILENAME} > ${ARTIFACT_DIR}/${DSC_FILENAME} - oc apply -f ./odh-core-dsci.yaml - oc apply -f ./${DSC_FILENAME} - kfctl_result=$? - if [ "$kfctl_result" -ne 0 ]; then + start_t=$(date +%s) 2>&1 + ready=1 2>&1 + while [ "$ready" -ne 0 ]; do + oc apply -f ./odh-core-dsci.yaml + oc apply -f ./${DSC_FILENAME} + ready=$? + if [ $(($(date +%s)-start_t)) -gt 300 ]; then + echo "ODH DSC Installation timeout" + exit 1 + fi + sleep 10 + done + + if [ "$ready" -ne 0 ]; then echo "The installation failed" - exit $kfctl_result + exit $ready fi fi set +x From c072d519197b8a950f8897ed234384a1fc2227aa Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 30 Sep 2024 20:05:14 +0100 Subject: [PATCH 3/5] RHOAIENG-13625: Add DBAvailable status to CR (#304) * Add DBAvailable status to CR * Remove probes --- README.md | 10 ++++ config/base/params.env | 2 +- config/overlays/odh/params.env | 2 +- controllers/constants.go | 15 +++++ controllers/database.go | 63 ++++++++++++++++++++ controllers/deployment.go | 27 ++++++++- controllers/statuses.go | 59 ++++++++++++++----- controllers/statuses_test.go | 11 ++-- controllers/suite_test.go | 70 +++++++++++++++++++++-- controllers/trustyaiservice_controller.go | 19 ++++++ 10 files changed, 249 insertions(+), 29 deletions(-) create mode 100644 controllers/database.go diff --git a/README.md b/README.md index aa49524e..14f1a38b 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai | `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. | | `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. | +#### Database Status + +| Status Type | Status Reason | Description | +|---------------|-------------------------|---------------------------------------------------| +| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found | +| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) | +| `DBAvailable` | `DBConnectionError` | Service error connecting to the database | +| `DBAvailable` | `DBAvailable` | Successfully connected to the database | + #### Status Behavior - If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`. +- If on database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready` - However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, it is `Ready` by all other conditions, it will remain so. ## Contributing diff --git a/config/base/params.env b/config/base/params.env index a0f5419b..d745ad2a 100644 --- a/config/base/params.env +++ b/config/base/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=disabled \ No newline at end of file +kServeServerless=disabled diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index 908f07c9..c67b2b5c 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,4 +1,4 @@ trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 -kServeServerless=enabled \ No newline at end of file +kServeServerless=enabled diff --git a/controllers/constants.go b/controllers/constants.go index 2c7081b0..18fdb4c5 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -46,6 +46,7 @@ const ( StatusTypePVCAvailable = "PVCAvailable" StatusTypeRouteAvailable = "RouteAvailable" StatusTypeAvailable = "Available" + StatusTypeDBAvailable = "DBAvailable" ) // Status reasons @@ -58,6 +59,10 @@ const ( StatusReasonRouteFound = "RouteFound" StatusAvailable = "AllComponentsReady" StatusNotAvailable = "NotAllComponentsReady" + StatusDBCredentialsNotFound = "DBCredentialsNotFound" + StatusDBCredentialsError = "DBCredentialsError" + StatusDBConnectionError = "DBConnectionError" + StatusDBAvailable = "DBAvailable" ) // Event reasons @@ -67,4 +72,14 @@ const ( EventReasonServiceMonitorCreated = "ServiceMonitorCreated" ) +const ( + StateReasonCrashLoopBackOff = "CrashLoopBackOff" +) + +// Phases +const ( + PhaseReady = "Ready" + PhaseNotReady = "Not Ready" +) + const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration" diff --git a/controllers/database.go b/controllers/database.go new file mode 100644 index 00000000..96679d01 --- /dev/null +++ b/controllers/database.go @@ -0,0 +1,63 @@ +package controllers + +import ( + "context" + "strings" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues. +func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { + deployment := &appsv1.Deployment{} + err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + + for _, cond := range deployment.Status.Conditions { + if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == "trustyai-service" { + if cs.State.Running != nil { + return true, nil + } + + if cs.LastTerminationState.Terminated != nil { + termination := cs.LastTerminationState.Terminated + if termination.Reason == "Error" && termination.Message != "" { + if strings.Contains(termination.Message, "Socket fail to connect to host:address") { + return false, nil + } + } + } + + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + } + } + } + } + } + + return false, nil +} diff --git a/controllers/deployment.go b/controllers/deployment.go index 81be18a5..3b8d8f5e 100644 --- a/controllers/deployment.go +++ b/controllers/deployment.go @@ -2,16 +2,18 @@ package controllers import ( "context" - templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" "reflect" "strconv" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context, _, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace) if err != nil { deploymentConfig.UseDBTLSCerts = false - log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found") + log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found") } else { deploymentConfig.UseDBTLSCerts = true log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls") @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan return nil } +// checkDeploymentReady verifies that a TrustyAI service deployment is ready func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) { deployment := &appsv1.Deployment{} @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in for _, cond := range deployment.Status.Conditions { if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas { + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.Namespace), + client.MatchingLabels(deployment.Spec.Selector.MatchLabels), + } + if err := r.List(ctx, podList, listOpts...); err != nil { + return false, err + } + + for _, pod := range podList.Items { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff { + return false, nil + } + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + return false, nil + } + } + } + return true, nil } } diff --git a/controllers/statuses.go b/controllers/statuses.go index b88ba47c..9236e81b 100644 --- a/controllers/statuses.go +++ b/controllers/statuses.go @@ -13,7 +13,8 @@ import ( // IsAllReady checks if all the necessary readiness fields are true for the specific mode func (rs *AvailabilityStatus) IsAllReady(mode string) bool { - return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE) + return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || + (rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE) } // AvailabilityStatus has the readiness status of various resources. @@ -22,6 +23,7 @@ type AvailabilityStatus struct { DeploymentReady bool RouteReady bool InferenceServiceReady bool + DBReady bool } func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService), @@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { // Check for PVC readiness status.PVCReady, err = r.checkPVCReady(ctx, instance) - if err != nil || !status.PVCReady { - // PVC not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready") - } } // Check for deployment readiness status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance) - if err != nil || !status.DeploymentReady { - // Deployment not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready") + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance) } // Check for route readiness status.RouteReady, err = r.checkRouteReady(ctx, instance) - if err != nil || !status.RouteReady { - // Route not ready, requeue - return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready") - } // Check if InferenceServices present status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace) @@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() { UpdatePVCAvailable(saved) } + UpdateRouteAvailable(saved) + + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + UpdateDBAvailable(saved) + } + UpdateTrustyAIServiceAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseReady saved.Status.Ready = v1.ConditionTrue }) if updateErr != nil { @@ -114,13 +114,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta } } + if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() { + UpdateDBConnectionError(saved) + } + if status.RouteReady { UpdateRouteAvailable(saved) } else { UpdateRouteNotAvailable(saved) } + UpdateTrustyAIServiceNotAvailable(saved) - saved.Status.Phase = "Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse }) if updateErr != nil { @@ -143,7 +148,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse) - saved.Status.Phase = "Not Ready" + saved.Status.Phase = PhaseNotReady saved.Status.Ready = v1.ConditionFalse } @@ -165,4 +170,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse +} + +func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, "Database available", v1.ConditionTrue) } diff --git a/controllers/statuses_test.go b/controllers/statuses_test.go index 9e395df4..f6ad7fe3 100644 --- a/controllers/statuses_test.go +++ b/controllers/statuses_test.go @@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust // Call the reconcileStatuses function _, _ = reconciler.reconcileStatuses(ctx, instance) - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true") @@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() { Namespace: instance.Namespace, }, instance) }, "failed to get updated instance") - - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") @@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() { }, instance) }, "failed to get updated instance") - readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true) + readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true) Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition") if readyCondition != nil { Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true") diff --git a/controllers/suite_test.go b/controllers/suite_test.go index e61f4ac4..a2f16ba8 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -20,11 +20,12 @@ import ( "context" "encoding/json" "fmt" - rbacv1 "k8s.io/api/rbac/v1" "path/filepath" "testing" "time" + rbacv1 "k8s.io/api/rbac/v1" + "github.com/google/uuid" kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" @@ -358,14 +359,75 @@ func makeDeploymentReady(ctx context.Context, k8sClient client.Client, instance Reason: "DeploymentReady", Message: "The deployment is ready", }, + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionTrue, + Reason: "NewReplicaSetAvailable", + Message: "ReplicaSet is progressing", + }, } if deployment.Spec.Replicas != nil { - deployment.Status.ReadyReplicas = 1 - deployment.Status.Replicas = 1 + deployment.Status.ReadyReplicas = *deployment.Spec.Replicas + deployment.Status.Replicas = *deployment.Spec.Replicas + deployment.Status.AvailableReplicas = *deployment.Spec.Replicas } - return k8sClient.Update(ctx, deployment) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: instance.Name + "-pod", + Namespace: instance.Namespace, + Labels: deployment.Spec.Selector.MatchLabels, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "trustyai-service", + Image: "quay.io/trustyai/trustyai-service:latest", + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + { + Type: corev1.ContainersReady, + Status: corev1.ConditionTrue, + }, + }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "trustyai-service", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + Ready: true, + RestartCount: 0, + }, + }, + }, + } + + if err := k8sClient.Create(ctx, pod); err != nil { + return err + } + + if err := k8sClient.Status().Update(ctx, deployment); err != nil { + return err + } + + return nil } func makeRouteReady(ctx context.Context, k8sClient client.Client, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index d37fb267..edb60f21 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -25,6 +25,7 @@ import ( kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -156,10 +157,28 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Get database configuration secret, err := r.findDatabaseSecret(ctx, instance) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsNotFound(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Service configured to use database storage but no database configuration found.") } err = r.validateDatabaseSecret(secret) if err != nil { + _, updateErr := r.updateStatus(ctx, instance, func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) { + UpdateDBCredentialsError(saved) + UpdateTrustyAIServiceNotAvailable(saved) + saved.Status.Phase = PhaseNotReady + saved.Status.Ready = v1.ConditionFalse + }) + if updateErr != nil { + return RequeueWithErrorMessage(ctx, err, "Failed to update status") + } return RequeueWithErrorMessage(ctx, err, "Database configuration contains errors.") } } From 97a5a5d30d6bf98ed1f8505c1027beff1634b059 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 10 Oct 2024 14:04:29 +0100 Subject: [PATCH 4/5] Add KServe destination rule for Inference Services in the ServiceMesh (#315) * Add DestinationRule creation for KServe serverless * Add permissions for destination rules * Add role for destination rules * Add missing role for creating destination rules * Fix spacing in DestinationRule template --- config/overlays/odh/params.env | 4 +- config/rbac/role.yaml | 12 ++++ controllers/destination_rule.go | 69 +++++++++++++++++++ controllers/inference_services.go | 12 +++- .../service/destination-rule.tmpl.yaml | 13 ++++ controllers/trustyaiservice_controller.go | 1 + 6 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 controllers/destination_rule.go create mode 100644 controllers/templates/service/destination-rule.tmpl.yaml diff --git a/config/overlays/odh/params.env b/config/overlays/odh/params.env index c67b2b5c..4a9c77e0 100644 --- a/config/overlays/odh/params.env +++ b/config/overlays/odh/params.env @@ -1,4 +1,4 @@ -trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0 -trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0 +trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest +trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0 kServeServerless=enabled diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b6b21e4b..2528617b 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -134,6 +134,18 @@ rules: - create - list - watch +- apiGroups: + - networking.istio.io + resources: + - destinationrules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - rbac.authorization.k8s.io resources: diff --git a/controllers/destination_rule.go b/controllers/destination_rule.go new file mode 100644 index 00000000..f683373e --- /dev/null +++ b/controllers/destination_rule.go @@ -0,0 +1,69 @@ +package controllers + +import ( + "context" + "fmt" + "reflect" + + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml" +) + +// DestinationRuleConfig has the variables for the DestinationRule template +type DestinationRuleConfig struct { + Name string + Namespace string + DestinationRuleName string +} + +func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + destinationRuleName := instance.Name + "-internal" + + existingDestinationRule := &unstructured.Unstructured{} + existingDestinationRule.SetKind("DestinationRule") + existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1") + + // Check if the DestinationRule already exists + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule) + if err == nil { + // DestinationRule exists + return nil + } + + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to check for existing DestinationRule: %v", err) + } + + destinationRuleConfig := DestinationRuleConfig{ + Name: instance.Name, + Namespace: instance.Namespace, + DestinationRuleName: destinationRuleName, + } + + var destinationRule *unstructured.Unstructured + destinationRule, err = templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{})) + if err != nil { + log.FromContext(ctx).Error(err, "could not parse the DestinationRule template") + return err + } + + if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil { + return err + } + + err = r.Create(ctx, destinationRule) + if err != nil { + return fmt.Errorf("failed to create DestinationRule: %v", err) + } + + return nil +} diff --git a/controllers/inference_services.go b/controllers/inference_services.go index b49a5267..a4cad7be 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -3,13 +3,14 @@ package controllers import ( "context" "fmt" + "strings" + kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "strings" ) const ( @@ -271,6 +272,15 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t infService.Spec.Predictor.Logger = &logger } + // Only if the Istio sidecar annotation is set + annotations := infService.GetAnnotations() + if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" { + err := r.ensureDestinationRule(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure DestinationRule: %v", err) + } + } + // Update the InferenceService err := r.Update(ctx, &infService) if err == nil { diff --git a/controllers/templates/service/destination-rule.tmpl.yaml b/controllers/templates/service/destination-rule.tmpl.yaml new file mode 100644 index 00000000..f62e548e --- /dev/null +++ b/controllers/templates/service/destination-rule.tmpl.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.istio.io/v1beta1 +kind: DestinationRule +metadata: + name: {{ .DestinationRuleName }} + namespace: {{ .Namespace }} +spec: + host: {{ .Name }}.{{ .Namespace }}.svc.cluster.local + trafficPolicy: + portLevelSettings: + - port: + number: 443 + tls: + mode: SIMPLE diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index edb60f21..e9208532 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -69,6 +69,7 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update +//+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. From d11408cf5a8f717e4d63112c21f2a7addfea62a0 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 10 Oct 2024 17:43:43 +0100 Subject: [PATCH 5/5] Add check if DestinationRule CRD is present before creating it (#316) * Add check for DestinationRule CRD * Add API extensions to operator's scheme * Add permission for CRD resource --- config/rbac/role.yaml | 8 ++++++++ controllers/destination_rule.go | 20 ++++++++++++++++++++ controllers/inference_services.go | 20 ++++++++++++++++++-- controllers/trustyaiservice_controller.go | 1 + go.mod | 2 +- main.go | 5 ++++- 6 files changed, 52 insertions(+), 4 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 2528617b..91be6698 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -60,6 +60,14 @@ rules: - list - update - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch - apiGroups: - apps resources: diff --git a/controllers/destination_rule.go b/controllers/destination_rule.go index f683373e..ce17552f 100644 --- a/controllers/destination_rule.go +++ b/controllers/destination_rule.go @@ -7,6 +7,7 @@ import ( trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" @@ -16,6 +17,7 @@ import ( const ( destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml" + destinationRuleCDRName = "destinationrules.networking.istio.io" ) // DestinationRuleConfig has the variables for the DestinationRule template @@ -25,7 +27,25 @@ type DestinationRuleConfig struct { DestinationRuleName string } +// isDestinationRuleCRDPresent returns true if the DestinationRule CRD is present, false otherwise +func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) { + crd := &apiextensionsv1.CustomResourceDefinition{} + + err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd) + if err != nil { + if !errors.IsNotFound(err) { + return false, fmt.Errorf("error getting "+destinationRuleCDRName+" CRD: %v", err) + } + // Not found + return false, nil + } + + // Found + return true, nil +} + func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error { + destinationRuleName := instance.Name + "-internal" existingDestinationRule := &unstructured.Unstructured{} diff --git a/controllers/inference_services.go b/controllers/inference_services.go index a4cad7be..ee8745bf 100644 --- a/controllers/inference_services.go +++ b/controllers/inference_services.go @@ -275,10 +275,26 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t // Only if the Istio sidecar annotation is set annotations := infService.GetAnnotations() if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" { - err := r.ensureDestinationRule(ctx, instance) + + // Check if DestinationRule CRD is present. If there's an error, don't proceed and return the error + exists, err := r.isDestinationRuleCRDPresent(ctx) if err != nil { - return fmt.Errorf("failed to ensure DestinationRule: %v", err) + log.FromContext(ctx).Error(err, "Error verifying DestinationRule CRD is present") + return err + } + + // Try to create the DestinationRule, since CRD exists + if exists { + err := r.ensureDestinationRule(ctx, instance) + if err != nil { + return fmt.Errorf("failed to ensure DestinationRule: %v", err) + } + } else { + // DestinationRule CRD does not exist. Do not attempt to create it and log error + err := fmt.Errorf("the DestinationRule CRD is not present in this cluster") + log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but DestinationRule CRD not found") } + } // Update the InferenceService diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index e9208532..1176d986 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -70,6 +70,7 @@ type TrustyAIServiceReconciler struct { //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update //+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=create;list;watch;get;update;patch;delete +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=list;watch;get // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. diff --git a/go.mod b/go.mod index 6e15a91f..fcb6b76f 100644 --- a/go.mod +++ b/go.mod @@ -88,7 +88,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.26.4 // indirect + k8s.io/apiextensions-apiserver v0.26.4 k8s.io/component-base v0.26.4 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230515203736-54b630e78af5 // indirect diff --git a/main.go b/main.go index cc7fc1eb..ce5ee19e 100644 --- a/main.go +++ b/main.go @@ -18,11 +18,13 @@ package main import ( "flag" + "os" + kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1" routev1 "github.com/openshift/api/route/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "os" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -52,6 +54,7 @@ func init() { utilruntime.Must(kservev1alpha1.AddToScheme(scheme)) utilruntime.Must(kservev1beta1.AddToScheme(scheme)) utilruntime.Must(routev1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme }