Skip to content

Commit

Permalink
Merge branch 'trustyai-explainability:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
ruivieira authored Oct 10, 2024
2 parents 07f7237 + d11408c commit 50e5df9
Show file tree
Hide file tree
Showing 18 changed files with 430 additions and 39 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/build-and-push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,12 @@ jobs:
📦 [PR image](https://quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/trustyai-service-operator-ci:${{ github.event.pull_request.head.sha }}`
🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }})
```
devFlags:
manifests:
- contextDir: config
sourcePath: ''
uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }}
```
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |

#### Database Status

| Status Type | Status Reason | Description |
|---------------|-------------------------|---------------------------------------------------|
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |


#### Status Behavior

- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
- In database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`.
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, if it is `Ready` by all other conditions, it will remain so.

## Contributing
Expand Down
2 changes: 1 addition & 1 deletion config/base/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=disabled
kServeServerless=disabled
6 changes: 3 additions & 3 deletions config/overlays/odh/params.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=enabled
kServeServerless=enabled
20 changes: 20 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ rules:
- list
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
Expand Down Expand Up @@ -134,6 +142,18 @@ rules:
- create
- list
- watch
- apiGroups:
- networking.istio.io
resources:
- destinationrules
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
Expand Down
15 changes: 15 additions & 0 deletions controllers/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const (
StatusTypePVCAvailable = "PVCAvailable"
StatusTypeRouteAvailable = "RouteAvailable"
StatusTypeAvailable = "Available"
StatusTypeDBAvailable = "DBAvailable"
)

// Status reasons
Expand All @@ -58,6 +59,10 @@ const (
StatusReasonRouteFound = "RouteFound"
StatusAvailable = "AllComponentsReady"
StatusNotAvailable = "NotAllComponentsReady"
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
StatusDBCredentialsError = "DBCredentialsError"
StatusDBConnectionError = "DBConnectionError"
StatusDBAvailable = "DBAvailable"
)

// Event reasons
Expand All @@ -67,4 +72,14 @@ const (
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
)

// StateReasonCrashLoopBackOff is the container waiting-state reason that is
// compared against ContainerStatus.State.Waiting.Reason to detect a container
// stuck in a crash loop.
const StateReasonCrashLoopBackOff = "CrashLoopBackOff"

// Phases of a TrustyAIService.
const (
	// PhaseReady is the phase string for a service that is ready.
	PhaseReady = "Ready"
	// PhaseNotReady is the phase string for a service that is not ready.
	// Note the embedded space: the value is "Not Ready", not "NotReady".
	PhaseNotReady = "Not Ready"
)

// migrationAnnotationKey is the annotation key "trustyai.opendatahub.io/db-migration".
// NOTE(review): presumably used to mark a TrustyAIService involved in a
// database migration; its readers/writers are not visible here, so confirm
// against the call sites.
const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"
63 changes: 63 additions & 0 deletions controllers/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package controllers

import (
"context"
"strings"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkDatabaseAccessible reports whether a "trustyai-service" container is
// running in a pod selected by the instance's Deployment.
//
// The Deployment is looked up by the instance's name and namespace. The result
// is (true, nil) only when the Deployment has an Available=True condition and
// a matching pod has the "trustyai-service" container in the Running state.
// It is (false, nil) when the Deployment does not exist, is not Available, or
// the first decisive container status shows either a previous termination
// with reason "Error" whose message contains the database socket failure
// text, or a CrashLoopBackOff waiting state. Any other error from getting the
// Deployment or listing pods is returned unchanged.
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
	key := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}
	deployment := &appsv1.Deployment{}
	if err := r.Get(ctx, key, deployment); err != nil {
		if errors.IsNotFound(err) {
			return false, nil
		}
		return false, err
	}

	for _, condition := range deployment.Status.Conditions {
		// Only an Available=True condition triggers the pod inspection.
		if condition.Type != appsv1.DeploymentAvailable || condition.Status != corev1.ConditionTrue {
			continue
		}

		pods := &corev1.PodList{}
		err := r.List(ctx, pods,
			client.InNamespace(instance.Namespace),
			client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
		)
		if err != nil {
			return false, err
		}

		for _, pod := range pods.Items {
			for _, status := range pod.Status.ContainerStatuses {
				if status.Name != "trustyai-service" {
					continue
				}

				previous := status.LastTerminationState.Terminated
				waiting := status.State.Waiting

				switch {
				case status.State.Running != nil:
					return true, nil
				case previous != nil && previous.Reason == "Error" &&
					strings.Contains(previous.Message, "Socket fail to connect to host:address"):
					// The last run died because it could not reach the database.
					return false, nil
				case waiting != nil && waiting.Reason == StateReasonCrashLoopBackOff:
					return false, nil
				}
			}
		}
	}

	return false, nil
}
27 changes: 25 additions & 2 deletions controllers/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package controllers

import (
"context"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
"reflect"
"strconv"

templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand Down Expand Up @@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
Expand Down Expand Up @@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
return nil
}

// checkDeploymentReady verifies that a TrustyAI service deployment is ready
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}

Expand All @@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return false, nil
}
}
}

return true, nil
}
}
Expand Down
89 changes: 89 additions & 0 deletions controllers/destination_rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package controllers

import (
"context"
"fmt"
"reflect"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
)

// destinationRuleTemplatePath is the template path handed to the template
// parser when rendering a DestinationRule.
const destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml"

// destinationRuleCDRName is the resource name of the Istio DestinationRule
// CustomResourceDefinition that is looked up to decide whether
// DestinationRules can be created in the cluster.
const destinationRuleCDRName = "destinationrules.networking.istio.io"

// DestinationRuleConfig carries the values substituted into the
// DestinationRule template.
type DestinationRuleConfig struct {
	// Name is the name of the owning TrustyAIService instance.
	Name string
	// Namespace is the namespace of the owning TrustyAIService instance.
	Namespace string
	// DestinationRuleName is the name given to the DestinationRule itself.
	DestinationRuleName string
}

// isDestinationRuleCRDPresent reports whether the DestinationRule CRD
// (destinationrules.networking.istio.io) can be fetched from the cluster.
//
// It returns (true, nil) when the CRD is found and (false, nil) when the API
// server reports it as not found. Any other lookup failure is returned as
// (false, err), with the underlying error wrapped so callers can still
// inspect it with errors.Is / errors.As.
func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) {
	crd := &apiextensionsv1.CustomResourceDefinition{}

	// CRDs are cluster-scoped, so the lookup key carries no namespace.
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd)
	switch {
	case err == nil:
		return true, nil
	case errors.IsNotFound(err):
		return false, nil
	default:
		return false, fmt.Errorf("error getting %s CRD: %w", destinationRuleCDRName, err)
	}
}

// ensureDestinationRule makes sure a DestinationRule named
// "<instance name>-internal" exists in the instance's namespace.
//
// If the DestinationRule already exists it is left untouched. Otherwise it is
// rendered from the DestinationRule template, given the instance as its
// controller owner, and created. A Create that fails with AlreadyExists
// (for example because the object appeared between the existence check and
// the Create call) is treated as success, since the desired state holds.
//
// Errors from the existence check and from Create are wrapped with %w so the
// underlying API error remains inspectable by callers; template parsing and
// owner-reference errors are returned unchanged.
func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error {
	destinationRuleName := instance.Name + "-internal"

	// DestinationRule is handled as unstructured data, so kind and API
	// version must be set explicitly before the lookup.
	existingDestinationRule := &unstructured.Unstructured{}
	existingDestinationRule.SetKind("DestinationRule")
	existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1")

	// Check if the DestinationRule already exists
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule)
	if err == nil {
		// DestinationRule exists
		return nil
	}
	if !errors.IsNotFound(err) {
		return fmt.Errorf("failed to check for existing DestinationRule: %w", err)
	}

	destinationRuleConfig := DestinationRuleConfig{
		Name:                instance.Name,
		Namespace:           instance.Namespace,
		DestinationRuleName: destinationRuleName,
	}

	destinationRule, err := templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{}))
	if err != nil {
		log.FromContext(ctx).Error(err, "could not parse the DestinationRule template")
		return err
	}

	if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil {
		return err
	}

	if err := r.Create(ctx, destinationRule); err != nil {
		if errors.IsAlreadyExists(err) {
			// Lost a race with another creator; the rule is in place.
			return nil
		}
		return fmt.Errorf("failed to create DestinationRule: %w", err)
	}

	return nil
}
28 changes: 27 additions & 1 deletion controllers/inference_services.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ package controllers
import (
"context"
"fmt"
"strings"

kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1"
trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"strings"
)

const (
Expand Down Expand Up @@ -271,6 +272,31 @@ func (r *TrustyAIServiceReconciler) patchKServe(ctx context.Context, instance *t
infService.Spec.Predictor.Logger = &logger
}

// Only if the Istio sidecar annotation is set
annotations := infService.GetAnnotations()
if inject, exists := annotations["sidecar.istio.io/inject"]; exists && inject == "true" {

// Check if DestinationRule CRD is present. If there's an error, don't proceed and return the error
exists, err := r.isDestinationRuleCRDPresent(ctx)
if err != nil {
log.FromContext(ctx).Error(err, "Error verifying DestinationRule CRD is present")
return err
}

// Try to create the DestinationRule, since CRD exists
if exists {
err := r.ensureDestinationRule(ctx, instance)
if err != nil {
return fmt.Errorf("failed to ensure DestinationRule: %v", err)
}
} else {
// DestinationRule CRD does not exist. Do not attempt to create it and log error
err := fmt.Errorf("the DestinationRule CRD is not present in this cluster")
log.FromContext(ctx).Error(err, "InferenceService has service mesh annotation but DestinationRule CRD not found")
}

}

// Update the InferenceService
err := r.Update(ctx, &infService)
if err == nil {
Expand Down
Loading

0 comments on commit 50e5df9

Please sign in to comment.