From aad6dbec28d58e05dc5aa29caf4d116b7104155a Mon Sep 17 00:00:00 2001
From: Fabricio Aguiar
Date: Wed, 30 Oct 2024 16:03:59 +0000
Subject: [PATCH] Fix ansible job error reason

This change fixes a nil pointer dereference by checking whether the
number of job failures has reached the defined BackoffLimit instead
of dereferencing a JobFailed condition that may not exist. Since our
logic already determines whether the BackoffLimit has been exceeded,
there is no need to check the condition.Reason separately; it would
only tell us the same thing. We can simply infer from our own check
that the job failed because the BackoffLimit was reached.
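
For illustration, the new failure handling has roughly the shape
below. This is a simplified sketch, not the code in the diff: the
helper name jobFailureReason and the plain "Error" fallback reason
are invented for the example, standing in for the operator's
condition.ErrorReason.

// Sketch only: derive the failure reason of an AnsibleEE batch Job
// without dereferencing a JobCondition pointer that may be nil.
package example

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
)

// jobFailureReason is a hypothetical helper, not operator code.
func jobFailureReason(job *batchv1.Job) (string, string) {
	// Generic fallback; the operator uses condition.ErrorReason here.
	reason := "Error"
	msg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d",
		job.Name, job.Namespace, job.Status.Failed)

	// Consult the JobFailed condition only once the failure count has
	// reached the configured backoff limit, and read only its Message.
	if job.Spec.BackoffLimit != nil && job.Status.Failed >= *job.Spec.BackoffLimit {
		for _, c := range job.Status.Conditions {
			if c.Type == batchv1.JobFailed {
				reason = batchv1.JobReasonBackoffLimitExceeded
				msg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s",
					job.Name, job.Namespace, c.Message)
			}
		}
	}
	return reason, msg
}

Because the backoff-limit decision comes from the counters, the
Reason never depends on a pointer that might still be nil.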

closes OSPRH-11068
Signed-off-by: Fabricio Aguiar
Co-Authored-by: Brendan Shephard
---
 pkg/dataplane/deployment.go | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/pkg/dataplane/deployment.go b/pkg/dataplane/deployment.go
index 1483a9b9e..91c37247f 100644
--- a/pkg/dataplane/deployment.go
+++ b/pkg/dataplane/deployment.go
@@ -195,11 +195,10 @@ func (d *Deployer) ConditionalDeploy(
 	}
 
-	var ansibleCondition *batchv1.JobCondition
 	if nsConditions.IsFalse(readyCondition) {
-		var ansibleEE *batchv1.Job
+		var ansibleJob *batchv1.Job
 		_, labelSelector := dataplaneutil.GetAnsibleExecutionNameAndLabels(&foundService, d.Deployment.Name, d.NodeSet.Name)
-		ansibleEE, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
+		ansibleJob, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
 		if err != nil {
 			// Return nil if we don't have AnsibleEE available yet
 			if k8s_errors.IsNotFound(err) {
@@ -215,33 +214,36 @@
 				err.Error()))
 		}
 
-		if ansibleEE.Status.Succeeded > 0 {
+		if ansibleJob.Status.Succeeded > 0 {
 			log.Info(fmt.Sprintf("Condition %s ready", readyCondition))
 			nsConditions.Set(condition.TrueCondition(
 				readyCondition,
 				readyMessage))
-		} else if ansibleEE.Status.Active > 0 {
-			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleEE.Name, ansibleEE.Status.Active))
+		} else if ansibleJob.Status.Active > 0 {
+			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleJob.Name, ansibleJob.Status.Active))
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
 				condition.RequestedReason,
 				condition.SeverityInfo,
 				readyWaitingMessage))
-		} else if ansibleEE.Status.Failed > 0 {
-			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleEE.Name, ansibleEE.Namespace, ansibleEE.Status.Failed)
-			for _, condition := range ansibleEE.Status.Conditions {
-				if condition.Type == batchv1.JobFailed {
-					ansibleCondition = &condition
+		} else if ansibleJob.Status.Failed > 1 {
+			errorReason := condition.ErrorReason
+			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleJob.Name, ansibleJob.Namespace, ansibleJob.Status.Failed)
+
+			if ansibleJob.Status.Failed >= *ansibleJob.Spec.BackoffLimit {
+				for _, condition := range ansibleJob.Status.Conditions {
+					if condition.Type == batchv1.JobFailed {
+						errorReason = batchv1.JobReasonBackoffLimitExceeded
+						errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleJob.Name, ansibleJob.Namespace, condition.Message)
+					}
 				}
 			}
-			if ansibleCondition.Reason == condition.JobReasonBackoffLimitExceeded {
-				errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleEE.Name, ansibleEE.Namespace, ansibleCondition.Message)
-			}
+
 			log.Info(fmt.Sprintf("Condition %s error", readyCondition))
 			err = fmt.Errorf(errorMsg)
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
-				condition.Reason(ansibleCondition.Reason),
+				condition.Reason(errorReason),
 				condition.SeverityError,
 				readyErrorMessage,
 				err.Error()))