From aad6dbec28d58e05dc5aa29caf4d116b7104155a Mon Sep 17 00:00:00 2001
From: Fabricio Aguiar
Date: Wed, 30 Oct 2024 16:03:59 +0000
Subject: [PATCH] Fix ansible job error reason

This change fixes a nil pointer dereference by checking whether the
number of job failures has reached the defined BackoffLimit instead
of dereferencing a JobFailed condition that may not exist. Since our
logic already determines whether the BackoffLimit has been exceeded,
there is no need to check the condition.Reason separately; it would
only tell us the same thing. We can simply infer from our own check
that the job failed because the BackoffLimit was reached.
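
For illustration, the new failure handling has roughly the shape
below. This is a simplified sketch, not the code in the diff: the
helper name jobFailureReason and the plain "Error" fallback reason
are invented for the example, standing in for the operator's
condition.ErrorReason.

// Sketch only: derive the failure reason of an AnsibleEE batch Job
// without dereferencing a JobCondition pointer that may be nil.
package example

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
)

// jobFailureReason is a hypothetical helper, not operator code.
func jobFailureReason(job *batchv1.Job) (string, string) {
	// Generic fallback; the operator uses condition.ErrorReason here.
	reason := "Error"
	msg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d",
		job.Name, job.Namespace, job.Status.Failed)

	// Consult the JobFailed condition only once the failure count has
	// reached the configured backoff limit, and read only its Message.
	if job.Spec.BackoffLimit != nil && job.Status.Failed >= *job.Spec.BackoffLimit {
		for _, c := range job.Status.Conditions {
			if c.Type == batchv1.JobFailed {
				reason = batchv1.JobReasonBackoffLimitExceeded
				msg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s",
					job.Name, job.Namespace, c.Message)
			}
		}
	}
	return reason, msg
}

Because the backoff-limit decision comes from the counters, the
Reason never depends on a pointer that might still be nil.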

closes OSPRH-11068
Signed-off-by: Fabricio Aguiar
Co-Authored-by: Brendan Shephard
---
 pkg/dataplane/deployment.go | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/pkg/dataplane/deployment.go b/pkg/dataplane/deployment.go
index 1483a9b9e..91c37247f 100644
--- a/pkg/dataplane/deployment.go
+++ b/pkg/dataplane/deployment.go
@@ -195,11 +195,10 @@ func (d *Deployer) ConditionalDeploy(
 	}
 
-	var ansibleCondition *batchv1.JobCondition
 	if nsConditions.IsFalse(readyCondition) {
-		var ansibleEE *batchv1.Job
+		var ansibleJob *batchv1.Job
 		_, labelSelector := dataplaneutil.GetAnsibleExecutionNameAndLabels(&foundService, d.Deployment.Name, d.NodeSet.Name)
-		ansibleEE, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
+		ansibleJob, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
 		if err != nil {
 			// Return nil if we don't have AnsibleEE available yet
 			if k8s_errors.IsNotFound(err) {
@@ -215,33 +214,36 @@
 				err.Error()))
 		}
 
-		if ansibleEE.Status.Succeeded > 0 {
+		if ansibleJob.Status.Succeeded > 0 {
 			log.Info(fmt.Sprintf("Condition %s ready", readyCondition))
 			nsConditions.Set(condition.TrueCondition(
 				readyCondition,
 				readyMessage))
-		} else if ansibleEE.Status.Active > 0 {
-			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleEE.Name, ansibleEE.Status.Active))
+		} else if ansibleJob.Status.Active > 0 {
+			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleJob.Name, ansibleJob.Status.Active))
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
 				condition.RequestedReason,
 				condition.SeverityInfo,
 				readyWaitingMessage))
-		} else if ansibleEE.Status.Failed > 0 {
-			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleEE.Name, ansibleEE.Namespace, ansibleEE.Status.Failed)
-			for _, condition := range ansibleEE.Status.Conditions {
-				if condition.Type == batchv1.JobFailed {
-					ansibleCondition = &condition
+		} else if ansibleJob.Status.Failed > 1 {
+			errorReason := condition.ErrorReason
+			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleJob.Name, ansibleJob.Namespace, ansibleJob.Status.Failed)
+
+			if ansibleJob.Status.Failed >= *ansibleJob.Spec.BackoffLimit {
+				for _, condition := range ansibleJob.Status.Conditions {
+					if condition.Type == batchv1.JobFailed {
+						errorReason = batchv1.JobReasonBackoffLimitExceeded
+						errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleJob.Name, ansibleJob.Namespace, condition.Message)
+					}
 				}
 			}
-			if ansibleCondition.Reason == condition.JobReasonBackoffLimitExceeded {
-				errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleEE.Name, ansibleEE.Namespace, ansibleCondition.Message)
-			}
+
 			log.Info(fmt.Sprintf("Condition %s error", readyCondition))
 			err = fmt.Errorf(errorMsg)
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
-				condition.Reason(ansibleCondition.Reason),
+				condition.Reason(errorReason),
 				condition.SeverityError,
 				readyErrorMessage,
 				err.Error()))