Fix ansible job error reason
This change fixes a nil pointer dereference by ensuring the job failures
exceed the defined BackoffLimit before the failure condition is inspected.

Since our logic already determines whether the BackoffLimit has been
exceeded, there is no need to separately check the condition.Reason, which
would tell us the same thing. We can simply infer from that check that the
job failed because the BackoffLimit was reached.

closes OSPRH-11068

Signed-off-by: Fabricio Aguiar <[email protected]>
Co-Authored-by: Brendan Shephard <[email protected]>
fao89 and bshephar committed Oct 31, 2024
1 parent 1276a85 commit aad6dbe
Showing 1 changed file with 17 additions and 15 deletions.
pkg/dataplane/deployment.go

```diff
@@ -195,11 +195,10 @@ func (d *Deployer) ConditionalDeploy(
 
 	}
 
-	var ansibleCondition *batchv1.JobCondition
 	if nsConditions.IsFalse(readyCondition) {
-		var ansibleEE *batchv1.Job
+		var ansibleJob *batchv1.Job
 		_, labelSelector := dataplaneutil.GetAnsibleExecutionNameAndLabels(&foundService, d.Deployment.Name, d.NodeSet.Name)
-		ansibleEE, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
+		ansibleJob, err = dataplaneutil.GetAnsibleExecution(d.Ctx, d.Helper, d.Deployment, labelSelector)
 		if err != nil {
 			// Return nil if we don't have AnsibleEE available yet
 			if k8s_errors.IsNotFound(err) {
@@ -215,33 +214,36 @@ func (d *Deployer) ConditionalDeploy(
 				err.Error()))
 		}
 
-		if ansibleEE.Status.Succeeded > 0 {
+		if ansibleJob.Status.Succeeded > 0 {
 			log.Info(fmt.Sprintf("Condition %s ready", readyCondition))
 			nsConditions.Set(condition.TrueCondition(
 				readyCondition,
 				readyMessage))
-		} else if ansibleEE.Status.Active > 0 {
-			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleEE.Name, ansibleEE.Status.Active))
+		} else if ansibleJob.Status.Active > 0 {
+			log.Info(fmt.Sprintf("AnsibleEE job is not yet completed: Execution: %s, Active pods: %d", ansibleJob.Name, ansibleJob.Status.Active))
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
 				condition.RequestedReason,
 				condition.SeverityInfo,
 				readyWaitingMessage))
-		} else if ansibleEE.Status.Failed > 0 {
-			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleEE.Name, ansibleEE.Namespace, ansibleEE.Status.Failed)
-			for _, condition := range ansibleEE.Status.Conditions {
-				if condition.Type == batchv1.JobFailed {
-					ansibleCondition = &condition
+		} else if ansibleJob.Status.Failed > 1 {
+			errorReason := condition.ErrorReason
+			errorMsg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d", ansibleJob.Name, ansibleJob.Namespace, ansibleJob.Status.Failed)
+
+			if ansibleJob.Status.Failed >= *ansibleJob.Spec.BackoffLimit {
+				for _, condition := range ansibleJob.Status.Conditions {
+					if condition.Type == batchv1.JobFailed {
+						errorReason = batchv1.JobReasonBackoffLimitExceeded
+						errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleJob.Name, ansibleJob.Namespace, condition.Message)
+					}
 				}
 			}
-			if ansibleCondition.Reason == condition.JobReasonBackoffLimitExceeded {
-				errorMsg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s", ansibleEE.Name, ansibleEE.Namespace, ansibleCondition.Message)
-			}
 
 			log.Info(fmt.Sprintf("Condition %s error", readyCondition))
 			err = fmt.Errorf(errorMsg)
 			nsConditions.Set(condition.FalseCondition(
 				readyCondition,
-				condition.Reason(ansibleCondition.Reason),
+				condition.Reason(errorReason),
 				condition.SeverityError,
 				readyErrorMessage,
 				err.Error()))
```
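To make the pattern concrete, here is a minimal, self-contained sketch that is not part of the commit: a hypothetical jobErrorReason helper mirroring the fixed logic, deriving the failure reason from Status.Failed and Spec.BackoffLimit with a safe default instead of dereferencing a condition pointer that may never be set. It uses a plain "Error" string as a stand-in for lib-common's condition.ErrorReason and assumes a k8s.io/api version that defines batchv1.JobReasonBackoffLimitExceeded.

```go
package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// jobErrorReason is a hypothetical helper: the reason defaults to a generic
// "Error" (stand-in for condition.ErrorReason) and is only upgraded to
// BackoffLimitExceeded when the failure count has reached the limit and a
// JobFailed condition exists, so no nil pointer is ever dereferenced.
func jobErrorReason(job *batchv1.Job) (string, string) {
	reason := "Error"
	msg := fmt.Sprintf("execution.name %s execution.namespace %s failed pods: %d",
		job.Name, job.Namespace, job.Status.Failed)

	// Spec.BackoffLimit is *int32; the API server defaults it to 6, but a
	// hand-built Job may leave it nil, so guard the dereference.
	if job.Spec.BackoffLimit != nil && job.Status.Failed >= *job.Spec.BackoffLimit {
		for _, c := range job.Status.Conditions {
			if c.Type == batchv1.JobFailed {
				reason = batchv1.JobReasonBackoffLimitExceeded
				msg = fmt.Sprintf("backoff limit reached for execution.name %s execution.namespace %s execution.condition.message: %s",
					job.Name, job.Namespace, c.Message)
			}
		}
	}
	return reason, msg
}

func main() {
	// Hypothetical Job resembling a failed AnsibleEE execution.
	limit := int32(6)
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "edpm-deploy", Namespace: "openstack"},
		Spec:       batchv1.JobSpec{BackoffLimit: &limit},
		Status: batchv1.JobStatus{
			Failed: 6,
			Conditions: []batchv1.JobCondition{{
				Type:    batchv1.JobFailed,
				Message: "Job has reached the specified backoff limit",
			}},
		},
	}
	reason, msg := jobErrorReason(job)
	fmt.Println(reason+":", msg) // BackoffLimitExceeded: backoff limit reached for ...
}
```

The design point is that every path assigns reason a value up front, so the later use can never hit a nil pointer, which is exactly the guarantee the removed ansibleCondition.Reason access lacked.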
