Skip to content

Commit

Permalink
Allow download retrying if the computed and configured checksums differ.
Browse files Browse the repository at this point in the history
- We have identified that downloading images from datastores under poor
networking conditions might fail silently due to the SDK libraries we use
for Azure and AWS. This happens when we are installing new applications.
- Another observation we made is that when we manually try re-installing
the applications, they eventually succeed in the installation.
- With this patch, we automate retrying the image download a fixed number
of times (5 by default) if the computed and the configured checksums differ.

Signed-off-by: Ioannis Sfakianakis <[email protected]>
  • Loading branch information
jsfakian committed Dec 3, 2024
1 parent 0b67531 commit c8cf95f
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 3 deletions.
13 changes: 11 additions & 2 deletions pkg/pillar/cmd/downloader/downloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,18 @@ func handleModify(ctx *downloaderContext, key string,
status.Expired, status.Name)

// If RefCount from zero to non-zero and status has error
// or status is not downloaded then do install
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) {
// or status is not downloaded or retry then do install
if config.RefCount != 0 && (status.HasError() ||
status.State != types.DOWNLOADED || config.DoRetry) {
log.Functionf("handleModify installing %s", config.Name)
if config.DoRetry {
log.Functionf("handleModify retry download %s", config.Name)
status.CurrentSize = 0
status.Size = 0
status.Progress = 0
status.State = types.DOWNLOADING
publishDownloaderStatus(ctx, status)
}
handleCreate(ctx, config, status, key, receiveChan)
} else if status.RefCount != config.RefCount {
log.Functionf("handleModify RefCount change %s from %d to %d",
Expand Down
19 changes: 19 additions & 0 deletions pkg/pillar/cmd/volumemgr/handledownloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
Size: size,
Target: locFilename,
RefCount: refCount,
DoRetry: false,
}
log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n)
publishDownloaderConfig(ctx, &n)
Expand Down Expand Up @@ -105,6 +106,24 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha)
}

// retryDownload asks the downloader to fetch the blob identified by imageSha
// again. It looks up the existing DownloaderConfig, sets its DoRetry flag and
// republishes the config.
//
// If no config exists for imageSha the call is logged and ignored. A config
// with a zero RefCount is treated as a fatal inconsistency.
//
// NOTE(review): nothing visible in this function clears DoRetry once the retry
// has been issued; confirm it is reset elsewhere, otherwise a later retry would
// republish an unchanged config and unrelated config modifications could
// re-trigger a download.
func retryDownload(ctx *volumemgrContext, imageSha string) {
	cfg := lookupDownloaderConfig(ctx, imageSha)
	switch {
	case cfg == nil:
		// Nothing to retry: the downloader was never configured for this blob.
		log.Functionf("retryDownload: config missing for %s",
			imageSha)
		return
	case cfg.RefCount == 0:
		// A retry only makes sense while somebody still references the blob.
		log.Fatalf("retryDownload: Attempting to retry when "+
			"RefCount is 0. Image Details - Name: %s, ImageSha: %s, ",
			cfg.Name, cfg.ImageSha256)
	}

	// Flag the config and republish it so the downloader sees the request.
	cfg.DoRetry = true
	publishDownloaderConfig(ctx, cfg)

	log.Functionf("retryDownload done for %s", imageSha)
}

func publishDownloaderConfig(ctx *volumemgrContext,
config *types.DownloaderConfig) {

Expand Down
16 changes: 16 additions & 0 deletions pkg/pillar/cmd/volumemgr/updatestatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,22 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob})

leftToProcess = true
if blob.RetryCount < blobDownloadMaxRetries {

// Remove VerifyImage config and retry download
MaybeRemoveVerifyImageConfig(ctx, blobSha)
retryDownload(ctx, blobSha)

// Increment retry count
blob.RetryCount++
blob.State = types.DOWNLOADING
// Remove VerifyImageConfig reference
blob.HasVerifierRef = false
blob.ClearErrorWithSource()

log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries)
publishBlobStatus(ctx, blob)
}
}
}

Expand Down
9 changes: 8 additions & 1 deletion pkg/pillar/cmd/volumemgr/volumemgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ const (
blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable
)

var volumeFormat = make(map[string]zconfig.Format)
var (
// blobDownloadMaxRetries is the upper bound compared against a blob's
// RetryCount before another download retry is attempted. It starts at 5 and
// is overwritten from the BlobDownloadMaxRetries global setting whenever
// that setting is non-zero.
blobDownloadMaxRetries = 5 // Unless from GlobalConfig
// volumeFormat maps a string key to a zconfig.Format.
// NOTE(review): the meaning of the key is not visible here — confirm against callers.
volumeFormat = make(map[string]zconfig.Format)
)

type volumemgrContext struct {
agentbase.AgentBase
Expand Down Expand Up @@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName,
ctx.CLIParams().DebugOverride, logger)
if gcp != nil {
// Set max retries for blob download from global config
if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 {
blobDownloadMaxRetries = int(gcp.GlobalValueInt(types.BlobDownloadMaxRetries))
}
maybeUpdateConfigItems(ctx, gcp)
ctx.globalConfig = gcp
ctx.GCInitialized = true
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type BlobStatus struct {
Progress uint
// ErrorAndTimeWithSource provide common error handling capabilities
ErrorAndTimeWithSource
RetryCount int
}

const (
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/downloadertypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type DownloaderConfig struct {
Size uint64 // In bytes
FinalObjDir string // final Object Store
RefCount uint
DoRetry bool
}

func (config DownloaderConfig) Key() string {
Expand Down
5 changes: 5 additions & 0 deletions pkg/pillar/types/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ const (
// ports for image downloads.
DownloadMaxPortCost GlobalSettingKey = "network.download.max.cost"

// BlobDownloadMaxRetries global setting key
// how many times EVE will retry to download a blob if its checksum is not verified
BlobDownloadMaxRetries GlobalSettingKey = "blob.download.max.retries"

// Bool Items
// UsbAccess global setting key
UsbAccess GlobalSettingKey = "debug.enable.usb"
Expand Down Expand Up @@ -948,6 +952,7 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
// LogRemainToSendMBytes - Default is 2 Gbytes, minimum is 10 Mbytes
configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF)
configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255)
configItemSpecMap.AddIntItem(BlobDownloadMaxRetries, 5, 1, 10)

// Goroutine Leak Detection section
configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF)
Expand Down

0 comments on commit c8cf95f

Please sign in to comment.