From 911049824fad79ce7e3fc04b743bc0504fcbaf68 Mon Sep 17 00:00:00 2001 From: Ioannis Sfakianakis Date: Wed, 6 Nov 2024 17:30:06 +0200 Subject: [PATCH] Allow download retrying if the computed and configured checksums differ. - We have identified that downloading images under poor networking conditions from datastores might fail silently due to the SDK libraries we use for for Azure and AWS. This happens when we are installing new applications. - Another observation we made is that when we manually try re-installing the applications, they eventually succeed in the installation. - With this patch, we automate retrying the image download a fixed number of times (5 by default) if the computed versus the configure checksums differ. Signed-off-by: Ioannis Sfakianakis --- docs/CONFIG-PROPERTIES.md | 1 + pkg/pillar/cmd/downloader/downloader.go | 13 +++++++++++-- pkg/pillar/cmd/volumemgr/handledownloader.go | 19 +++++++++++++++++++ pkg/pillar/cmd/volumemgr/updatestatus.go | 16 ++++++++++++++++ pkg/pillar/cmd/volumemgr/volumemgr.go | 9 ++++++++- pkg/pillar/types/blob.go | 1 + pkg/pillar/types/downloadertypes.go | 1 + pkg/pillar/types/global.go | 5 +++++ pkg/pillar/types/global_test.go | 1 + 9 files changed, 63 insertions(+), 3 deletions(-) diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md index 3c537516bd..f6c3fa7047 100644 --- a/docs/CONFIG-PROPERTIES.md +++ b/docs/CONFIG-PROPERTIES.md @@ -29,6 +29,7 @@ | timer.port.testbetterinterval | timer in seconds | 600 | test a higher prio port config | | network.fallback.any.eth | "enabled" or "disabled" | disabled (enabled forcefully during onboarding if no network config) | if no connectivity try any Ethernet, WiFi, or LTE with DHCP client | | network.download.max.cost | 0-255 | 0 | [max port cost for download](DEVICE-CONNECTIVITY.md) to avoid e.g., LTE ports | +| blob.download.max.retries | 1-10 | 5 | max download retries when image verification fails.| | debug.enable.usb | boolean | false | allow USB e.g. keyboards on device | | debug.enable.vga | boolean | false | allow VGA console on device | | debug.enable.ssh | authorized ssh key | empty string(ssh disabled) | allow ssh to EVE | diff --git a/pkg/pillar/cmd/downloader/downloader.go b/pkg/pillar/cmd/downloader/downloader.go index 48c16395df..b253d17a20 100644 --- a/pkg/pillar/cmd/downloader/downloader.go +++ b/pkg/pillar/cmd/downloader/downloader.go @@ -398,9 +398,18 @@ func handleModify(ctx *downloaderContext, key string, status.Expired, status.Name) // If RefCount from zero to non-zero and status has error - // or status is not downloaded then do install - if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) { + // or status is not downloaded or retry then do install + if config.RefCount != 0 && (status.HasError() || + status.State != types.DOWNLOADED || config.RetryCount != 0) { log.Functionf("handleModify installing %s", config.Name) + if config.RetryCount != 0 { + log.Functionf("handleModify retry download %s", config.Name) + status.CurrentSize = 0 + status.Size = 0 + status.Progress = 0 + status.State = types.DOWNLOADING + publishDownloaderStatus(ctx, status) + } handleCreate(ctx, config, status, key, receiveChan) } else if status.RefCount != config.RefCount { log.Functionf("handleModify RefCount change %s from %d to %d", diff --git a/pkg/pillar/cmd/volumemgr/handledownloader.go b/pkg/pillar/cmd/volumemgr/handledownloader.go index a51074d091..941f002ad5 100644 --- a/pkg/pillar/cmd/volumemgr/handledownloader.go +++ b/pkg/pillar/cmd/volumemgr/handledownloader.go @@ -64,6 +64,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus) Size: size, Target: locFilename, RefCount: refCount, + RetryCount: 0, } log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n) publishDownloaderConfig(ctx, &n) @@ -105,6 +106,24 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) { log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha) } +func retryDownload(ctx *volumemgrContext, imageSha string) { + m := lookupDownloaderConfig(ctx, imageSha) + if m == nil { + log.Functionf("retryDownload: config missing for %s", + imageSha) + return + } + if m.RefCount == 0 { + log.Fatalf("retryDownload: Attempting to retry when "+ + "RefCount is 0. Image Details - Name: %s, ImageSha: %s, ", + m.Name, m.ImageSha256) + } + m.RetryCount++ + + publishDownloaderConfig(ctx, m) + log.Functionf("retryDownload done for %s", imageSha) +} + func publishDownloaderConfig(ctx *volumemgrContext, config *types.DownloaderConfig) { diff --git a/pkg/pillar/cmd/volumemgr/updatestatus.go b/pkg/pillar/cmd/volumemgr/updatestatus.go index f2def1f7d8..0bab9c49ad 100644 --- a/pkg/pillar/cmd/volumemgr/updatestatus.go +++ b/pkg/pillar/cmd/volumemgr/updatestatus.go @@ -227,6 +227,22 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus) blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob}) leftToProcess = true + if blob.IsErrorSource(types.VerifyImageStatus{}) && blob.RetryCount < blobDownloadMaxRetries { + + // Remove VerifyImage config and retry download + MaybeRemoveVerifyImageConfig(ctx, blobSha) + retryDownload(ctx, blobSha) + + // Increment retry count + blob.RetryCount++ + blob.State = types.DOWNLOADING + // Remove VerifyImageConfig reference + blob.HasVerifierRef = false + blob.ClearErrorWithSource() + + log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries) + publishBlobStatus(ctx, blob) + } } } diff --git a/pkg/pillar/cmd/volumemgr/volumemgr.go b/pkg/pillar/cmd/volumemgr/volumemgr.go index fa1aeea83d..2894e36f41 100644 --- a/pkg/pillar/cmd/volumemgr/volumemgr.go +++ b/pkg/pillar/cmd/volumemgr/volumemgr.go @@ -42,7 +42,10 @@ const ( blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable ) -var volumeFormat = make(map[string]zconfig.Format) +var ( + blobDownloadMaxRetries uint32 = 5 // Unless from GlobalConfig + volumeFormat = make(map[string]zconfig.Format) +) type volumemgrContext struct { agentbase.AgentBase @@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string, gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName, ctx.CLIParams().DebugOverride, logger) if gcp != nil { + // Set max retries for blob download from global config + if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 { + blobDownloadMaxRetries = gcp.GlobalValueInt(types.BlobDownloadMaxRetries) + } maybeUpdateConfigItems(ctx, gcp) ctx.globalConfig = gcp ctx.GCInitialized = true diff --git a/pkg/pillar/types/blob.go b/pkg/pillar/types/blob.go index f0ed8adceb..1475cfbbbf 100644 --- a/pkg/pillar/types/blob.go +++ b/pkg/pillar/types/blob.go @@ -47,6 +47,7 @@ type BlobStatus struct { Progress uint // ErrorAndTimeWithSource provide common error handling capabilities ErrorAndTimeWithSource + RetryCount uint32 } const ( diff --git a/pkg/pillar/types/downloadertypes.go b/pkg/pillar/types/downloadertypes.go index c9ae610425..99a46ed3b9 100644 --- a/pkg/pillar/types/downloadertypes.go +++ b/pkg/pillar/types/downloadertypes.go @@ -23,6 +23,7 @@ type DownloaderConfig struct { Size uint64 // In bytes FinalObjDir string // final Object Store RefCount uint + RetryCount uint } func (config DownloaderConfig) Key() string { diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go index 0c46553337..3190f9c212 100644 --- a/pkg/pillar/types/global.go +++ b/pkg/pillar/types/global.go @@ -220,6 +220,10 @@ const ( // ports for image downloads. DownloadMaxPortCost GlobalSettingKey = "network.download.max.cost" + // BlobDownloadMaxRetries global setting key + // how many times EVE will retry to download a blob if its checksum is not verified + BlobDownloadMaxRetries GlobalSettingKey = "blob.download.max.retries" + // Bool Items // UsbAccess global setting key UsbAccess GlobalSettingKey = "debug.enable.usb" @@ -959,6 +963,7 @@ func NewConfigItemSpecMap() ConfigItemSpecMap { // LogRemainToSendMBytes - Default is 2 Gbytes, minimum is 10 Mbytes configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF) configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255) + configItemSpecMap.AddIntItem(BlobDownloadMaxRetries, 5, 1, 0xFFFFFFFF) // Goroutine Leak Detection section configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF) diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go index 022c0a25d7..21c45b91d5 100644 --- a/pkg/pillar/types/global_test.go +++ b/pkg/pillar/types/global_test.go @@ -177,6 +177,7 @@ func TestNewConfigItemSpecMap(t *testing.T) { ForceFallbackCounter, LogRemainToSendMBytes, DownloadMaxPortCost, + BlobDownloadMaxRetries, // Bool Items UsbAccess, VgaAccess,