From 8b6a90d9f9cd68cb3b661d7cd0e89326a28f3377 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 09:59:41 -0700 Subject: [PATCH 01/13] adding logic to print failures and retry if there is an cloud-init error --- .../scripts/synchronize-repos.sh | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 8ea2c50dbca4..aedba103e184 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -99,12 +99,40 @@ synchronize_repos() { esac } +# Function to check cloud-init status and retry on failure # Before we start to modify repositories and install packages we'll wait for cloud-init to finish # so it doesn't race with any of our package installations. -# We run as sudo becase Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as +# We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). -sudo cloud-init status --wait +check_cloud_init() { + local max_retries=2 + local retry_count=0 + local exit_code + while [[ $retry_count -lt $max_retries ]]; do + if sudo cloud-init status --wait; then + echo "Cloud-init completed successfully" + return 0 + else + exit_code=$? + case $exit_code in + 1) + echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 + ;; + 2) + echo "Cloud-init completed successfully, but with errors. Exit code: $exit_code" 1>&2 + exit_code=0 + ;; + esac + echo "There were errors when executing cloud-init. Here are the logs for the failure:" + cat /var/log/cloud-init-* | grep "Failed" + retry_count=$((retry_count + 1)) + fi + done + return $exit_code +} + +check_cloud_init begin_time=$(date +%s) end_time=$((begin_time + TIMEOUT_SECONDS)) while [ "$(date +%s)" -lt "$end_time" ]; do @@ -116,3 +144,4 @@ while [ "$(date +%s)" -lt "$end_time" ]; do done fail "Timed out waiting for distro repos to be set up" + From 968fb8745c1a25c3c4aa14bbf73fa589b0d43107 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 10:22:18 -0700 Subject: [PATCH 02/13] adding logic to print failures and retry if there is an cloud-init error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index aedba103e184..ff97ab744b58 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -132,7 +132,12 @@ check_cloud_init() { return $exit_code } +# Checking cloud-init check_cloud_init +if [ $? -eq 1 ]; then + exit 1 +fi + begin_time=$(date +%s) end_time=$((begin_time + TIMEOUT_SECONDS)) while [ "$(date +%s)" -lt "$end_time" ]; do From b3fcaa530bb00dac296a985b6606ae48ca98d27b Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 10:44:12 -0700 Subject: [PATCH 03/13] fixing timeout error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index ff97ab744b58..32dbc021addb 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,7 +105,7 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). check_cloud_init() { - local max_retries=2 + local max_retries=1 local retry_count=0 local exit_code From 19b2c1361d5c300311992f36feffda22ff186847 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 11:09:46 -0700 Subject: [PATCH 04/13] fixing timeout error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 32dbc021addb..c5a594d458a7 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -133,6 +133,7 @@ check_cloud_init() { } # Checking cloud-init +echo $? check_cloud_init if [ $? -eq 1 ]; then exit 1 @@ -140,7 +141,9 @@ fi begin_time=$(date +%s) end_time=$((begin_time + TIMEOUT_SECONDS)) +echo "--begin---${begin_time}-----end--${end_time}-----$?" while [ "$(date +%s)" -lt "$end_time" ]; do + echo "in while loop------" if synchronize_repos; then exit 0 fi From 9f962fc0dc35bbaf6807e030476855165b0a1811 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 12:06:16 -0700 Subject: [PATCH 05/13] fixing timeout error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index c5a594d458a7..ec420b292bde 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,7 +105,7 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). check_cloud_init() { - local max_retries=1 + local max_retries=0 local retry_count=0 local exit_code @@ -133,7 +133,6 @@ check_cloud_init() { } # Checking cloud-init -echo $? check_cloud_init if [ $? -eq 1 ]; then exit 1 @@ -141,9 +140,7 @@ fi begin_time=$(date +%s) end_time=$((begin_time + TIMEOUT_SECONDS)) -echo "--begin---${begin_time}-----end--${end_time}-----$?" while [ "$(date +%s)" -lt "$end_time" ]; do - echo "in while loop------" if synchronize_repos; then exit 0 fi From 51f455991bc44b5f877b36c7f08f4f920cb2c3d8 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 12:29:03 -0700 Subject: [PATCH 06/13] fixing timeout error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index ec420b292bde..32dbc021addb 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,7 +105,7 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). check_cloud_init() { - local max_retries=0 + local max_retries=1 local retry_count=0 local exit_code From aa03c9515388affe294a3981d8244efc70a6bec9 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Fri, 4 Oct 2024 14:40:11 -0700 Subject: [PATCH 07/13] fixing timeout error --- enos/modules/install_packages/scripts/synchronize-repos.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 32dbc021addb..8e2b4ca10be5 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -149,4 +149,3 @@ while [ "$(date +%s)" -lt "$end_time" ]; do done fail "Timed out waiting for distro repos to be set up" - From 18c7a085800158619def0f575979d71c1c0d7bca Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Mon, 7 Oct 2024 08:59:50 -0700 Subject: [PATCH 08/13] updating retry to 2 --- enos/modules/install_packages/scripts/synchronize-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 8e2b4ca10be5..83747dbdbe4d 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,7 +105,7 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). check_cloud_init() { - local max_retries=1 + local max_retries=2 local retry_count=0 local exit_code From 08021c7b61bc3d496715ef1f26fd75b216509c2b Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Tue, 8 Oct 2024 09:29:26 -0700 Subject: [PATCH 09/13] updating cloud init status logic --- .../scripts/synchronize-repos.sh | 37 +++++-------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 83747dbdbe4d..4fca75495294 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,39 +105,20 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). check_cloud_init() { - local max_retries=2 - local retry_count=0 - local exit_code - - while [[ $retry_count -lt $max_retries ]]; do - if sudo cloud-init status --wait; then - echo "Cloud-init completed successfully" - return 0 - else - exit_code=$? - case $exit_code in - 1) - echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 - ;; - 2) - echo "Cloud-init completed successfully, but with errors. Exit code: $exit_code" 1>&2 - exit_code=0 - ;; - esac - echo "There were errors when executing cloud-init. Here are the logs for the failure:" - cat /var/log/cloud-init-* | grep "Failed" - retry_count=$((retry_count + 1)) - fi - done - return $exit_code + sudo cloud-init status --wait + exit_code=$? + if [ "$?" -ne 0 ] && [ "$?" -ne 2 ]; then + echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 + echo "Here are the logs for the failure:" + cat /var/log/cloud-init-* | grep "Failed" + exit 1 + fi } # Checking cloud-init check_cloud_init -if [ $? -eq 1 ]; then - exit 1 -fi +# Synchronizing repos begin_time=$(date +%s) end_time=$((begin_time + TIMEOUT_SECONDS)) while [ "$(date +%s)" -lt "$end_time" ]; do From c93b2490ff68a735eee8f4f68decdfea521f90f6 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Tue, 8 Oct 2024 11:31:58 -0700 Subject: [PATCH 10/13] updating cloud init status logic --- enos/modules/install_packages/scripts/synchronize-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 4fca75495294..50328123b8b9 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -107,7 +107,7 @@ synchronize_repos() { check_cloud_init() { sudo cloud-init status --wait exit_code=$? - if [ "$?" -ne 0 ] && [ "$?" -ne 2 ]; then + if [ "$exit_code" -ne 0 ] && [ "$exit_code" -ne 2 ]; then echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 echo "Here are the logs for the failure:" cat /var/log/cloud-init-* | grep "Failed" From 03e5244f8f8b79361a152a938cd8652f25263d72 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Wed, 20 Nov 2024 12:04:52 -0800 Subject: [PATCH 11/13] addressing comments --- .../install_packages/scripts/synchronize-repos.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 50328123b8b9..de9c5315cc17 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -104,19 +104,19 @@ synchronize_repos() { # so it doesn't race with any of our package installations. # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). -check_cloud_init() { +wait_for_cloud_init() { sudo cloud-init status --wait exit_code=$? + # We are not failing exit status 2 because cloud-init is up to date if [ "$exit_code" -ne 0 ] && [ "$exit_code" -ne 2 ]; then echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 echo "Here are the logs for the failure:" - cat /var/log/cloud-init-* | grep "Failed" - exit 1 + cat /var/log/cloud-init-* fi } -# Checking cloud-init -check_cloud_init +# Wait for cloud-init +wait_for_cloud_init # Synchronizing repos begin_time=$(date +%s) From 36758dcbbd4cee66826866ced6adcaa83939994a Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Thu, 21 Nov 2024 10:30:03 -0800 Subject: [PATCH 12/13] addressing comments --- .../scripts/synchronize-repos.sh | 35 ++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index de9c5315cc17..71d5706eb9c5 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -105,14 +105,33 @@ synchronize_repos() { # We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as # non-root user (known bug). wait_for_cloud_init() { - sudo cloud-init status --wait - exit_code=$? - # We are not failing exit status 2 because cloud-init is up to date - if [ "$exit_code" -ne 0 ] && [ "$exit_code" -ne 2 ]; then - echo "cloud-init did not complete successfully. Exit code: $exit_code" 1>&2 - echo "Here are the logs for the failure:" - cat /var/log/cloud-init-* - fi + output=$(sudo cloud-init status --wait) + res=$? + case $res in + 0) + return 0 + ;; + 2) + { + echo "WARNING: cloud-init did not complete successfully but recovered." + echo "Exit code: $res" + echo "Output: $output" + echo "Here are the logs for the failure:" + cat /var/log/cloud-init-* + } 1>&2 + return 0 + ;; + *) + { + echo "cloud-init did not complete successfully." + echo "Exit code: $res" + echo "Output: $output" + echo "Here are the logs for the failure:" + cat /var/log/cloud-init-* + } 1>&2 + return 1 + ;; + esac } # Wait for cloud-init From 20111e43e4a88eff133ddef83207ac7ee34460f4 Mon Sep 17 00:00:00 2001 From: "tin.vo" Date: Thu, 21 Nov 2024 10:40:08 -0800 Subject: [PATCH 13/13] fixing error from sync scriot --- enos/modules/install_packages/scripts/synchronize-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/modules/install_packages/scripts/synchronize-repos.sh b/enos/modules/install_packages/scripts/synchronize-repos.sh index 71d5706eb9c5..eac37258fc70 100644 --- a/enos/modules/install_packages/scripts/synchronize-repos.sh +++ b/enos/modules/install_packages/scripts/synchronize-repos.sh @@ -130,7 +130,7 @@ wait_for_cloud_init() { cat /var/log/cloud-init-* } 1>&2 return 1 - ;; + ;; esac }