Skip to content

Commit

Permalink
Merge pull request #92 from Azure/bankiel-zypper-retries
Browse files Browse the repository at this point in the history
Add retries for zypper package manager repo_refresh
  • Loading branch information
benank authored Sep 21, 2021
2 parents 24e61ac + 8db03d4 commit e875112
Show file tree
Hide file tree
Showing 15 changed files with 123 additions and 26 deletions.
3 changes: 2 additions & 1 deletion src/core/src/bootstrap/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,11 @@ class AutoAssessmentStates(EnumBackport):

UNKNOWN_PACKAGE_SIZE = "Unknown"
PACKAGE_STATUS_REFRESH_RATE_IN_SECONDS = 10
MAX_FILE_OPERATION_RETRY_COUNT = 5
MAX_FILE_OPERATION_RETRY_COUNT = 10
MAX_ASSESSMENT_RETRY_COUNT = 5
MAX_INSTALLATION_RETRY_COUNT = 3
MAX_IMDS_CONNECTION_RETRY_COUNT = 5
MAX_ZYPPER_REPO_REFRESH_RETRY_COUNT = 5

class PackageClassification(EnumBackport):
UNCLASSIFIED = 'Unclassified'
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/bootstrap/EnvLayer.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def open(self, file_path, mode):
try:
return open(real_path, mode)
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to open {0} (retries exhausted). Error: {1}.".format(str(real_path), repr(error)))
Expand Down Expand Up @@ -310,7 +310,7 @@ def read_with_retry(self, file_path_or_handle):
self.__write_record(operation, code=0, output=value, delay=0)
return value
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to read from {0} (retries exhausted). Error: {1}.".format(str(file_path_or_handle), repr(error)))
Expand All @@ -327,7 +327,7 @@ def write_with_retry(self, file_path_or_handle, data, mode='a+'):
file_handle.write(str(data))
break
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to write to {0} (retries exhausted). Error: {1}.".format(str(file_handle.name), repr(error)))
Expand All @@ -346,7 +346,7 @@ def write_with_retry_using_temp_file(file_path, data, mode='w'):
shutil.move(tempname, file_path)
break
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to write to {0} (retries exhausted). Error: {1}.".format(str(file_path), repr(error)))
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/core_logic/PatchAssessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def start_assessment(self):
self.status_handler.set_assessment_substatus_json(status=Constants.STATUS_SUCCESS)
break
except Exception as error:
if i < Constants.MAX_ASSESSMENT_RETRY_COUNT:
if i < Constants.MAX_ASSESSMENT_RETRY_COUNT - 1:
error_msg = 'Retryable error retrieving available patches: ' + repr(error)
self.composite_logger.log_warning(error_msg)
self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
Expand Down
4 changes: 4 additions & 0 deletions src/core/src/core_logic/RebootManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ def start_reboot_if_required_and_time_available(self, current_time_available):
""" Starts a reboot if required. Happens only at the end of the run if required. """
self.composite_logger.log("\nReboot Management")
reboot_pending = False if not self.status_handler else self.status_handler.is_reboot_pending
reboot_pending = self.package_manager.force_reboot or reboot_pending

if self.package_manager.force_reboot:
self.composite_logger.log("Reboot is occurring to mitigate an issue with the package manager.")

# return if never
if self.reboot_setting == Constants.REBOOT_NEVER:
Expand Down
1 change: 1 addition & 0 deletions src/core/src/package_managers/PackageManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ
self.single_package_upgrade_cmd = ''
self.single_package_upgrade_simulation_cmd = 'simulate-install'
self.package_manager_settings = {}
self.force_reboot = False

# Enabling caching for high performance retrieval (only for code explicitly requesting it)
self.all_updates_cached = []
Expand Down
27 changes: 26 additions & 1 deletion src/core/src/package_managers/ZypperPackageManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

"""ZypperPackageManager for SUSE"""
import re
import time
from core.src.package_managers.PackageManager import PackageManager
from core.src.bootstrap.Constants import Constants

Expand Down Expand Up @@ -54,7 +55,31 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ
def refresh_repo(self):
self.composite_logger.log("Refreshing local repo...")
# self.invoke_package_manager(self.repo_clean) # purges local metadata for rebuild - addresses a possible customer environment error
self.invoke_package_manager(self.repo_refresh)
for i in range(0, Constants.MAX_ZYPPER_REPO_REFRESH_RETRY_COUNT):
try:
self.invoke_package_manager(self.repo_refresh)
return
except Exception as error:
if i < Constants.MAX_ZYPPER_REPO_REFRESH_RETRY_COUNT - 1:
self.composite_logger.log_warning("Exception on package manager refresh repo. [Exception={0}] [RetryCount={1}]".format(repr(error), str(i)))
time.sleep(pow(2, i) + 1)
else:
if Constants.ERROR_ADDED_TO_STATUS in repr(error):
error.args = error.args[:1] # remove Constants.ERROR_ADDED_TO_STATUS flag to add new message to status

error_msg = "Unable to refresh repo (retries exhausted). [{0}] [RetryCount={1}]".format(repr(error), str(i))

# Reboot if not already done
if self.status_handler.get_installation_reboot_status() == Constants.RebootStatus.COMPLETED:
error_msg = "Unable to refresh repo (retries exhausted after reboot). [{0}] [RetryCount={1}]".format(repr(error), str(i))
else:
self.composite_logger.log_warning("Setting force_reboot flag to True.")
self.force_reboot = True

self.composite_logger.log_warning(error_msg)
self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.PACKAGE_MANAGER_FAILURE)

raise Exception(error_msg)

# region Get Available Updates
def invoke_package_manager(self, command):
Expand Down
6 changes: 3 additions & 3 deletions src/core/src/service_interfaces/LifecycleManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def read_extension_sequence(self):
with self.env_layer.file_system.open(self.ext_state_file_path, mode="r") as file_handle:
return json.load(file_handle)['extensionSequence']
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
self.composite_logger.log_warning("Exception on extension sequence read. [Exception={0}] [RetryCount={1}]".format(repr(error), str(i)))
time.sleep(i+1)
else:
Expand Down Expand Up @@ -115,7 +115,7 @@ def read_core_sequence(self):

return core_sequence
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
self.composite_logger.log_warning("Exception on core sequence read. [Exception={0}] [RetryCount={1}]".format(repr(error), str(i)))
time.sleep(i + 1)
else:
Expand Down Expand Up @@ -145,7 +145,7 @@ def update_core_sequence(self, completed=False):
with self.env_layer.file_system.open(self.core_state_file_path, 'w+') as file_handle:
file_handle.write(core_state_payload)
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
self.composite_logger.log_warning("Exception on core sequence update. [Exception={0}] [RetryCount={1}]".format(repr(error), str(i)))
time.sleep(i + 1)
else:
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/service_interfaces/StatusHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def __load_status_file_components(self, initial_load=False):
with self.env_layer.file_system.open(self.status_file_path, 'r') as file_handle:
status_file_data_raw = json.load(file_handle)[0] # structure is array of 1
except Exception as error:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT:
if i < Constants.MAX_FILE_OPERATION_RETRY_COUNT - 1:
time.sleep(i + 1)
else:
self.composite_logger.log_error("Unable to read status file (retries exhausted). Error: {0}.".format(repr(error)))
Expand Down
73 changes: 72 additions & 1 deletion src/core/tests/Test_CoreMain.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def test_assessment_operation_fail(self):
self.assertEquals(len(substatus_file_data), 2)
self.assertTrue(substatus_file_data[0]["name"] == Constants.PATCH_ASSESSMENT_SUMMARY)
self.assertTrue(substatus_file_data[0]["status"].lower() == Constants.STATUS_ERROR.lower())
self.assertEqual(len(json.loads(substatus_file_data[0]["formattedMessage"]["message"])["errors"]["details"]), 2)
self.assertEqual(len(json.loads(substatus_file_data[0]["formattedMessage"]["message"])["errors"]["details"]), 5)
self.assertTrue(substatus_file_data[1]["name"] == Constants.CONFIGURE_PATCHING_SUMMARY)
self.assertTrue(substatus_file_data[1]["status"].lower() == Constants.STATUS_SUCCESS.lower())
runtime.stop()
Expand Down Expand Up @@ -753,6 +753,77 @@ def test_auto_assessment_success_with_installation_in_prev_operation_on_same_seq

runtime.stop()

def test_assessment_operation_fail_after_package_manager_reboot(self):
argument_composer = ArgumentComposer()
argument_composer.operation = Constants.ASSESSMENT
runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.ZYPPER)
runtime.set_legacy_test_type('ExceptionPath')
CoreMain(argument_composer.get_composed_arguments())

# check telemetry events
self.__check_telemetry_events(runtime)

# mock rebooting
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.REQUIRED)
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED)
runtime.status_handler.is_reboot_pending = False
runtime.package_manager.force_reboot = False
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.COMPLETED)

# run coremain again
CoreMain(argument_composer.get_composed_arguments())

# check telemetry events
self.__check_telemetry_events(runtime)

# check status file
with runtime.env_layer.file_system.open(runtime.execution_config.status_file_path, 'r') as file_handle:
substatus_file_data = json.load(file_handle)[0]["status"]["substatus"]
self.assertEquals(len(substatus_file_data), 3)
self.assertTrue(substatus_file_data[0]["name"] == Constants.PATCH_ASSESSMENT_SUMMARY)
self.assertTrue(substatus_file_data[0]["status"].lower() == Constants.STATUS_ERROR.lower())
self.assertTrue(substatus_file_data[1]["name"] == Constants.PATCH_INSTALLATION_SUMMARY)
self.assertTrue(substatus_file_data[1]["status"].lower() == Constants.STATUS_TRANSITIONING.lower())
self.assertTrue(substatus_file_data[2]["name"] == Constants.CONFIGURE_PATCHING_SUMMARY)
self.assertTrue(substatus_file_data[2]["status"].lower() == Constants.STATUS_SUCCESS.lower())
runtime.stop()

def test_assessment_operation_success_after_package_manager_reboot(self):
argument_composer = ArgumentComposer()
argument_composer.operation = Constants.ASSESSMENT
runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.ZYPPER)
runtime.set_legacy_test_type('ExceptionPath')
CoreMain(argument_composer.get_composed_arguments())

# check telemetry events
self.__check_telemetry_events(runtime)

# mock rebooting
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.REQUIRED)
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED)
runtime.status_handler.is_reboot_pending = False
runtime.package_manager.force_reboot = False
runtime.status_handler.set_installation_reboot_status(Constants.RebootStatus.COMPLETED)

# run coremain again, but with success path this time
runtime.set_legacy_test_type('SuccessInstallPath')
CoreMain(argument_composer.get_composed_arguments())

# check telemetry events
self.__check_telemetry_events(runtime)

# check status file
with runtime.env_layer.file_system.open(runtime.execution_config.status_file_path, 'r') as file_handle:
substatus_file_data = json.load(file_handle)[0]["status"]["substatus"]
self.assertEquals(len(substatus_file_data), 3)
self.assertTrue(substatus_file_data[0]["name"] == Constants.PATCH_ASSESSMENT_SUMMARY)
self.assertTrue(substatus_file_data[0]["status"].lower() == Constants.STATUS_SUCCESS.lower())
self.assertTrue(substatus_file_data[1]["name"] == Constants.PATCH_INSTALLATION_SUMMARY)
self.assertTrue(substatus_file_data[1]["status"].lower() == Constants.STATUS_TRANSITIONING.lower())
self.assertTrue(substatus_file_data[2]["name"] == Constants.CONFIGURE_PATCHING_SUMMARY)
self.assertTrue(substatus_file_data[2]["status"].lower() == Constants.STATUS_SUCCESS.lower())
runtime.stop()

def __check_telemetry_events(self, runtime):
all_events = os.listdir(runtime.telemetry_writer.events_folder_path)
self.assertTrue(len(all_events) > 0)
Expand Down
7 changes: 2 additions & 5 deletions src/core/tests/Test_LifecycleManagerArc.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,7 @@ def test_read_extension_sequence_fail(self):
# file open throws exception
self.lifecycle_manager.ext_state_file_path = old_ext_state_file_path
self.runtime.env_layer.file_system.open = self.mock_file_open_throw_exception
ext_state_json = self.lifecycle_manager.read_extension_sequence()
self.assertEquals(ext_state_json, None)

self.assertRaises(Exception, self.lifecycle_manager.read_extension_sequence)

def test_read_extension_sequence_success(self):
ext_state_json = self.lifecycle_manager.read_extension_sequence()
Expand All @@ -66,8 +64,7 @@ def test_read_extension_sequence_success(self):
def test_read_core_sequence_fail(self):
# file open throws exception
self.runtime.env_layer.file_system.open = self.mock_file_open_throw_exception
core_sequence_json = self.lifecycle_manager.read_core_sequence()
self.assertEquals(core_sequence_json, None)
self.assertRaises(Exception, self.lifecycle_manager.read_core_sequence)

def test_read_core_sequence_success(self):
old_core_state_file_path = self.lifecycle_manager.core_state_file_path
Expand Down
6 changes: 2 additions & 4 deletions src/core/tests/Test_LifecycleManagerAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ def test_read_extension_sequence_fail(self):
# file open throws exception
self.lifecycle_manager.ext_state_file_path = old_ext_state_file_path
self.runtime.env_layer.file_system.open = self.mock_file_open_throw_exception
ext_state_json = self.lifecycle_manager.read_extension_sequence()
self.assertEquals(ext_state_json, None)
self.assertRaises(Exception, self.lifecycle_manager.read_extension_sequence)

def test_read_extension_sequence_success(self):
ext_state_json = self.lifecycle_manager.read_extension_sequence()
Expand All @@ -62,8 +61,7 @@ def test_read_extension_sequence_success(self):
def test_read_core_sequence_fail(self):
# file open throws exception
self.runtime.env_layer.file_system.open = self.mock_file_open_throw_exception
core_sequence_json = self.lifecycle_manager.read_core_sequence()
self.assertEquals(core_sequence_json, None)
self.assertRaises(Exception, self.lifecycle_manager.read_core_sequence)

def test_read_core_sequence_success(self):
old_core_state_file_path = self.lifecycle_manager.core_state_file_path
Expand Down
2 changes: 1 addition & 1 deletion src/core/tests/Test_PatchAssessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_get_all_security_updates_fail(self):
def test_assessment_fail_with_status_update(self):
self.runtime.package_manager.refresh_repo = self.mock_refresh_repo
self.runtime.set_legacy_test_type('UnalignedPath')
self.runtime.patch_assessor.start_assessment()
self.assertRaises(Exception, self.runtime.patch_assessor.start_assessment)
with open(self.runtime.execution_config.status_file_path, 'r') as file_handle:
file_contents = json.loads(file_handle.read())
self.assertTrue('Unexpected return code (100) from package manager on command: LANG=en_US.UTF8 sudo apt-get -s dist-upgrade' in str(file_contents))
Expand Down
2 changes: 1 addition & 1 deletion src/extension/src/ActionHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def copy_config_files(self, src, dst, raise_if_not_copied=False):
shutil.copy(file_to_copy, dst)
break
except Exception as error:
if i < Constants.MAX_IO_RETRIES:
if i < Constants.MAX_IO_RETRIES - 1:
time.sleep(i + 1)
else:
error_msg = "Failed to copy file after {0} tries. [Source={1}] [Destination={2}] [Exception={3}]".format(Constants.MAX_IO_RETRIES, str(file_to_copy), str(dst), repr(error))
Expand Down
2 changes: 1 addition & 1 deletion src/extension/src/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class TelemetryEventLevel(EnumBackport):
UTC_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Re-try limit for file operations
MAX_IO_RETRIES = 5
MAX_IO_RETRIES = 10

# Re-try limit for verifying core process has started successfully
MAX_PROCESS_STATUS_CHECK_RETRIES = 5
Expand Down
4 changes: 2 additions & 2 deletions src/extension/src/EnvLayer.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def open(self, file_path, mode):
try:
return open(file_path, mode)
except Exception as error:
if i < self.retry_count:
if i < self.retry_count - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to open {0} (retries exhausted). Error: {1}.".format(str(file_path), repr(error)))
Expand Down Expand Up @@ -236,7 +236,7 @@ def write_with_retry(self, file_path_or_handle, data, mode='a+'):
file_handle.write(str(data))
break
except Exception as error:
if i < self.retry_count:
if i < self.retry_count - 1:
time.sleep(i + 1)
else:
raise Exception("Unable to write to {0} (retries exhausted). Error: {1}.".format(str(file_handle.name), repr(error)))
Expand Down

0 comments on commit e875112

Please sign in to comment.