From 7ff4fd2efaf41af1a11f429af85918269a6682a1 Mon Sep 17 00:00:00 2001 From: benank <8016617+benank@users.noreply.github.com> Date: Thu, 10 Mar 2022 09:46:27 -0800 Subject: [PATCH] Improve logging for guest agent issues (#116) Fixes: - Support codes getting truncated in status blobs - `__log_telemetry_info` in ActionHandler not working due to code execution order - Changes EventsFolderExists in `__log_telemetry_info` to EventsFolderExistedPreviously to capture its initial state before the events folder may have been created Adds: - Extra logging of the value of the guest agent supported environment variable - Environment variable code to give an idea of what failed when getting the guest agent env var to the extension (already existed in core) --- src/core/src/bootstrap/Constants.py | 6 ++++ .../core_logic/ConfigurePatchingProcessor.py | 2 +- src/core/src/core_logic/PatchAssessor.py | 2 +- src/core/src/core_logic/PatchInstaller.py | 2 +- .../src/service_interfaces/StatusHandler.py | 12 +++++-- .../src/service_interfaces/TelemetryWriter.py | 27 +++++++-------- src/core/tests/Test_StatusHandler.py | 16 +++++++++ src/extension/src/ActionHandler.py | 25 +++++++------- src/extension/src/Constants.py | 5 +++ src/extension/src/TelemetryWriter.py | 14 ++++++-- src/extension/tests/Test_ActionHandler.py | 33 ++++++++++++++++++- 11 files changed, 110 insertions(+), 34 deletions(-) diff --git a/src/core/src/bootstrap/Constants.py b/src/core/src/bootstrap/Constants.py index 2e41a7f0..50485a8e 100644 --- a/src/core/src/bootstrap/Constants.py +++ b/src/core/src/bootstrap/Constants.py @@ -270,3 +270,9 @@ class EnvLayer(EnumBackport): PRIVILEGED_OP_MARKER = "Privileged_Op_e6df678d-d09b-436a-a08a-65f2f70a6798" PRIVILEGED_OP_REBOOT = PRIVILEGED_OP_MARKER + "Reboot_Exception" PRIVILEGED_OP_EXIT = PRIVILEGED_OP_MARKER + "Exit_" + + # These are numbers instead of strings like extension because they are customer facing in the error message diagnostics + class AgentEnvVarStatusCode(EnumBackport): + AGENT_ENABLED = 0 + FAILED_TO_GET_AGENT_SUPPORTED_FEATURES = 1 + FAILED_TO_GET_TELEMETRY_KEY = 2 diff --git a/src/core/src/core_logic/ConfigurePatchingProcessor.py b/src/core/src/core_logic/ConfigurePatchingProcessor.py index 94a2413a..e46e10bf 100644 --- a/src/core/src/core_logic/ConfigurePatchingProcessor.py +++ b/src/core/src/core_logic/ConfigurePatchingProcessor.py @@ -143,4 +143,4 @@ def __raise_if_agent_incompatible(self): self.composite_logger.log_error(error_msg) raise Exception(error_msg) - self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG) + self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics())) diff --git a/src/core/src/core_logic/PatchAssessor.py b/src/core/src/core_logic/PatchAssessor.py index de67e9ac..c6fd03d1 100644 --- a/src/core/src/core_logic/PatchAssessor.py +++ b/src/core/src/core_logic/PatchAssessor.py @@ -88,5 +88,5 @@ def raise_if_agent_incompatible(self): self.composite_logger.log_error(error_msg) raise Exception(error_msg) - self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG) + self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics())) diff --git a/src/core/src/core_logic/PatchInstaller.py b/src/core/src/core_logic/PatchInstaller.py index a047c8f1..c2eeba41 100644 --- a/src/core/src/core_logic/PatchInstaller.py +++ b/src/core/src/core_logic/PatchInstaller.py @@ -107,7 +107,7 @@ def raise_if_agent_incompatible(self): self.composite_logger.log_error(error_msg) raise Exception(error_msg) - self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG) + self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics())) def install_updates(self, maintenance_window, package_manager, simulate=False): """wrapper function of installing updates""" diff --git a/src/core/src/service_interfaces/StatusHandler.py b/src/core/src/service_interfaces/StatusHandler.py index c4b40d43..d7459099 100644 --- a/src/core/src/service_interfaces/StatusHandler.py +++ b/src/core/src/service_interfaces/StatusHandler.py @@ -79,6 +79,9 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ self.__current_operation = None + # If an error message is any of these strings, it ignores the length limit (STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS) + self.__ignore_error_message_restriction_compliance_strings = [Constants.TELEMETRY_AT_AGENT_NOT_COMPATIBLE_ERROR_MSG] + # Update patch metadata summary in status for auto patching installation requests, to be reported to healthstore if (execution_config.maintenance_run_id is not None or execution_config.health_store_id is not None) and execution_config.operation.lower() == Constants.INSTALLATION.lower(): if self.__installation_reboot_status != Constants.RebootStatus.STARTED: @@ -681,12 +684,15 @@ def add_error_to_status(self, message, error_code=Constants.PatchOperationErrorC else: return - @staticmethod - def __ensure_error_message_restriction_compliance(full_message): + def __ensure_error_message_restriction_compliance(self, full_message): """ Removes line breaks, tabs and restricts message to a character limit """ message_size_limit = Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS formatted_message = re.sub(r"\s+", " ", str(full_message)) - return formatted_message[:message_size_limit-3] + '...' if len(formatted_message) > message_size_limit else formatted_message + ignore_message_restriction = any(ignore_string in formatted_message for ignore_string in self.__ignore_error_message_restriction_compliance_strings) + if ignore_message_restriction is False: + return formatted_message[:message_size_limit - 3] + '...' if len(formatted_message) > message_size_limit else formatted_message + else: + return formatted_message @staticmethod def __try_add_error(error_list, detail): diff --git a/src/core/src/service_interfaces/TelemetryWriter.py b/src/core/src/service_interfaces/TelemetryWriter.py index 2d69d2ec..92eeb77f 100644 --- a/src/core/src/service_interfaces/TelemetryWriter.py +++ b/src/core/src/service_interfaces/TelemetryWriter.py @@ -37,7 +37,7 @@ def __init__(self, env_layer, composite_logger, events_folder_path): self.__telemetry_event_counter = 1 # will be added at the end of each event sent to telemetry to assist in tracing and identifying event/message loss in telemetry self.start_time_for_event_count_throttle_check = datetime.datetime.utcnow() self.event_count = 1 - self.agent_env_var_code = 0 # code to give details on what went wrong when getting env var + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED # code to give details on what went wrong when getting env var if self.__get_events_folder_path_exists(events_folder_path): self.events_folder_path = events_folder_path @@ -128,18 +128,19 @@ def __get_agent_supports_telemetry_from_env_var(self): """ Returns True if the env var AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES has a key of ExtensionTelemetryPipeline in the list. Value of the env var looks like this: '[{ "Key": "ExtensionTelemetryPipeline", "Value": "1.0"}]' """ - self.agent_env_var_code = 0 + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED features_keyvalue_list_str = os.getenv(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR) if features_keyvalue_list_str is None: - self.composite_logger.log_error('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR)) - self.agent_env_var_code = 1 + self.composite_logger.log_debug('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR)) + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES return False features_keyvalue_list = json.loads(features_keyvalue_list_str) telemetry_supported_key_exists = any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY) if telemetry_supported_key_exists is False: - self.composite_logger.log_error('Guest agent does not support telemetry. [Error=Key not found: {0}]'.format(Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY)) - self.agent_env_var_code = 2 + self.composite_logger.log_debug('Guest agent does not support telemetry. [Error=Key not found: {0}]\nEnv var value: \n{0}'.format( + Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY, json.dumps(features_keyvalue_list, indent=4, sort_keys=True))) + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY return telemetry_supported_key_exists @@ -195,19 +196,19 @@ def __extract_agent_version_from_string(self, pattern, string): def get_telemetry_diagnostics(self): """ Returns information about the guest agent telemetry for debugging purposes. - Information message abbreviations: - AV: Agent Version (Guest Agent) - AGSV: Agent Goal State Version (Guest Agent) - EFE: Events Folder Exists (on disk) - EV: Env Var exists - EVC: Env Var Code (more detailed information on what went wrong when getting the env var) + Information message indices: + 0: Agent Version (Guest Agent) + 1: Agent Goal State Version (Guest Agent) + 2: Events Folder Exists (on disk) + 3: Env Var exists + 4: Env Var Code (more detailed information on what went wrong when getting the env var) See __get_agent_supports_telemetry_from_env_var for more information """ agent_version = self.get_agent_version() agent_goalstate_version = self.get_goal_state_agent_version() events_folder_exists = self.__get_events_folder_path_exists(self.events_folder_path) telemetry_env_var_supported = self.__get_agent_supports_telemetry_from_env_var() - return "AV:{0}, AGSV:{1}, EFE:{2}, EV:{3}, EVC:{4}".format( + return "Diagnostic-code: {0}/{1}/{2}/{3}/{4}".format( str(agent_version) if agent_version is not None else "-1", str(agent_goalstate_version) if agent_goalstate_version is not None else "-1", "1" if events_folder_exists is True else "0", diff --git a/src/core/tests/Test_StatusHandler.py b/src/core/tests/Test_StatusHandler.py index 990d28cc..50d9cae8 100644 --- a/src/core/tests/Test_StatusHandler.py +++ b/src/core/tests/Test_StatusHandler.py @@ -198,6 +198,22 @@ def test_add_error(self): self.assertEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["code"], 1) self.assertEqual(len(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"]), 1) + # Adding a long error that will not be truncated (special case) + self.runtime.status_handler.set_current_operation(Constants.INSTALLATION) + long_error_message = "{0} [Diagnostic-code: 2.2.49.2/2.6.0.2/0/1/0]".format(Constants.TELEMETRY_AT_AGENT_NOT_COMPATIBLE_ERROR_MSG) + self.assertTrue(len(long_error_message) > Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS) + self.runtime.status_handler.add_error_to_status(long_error_message, Constants.PatchOperationErrorCodes.DEFAULT_ERROR) + substatus_file_data = [] + with self.runtime.env_layer.file_system.open(self.runtime.execution_config.status_file_path, 'r') as file_handle: + substatus_file_data = json.load(file_handle)[0]["status"]["substatus"][1] + self.assertNotEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"], None) + self.assertEqual(substatus_file_data["name"], Constants.PATCH_INSTALLATION_SUMMARY) + self.assertEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["code"], 1) + self.assertEqual(len(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"]), 2) + message = json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"][0]["message"] + self.assertTrue(len(message) > Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS) + self.assertEqual(message, long_error_message) + def test_add_duplicate_error(self): # Setting operation to assessment to add all errors under assessment substatus self.runtime.status_handler.set_current_operation(Constants.ASSESSMENT) diff --git a/src/extension/src/ActionHandler.py b/src/extension/src/ActionHandler.py index 61a67ad1..5c4dd64e 100644 --- a/src/extension/src/ActionHandler.py +++ b/src/extension/src/ActionHandler.py @@ -117,30 +117,33 @@ def setup_telemetry(self): events_folder = self.ext_env_handler.events_folder if events_folder is not None: # Guest agent fully supports telemetry - self.__log_telemetry_info(telemetry_supported=True) + self.telemetry_writer.events_folder_path = events_folder + # As this is a common function used by all handler actions, setting operation_id such that it will be the same timestamp for all handler actions, which can be used for identifying all events for an operation. + # NOTE: Enable handler action will set operation_id to activity_id from config settings. And the same will be used in Core. + self.telemetry_writer.set_operation_id(self.operation_id_substitute_for_all_actions_in_telemetry) + + events_folder_previously_existed = True if not os.path.exists(events_folder): os.mkdir(events_folder) self.logger.log("Events folder path found in HandlerEnvironment but does not exist on disk. Creating now. [Path={0}][AgentVersion={1}]".format( str(events_folder), str(self.telemetry_writer.get_agent_version()))) + events_folder_previously_existed = False - self.telemetry_writer.events_folder_path = events_folder - # As this is a common function used by all handler actions, setting operation_id such that it will be the same timestamp for all handler actions, which can be used for identifying all events for an operation. - # NOTE: Enable handler action will set operation_id to activity_id from config settings. And the same will be used in Core. - self.telemetry_writer.set_operation_id(self.operation_id_substitute_for_all_actions_in_telemetry) + self.__log_telemetry_info(telemetry_supported=True, events_folder_previously_existed=events_folder_previously_existed) else: + # This line only logs to file since events_folder_path is not set in telemetry_writer self.__log_telemetry_info(telemetry_supported=False) - def __log_telemetry_info(self, telemetry_supported): + def __log_telemetry_info(self, telemetry_supported, events_folder_previously_existed=False): """ Logs detailed information about telemetry and logs an error if telemetry is not supported. """ events_folder = self.ext_env_handler.events_folder events_folder_str = str(events_folder) if events_folder is not None else "" - events_folder_exists = os.path.exists(events_folder) if events_folder is not None else False - env_var_supports_telemetry = self.telemetry_writer.is_agent_compatible() - telemetry_info = "[EventsFolder=\'{0}\'][EventsFolderExists={1}][EnvVar={2}]".format( - events_folder_str, str(events_folder_exists), env_var_supports_telemetry) + agent_env_var_code = self.telemetry_writer.agent_env_var_code + telemetry_info = "[EventsFolder={0}][EventsFolderExistedPreviously={1}][EnvVarCode={2}]".format( + events_folder_str, str(events_folder_previously_existed), str(agent_env_var_code)) - if env_var_supports_telemetry is True: + if agent_env_var_code == Constants.AgentEnvVarStatusCode.AGENT_ENABLED: telemetry_info += "[AgentVer={0}][GoalStateVer={1}]".format(self.telemetry_writer.get_agent_version(), self.telemetry_writer.get_goal_state_agent_version()) else: telemetry_info += "[AgentVer=Unknown][GoalStateVer=Unknown]" diff --git a/src/extension/src/Constants.py b/src/extension/src/Constants.py index 04ab48e2..2a4c0b0e 100644 --- a/src/extension/src/Constants.py +++ b/src/extension/src/Constants.py @@ -214,3 +214,8 @@ class ExitCode(EnumBackport): InvalidConfigSettingPropertyValue = 3564 CreateLoggerFailure = 3565 CreateStatusWriterFailure = 3566 + + class AgentEnvVarStatusCode(EnumBackport): + AGENT_ENABLED = "AGENT_ENABLED" + FAILED_TO_GET_AGENT_SUPPORTED_FEATURES = "FAILED_TO_GET_AGENT_SUPPORTED_FEATURES" + FAILED_TO_GET_TELEMETRY_KEY = "FAILED_TO_GET_TELEMETRY_KEY" diff --git a/src/extension/src/TelemetryWriter.py b/src/extension/src/TelemetryWriter.py index 073530af..eab56365 100644 --- a/src/extension/src/TelemetryWriter.py +++ b/src/extension/src/TelemetryWriter.py @@ -32,6 +32,7 @@ def __init__(self, logger, env_layer): self.logger = logger self.env_layer = env_layer self.events_folder_path = None + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED self.__operation_id = "" self.__agent_is_compatible = self.__get_agent_supports_telemetry_from_env_var() @@ -65,17 +66,24 @@ def __ensure_message_restriction_compliance(self, full_message): self.logger.log_telemetry_module_error("Error occurred while formatting message for a telemetry event. [Error={0}]".format(repr(e))) raise - @staticmethod - def __get_agent_supports_telemetry_from_env_var(): + def __get_agent_supports_telemetry_from_env_var(self): """ Returns True if the env var AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES has a key of ExtensionTelemetryPipeline in the list. Value of the env var looks like this: '[{ "Key": "ExtensionTelemetryPipeline", "Value": "1.0"}]' """ features_keyvalue_list_str = os.getenv(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR) if features_keyvalue_list_str is None: + self.logger.log_debug('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR)) + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES return False features_keyvalue_list = json.loads(features_keyvalue_list_str) - return any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY) + telemetry_supported_key_exists = any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY) + if telemetry_supported_key_exists is False: + self.logger.log_debug('Guest agent does not support telemetry. [Error=Key not found: {0}]\nEnv var value: \n{0}'.format( + Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY, json.dumps(features_keyvalue_list, indent=4, sort_keys=True))) + self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY + + return telemetry_supported_key_exists def __events_folder_exists(self): return self.events_folder_path is not None and os.path.exists(self.events_folder_path) diff --git a/src/extension/tests/Test_ActionHandler.py b/src/extension/tests/Test_ActionHandler.py index 54399dd7..6b0b14c1 100644 --- a/src/extension/tests/Test_ActionHandler.py +++ b/src/extension/tests/Test_ActionHandler.py @@ -269,7 +269,38 @@ def mock_os_getenv(name, value=None): file_read = open(self.runtime.logger.file_logger.log_file_path, "r") self.assertTrue(file_read is not None) - self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_read.read()) + file_contents = file_read.read() + self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_contents) + self.assertTrue(Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES in file_contents) + file_read.close() + + with self.assertRaises(SystemExit) as sys_exit: + self.action_handler.enable() + + os.getenv = backup_os_getenv + self.runtime.telemetry_writer = backup_telemetry_writer + self.action_handler.telemetry_writer = backup_telemetry_writer + + def test_telemetry_available_env_var_key_not_exists(self): + # agent env var is not set so telemetry is not supported + backup_os_getenv = os.getenv + backup_telemetry_writer = self.runtime.telemetry_writer + + def mock_os_getenv(name, value=None): + return '[]' + + # Re-init TelemetryWriter since the env var for compatibility is only checked on init + os.getenv = mock_os_getenv + self.runtime.telemetry_writer = TelemetryWriter(self.runtime.logger, self.runtime.env_layer) + self.action_handler.telemetry_writer = self.runtime.telemetry_writer + + self.assertTrue(self.action_handler.uninstall() == Constants.ExitCode.Okay) + + file_read = open(self.runtime.logger.file_logger.log_file_path, "r") + self.assertTrue(file_read is not None) + file_contents = file_read.read() + self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_contents) + self.assertTrue(Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY in file_contents) file_read.close() with self.assertRaises(SystemExit) as sys_exit: