Skip to content

Commit

Permalink
Improve logging for guest agent issues (#116)
Browse files Browse the repository at this point in the history
Fixes:
 - Support codes getting truncated in status blobs
 - `__log_telemetry_info` in ActionHandler not working due to code execution order
 - Changes EventsFolderExists in `__log_telemetry_info` to EventsFolderExistedPreviously to capture its initial state before the events folder may have been created


Adds:
 - Extra logging of the value of the guest agent supported environment variable
 - Environment variable code to give an idea of what failed when getting the guest agent env var to the extension (already existed in core)
  • Loading branch information
benank authored Mar 10, 2022
1 parent 31bd61d commit 7ff4fd2
Show file tree
Hide file tree
Showing 11 changed files with 110 additions and 34 deletions.
6 changes: 6 additions & 0 deletions src/core/src/bootstrap/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,9 @@ class EnvLayer(EnumBackport):
PRIVILEGED_OP_MARKER = "Privileged_Op_e6df678d-d09b-436a-a08a-65f2f70a6798"
PRIVILEGED_OP_REBOOT = PRIVILEGED_OP_MARKER + "Reboot_Exception"
PRIVILEGED_OP_EXIT = PRIVILEGED_OP_MARKER + "Exit_"

# These are numbers instead of strings like extension because they are customer facing in the error message diagnostics
class AgentEnvVarStatusCode(EnumBackport):
AGENT_ENABLED = 0
FAILED_TO_GET_AGENT_SUPPORTED_FEATURES = 1
FAILED_TO_GET_TELEMETRY_KEY = 2
2 changes: 1 addition & 1 deletion src/core/src/core_logic/ConfigurePatchingProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,4 @@ def __raise_if_agent_incompatible(self):
self.composite_logger.log_error(error_msg)
raise Exception(error_msg)

self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG)
self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics()))
2 changes: 1 addition & 1 deletion src/core/src/core_logic/PatchAssessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ def raise_if_agent_incompatible(self):
self.composite_logger.log_error(error_msg)
raise Exception(error_msg)

self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG)
self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics()))

2 changes: 1 addition & 1 deletion src/core/src/core_logic/PatchInstaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def raise_if_agent_incompatible(self):
self.composite_logger.log_error(error_msg)
raise Exception(error_msg)

self.composite_logger.log(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG)
self.composite_logger.log("{0} [{1}]".format(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG, self.telemetry_writer.get_telemetry_diagnostics()))

def install_updates(self, maintenance_window, package_manager, simulate=False):
"""wrapper function of installing updates"""
Expand Down
12 changes: 9 additions & 3 deletions src/core/src/service_interfaces/StatusHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ

self.__current_operation = None

# If an error message is any of these strings, it ignores the length limit (STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS)
self.__ignore_error_message_restriction_compliance_strings = [Constants.TELEMETRY_AT_AGENT_NOT_COMPATIBLE_ERROR_MSG]

# Update patch metadata summary in status for auto patching installation requests, to be reported to healthstore
if (execution_config.maintenance_run_id is not None or execution_config.health_store_id is not None) and execution_config.operation.lower() == Constants.INSTALLATION.lower():
if self.__installation_reboot_status != Constants.RebootStatus.STARTED:
Expand Down Expand Up @@ -681,12 +684,15 @@ def add_error_to_status(self, message, error_code=Constants.PatchOperationErrorC
else:
return

@staticmethod
def __ensure_error_message_restriction_compliance(full_message):
def __ensure_error_message_restriction_compliance(self, full_message):
""" Removes line breaks, tabs and restricts message to a character limit """
message_size_limit = Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS
formatted_message = re.sub(r"\s+", " ", str(full_message))
return formatted_message[:message_size_limit-3] + '...' if len(formatted_message) > message_size_limit else formatted_message
ignore_message_restriction = any(ignore_string in formatted_message for ignore_string in self.__ignore_error_message_restriction_compliance_strings)
if ignore_message_restriction is False:
return formatted_message[:message_size_limit - 3] + '...' if len(formatted_message) > message_size_limit else formatted_message
else:
return formatted_message

@staticmethod
def __try_add_error(error_list, detail):
Expand Down
27 changes: 14 additions & 13 deletions src/core/src/service_interfaces/TelemetryWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, env_layer, composite_logger, events_folder_path):
self.__telemetry_event_counter = 1 # will be added at the end of each event sent to telemetry to assist in tracing and identifying event/message loss in telemetry
self.start_time_for_event_count_throttle_check = datetime.datetime.utcnow()
self.event_count = 1
self.agent_env_var_code = 0 # code to give details on what went wrong when getting env var
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED # code to give details on what went wrong when getting env var

if self.__get_events_folder_path_exists(events_folder_path):
self.events_folder_path = events_folder_path
Expand Down Expand Up @@ -128,18 +128,19 @@ def __get_agent_supports_telemetry_from_env_var(self):
""" Returns True if the env var AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES has a key of
ExtensionTelemetryPipeline in the list. Value of the env var looks like this:
'[{ "Key": "ExtensionTelemetryPipeline", "Value": "1.0"}]' """
self.agent_env_var_code = 0
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED
features_keyvalue_list_str = os.getenv(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR)
if features_keyvalue_list_str is None:
self.composite_logger.log_error('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR))
self.agent_env_var_code = 1
self.composite_logger.log_debug('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR))
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES
return False

features_keyvalue_list = json.loads(features_keyvalue_list_str)
telemetry_supported_key_exists = any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY)
if telemetry_supported_key_exists is False:
self.composite_logger.log_error('Guest agent does not support telemetry. [Error=Key not found: {0}]'.format(Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY))
self.agent_env_var_code = 2
self.composite_logger.log_debug('Guest agent does not support telemetry. [Error=Key not found: {0}]\nEnv var value: \n{0}'.format(
Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY, json.dumps(features_keyvalue_list, indent=4, sort_keys=True)))
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY

return telemetry_supported_key_exists

Expand Down Expand Up @@ -195,19 +196,19 @@ def __extract_agent_version_from_string(self, pattern, string):

def get_telemetry_diagnostics(self):
""" Returns information about the guest agent telemetry for debugging purposes.
Information message abbreviations:
AV: Agent Version (Guest Agent)
AGSV: Agent Goal State Version (Guest Agent)
EFE: Events Folder Exists (on disk)
EV: Env Var exists
EVC: Env Var Code (more detailed information on what went wrong when getting the env var)
Information message indices:
0: Agent Version (Guest Agent)
1: Agent Goal State Version (Guest Agent)
2: Events Folder Exists (on disk)
3: Env Var exists
4: Env Var Code (more detailed information on what went wrong when getting the env var)
See __get_agent_supports_telemetry_from_env_var for more information
"""
agent_version = self.get_agent_version()
agent_goalstate_version = self.get_goal_state_agent_version()
events_folder_exists = self.__get_events_folder_path_exists(self.events_folder_path)
telemetry_env_var_supported = self.__get_agent_supports_telemetry_from_env_var()
return "AV:{0}, AGSV:{1}, EFE:{2}, EV:{3}, EVC:{4}".format(
return "Diagnostic-code: {0}/{1}/{2}/{3}/{4}".format(
str(agent_version) if agent_version is not None else "-1",
str(agent_goalstate_version) if agent_goalstate_version is not None else "-1",
"1" if events_folder_exists is True else "0",
Expand Down
16 changes: 16 additions & 0 deletions src/core/tests/Test_StatusHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,22 @@ def test_add_error(self):
self.assertEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["code"], 1)
self.assertEqual(len(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"]), 1)

# Adding a long error that will not be truncated (special case)
self.runtime.status_handler.set_current_operation(Constants.INSTALLATION)
long_error_message = "{0} [Diagnostic-code: 2.2.49.2/2.6.0.2/0/1/0]".format(Constants.TELEMETRY_AT_AGENT_NOT_COMPATIBLE_ERROR_MSG)
self.assertTrue(len(long_error_message) > Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS)
self.runtime.status_handler.add_error_to_status(long_error_message, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
substatus_file_data = []
with self.runtime.env_layer.file_system.open(self.runtime.execution_config.status_file_path, 'r') as file_handle:
substatus_file_data = json.load(file_handle)[0]["status"]["substatus"][1]
self.assertNotEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"], None)
self.assertEqual(substatus_file_data["name"], Constants.PATCH_INSTALLATION_SUMMARY)
self.assertEqual(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["code"], 1)
self.assertEqual(len(json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"]), 2)
message = json.loads(substatus_file_data["formattedMessage"]["message"])["errors"]["details"][0]["message"]
self.assertTrue(len(message) > Constants.STATUS_ERROR_MSG_SIZE_LIMIT_IN_CHARACTERS)
self.assertEqual(message, long_error_message)

def test_add_duplicate_error(self):
# Setting operation to assessment to add all errors under assessment substatus
self.runtime.status_handler.set_current_operation(Constants.ASSESSMENT)
Expand Down
25 changes: 14 additions & 11 deletions src/extension/src/ActionHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,30 +117,33 @@ def setup_telemetry(self):
events_folder = self.ext_env_handler.events_folder
if events_folder is not None:
# Guest agent fully supports telemetry
self.__log_telemetry_info(telemetry_supported=True)

self.telemetry_writer.events_folder_path = events_folder
# As this is a common function used by all handler actions, setting operation_id such that it will be the same timestamp for all handler actions, which can be used for identifying all events for an operation.
# NOTE: Enable handler action will set operation_id to activity_id from config settings. And the same will be used in Core.
self.telemetry_writer.set_operation_id(self.operation_id_substitute_for_all_actions_in_telemetry)

events_folder_previously_existed = True
if not os.path.exists(events_folder):
os.mkdir(events_folder)
self.logger.log("Events folder path found in HandlerEnvironment but does not exist on disk. Creating now. [Path={0}][AgentVersion={1}]".format(
str(events_folder), str(self.telemetry_writer.get_agent_version())))
events_folder_previously_existed = False

self.telemetry_writer.events_folder_path = events_folder
# As this is a common function used by all handler actions, setting operation_id such that it will be the same timestamp for all handler actions, which can be used for identifying all events for an operation.
# NOTE: Enable handler action will set operation_id to activity_id from config settings. And the same will be used in Core.
self.telemetry_writer.set_operation_id(self.operation_id_substitute_for_all_actions_in_telemetry)
self.__log_telemetry_info(telemetry_supported=True, events_folder_previously_existed=events_folder_previously_existed)
else:
# This line only logs to file since events_folder_path is not set in telemetry_writer
self.__log_telemetry_info(telemetry_supported=False)

def __log_telemetry_info(self, telemetry_supported):
def __log_telemetry_info(self, telemetry_supported, events_folder_previously_existed=False):
""" Logs detailed information about telemetry and logs an error if telemetry is not supported. """
events_folder = self.ext_env_handler.events_folder
events_folder_str = str(events_folder) if events_folder is not None else ""
events_folder_exists = os.path.exists(events_folder) if events_folder is not None else False
env_var_supports_telemetry = self.telemetry_writer.is_agent_compatible()
telemetry_info = "[EventsFolder=\'{0}\'][EventsFolderExists={1}][EnvVar={2}]".format(
events_folder_str, str(events_folder_exists), env_var_supports_telemetry)
agent_env_var_code = self.telemetry_writer.agent_env_var_code
telemetry_info = "[EventsFolder={0}][EventsFolderExistedPreviously={1}][EnvVarCode={2}]".format(
events_folder_str, str(events_folder_previously_existed), str(agent_env_var_code))

if env_var_supports_telemetry is True:
if agent_env_var_code == Constants.AgentEnvVarStatusCode.AGENT_ENABLED:
telemetry_info += "[AgentVer={0}][GoalStateVer={1}]".format(self.telemetry_writer.get_agent_version(), self.telemetry_writer.get_goal_state_agent_version())
else:
telemetry_info += "[AgentVer=Unknown][GoalStateVer=Unknown]"
Expand Down
5 changes: 5 additions & 0 deletions src/extension/src/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,8 @@ class ExitCode(EnumBackport):
InvalidConfigSettingPropertyValue = 3564
CreateLoggerFailure = 3565
CreateStatusWriterFailure = 3566

class AgentEnvVarStatusCode(EnumBackport):
AGENT_ENABLED = "AGENT_ENABLED"
FAILED_TO_GET_AGENT_SUPPORTED_FEATURES = "FAILED_TO_GET_AGENT_SUPPORTED_FEATURES"
FAILED_TO_GET_TELEMETRY_KEY = "FAILED_TO_GET_TELEMETRY_KEY"
14 changes: 11 additions & 3 deletions src/extension/src/TelemetryWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(self, logger, env_layer):
self.logger = logger
self.env_layer = env_layer
self.events_folder_path = None
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.AGENT_ENABLED
self.__operation_id = ""
self.__agent_is_compatible = self.__get_agent_supports_telemetry_from_env_var()

Expand Down Expand Up @@ -65,17 +66,24 @@ def __ensure_message_restriction_compliance(self, full_message):
self.logger.log_telemetry_module_error("Error occurred while formatting message for a telemetry event. [Error={0}]".format(repr(e)))
raise

@staticmethod
def __get_agent_supports_telemetry_from_env_var():
def __get_agent_supports_telemetry_from_env_var(self):
""" Returns True if the env var AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES has a key of
ExtensionTelemetryPipeline in the list. Value of the env var looks like this:
'[{ "Key": "ExtensionTelemetryPipeline", "Value": "1.0"}]' """
features_keyvalue_list_str = os.getenv(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR)
if features_keyvalue_list_str is None:
self.logger.log_debug('Failed to get guest agent supported features from env var. [Var={0}]'.format(Constants.AZURE_GUEST_AGENT_EXTENSION_SUPPORTED_FEATURES_ENV_VAR))
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES
return False

features_keyvalue_list = json.loads(features_keyvalue_list_str)
return any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY)
telemetry_supported_key_exists = any(kv_pair for kv_pair in features_keyvalue_list if kv_pair['Key'] == Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY)
if telemetry_supported_key_exists is False:
self.logger.log_debug('Guest agent does not support telemetry. [Error=Key not found: {0}]\nEnv var value: \n{0}'.format(
Constants.TELEMETRY_EXTENSION_PIPELINE_SUPPORTED_KEY, json.dumps(features_keyvalue_list, indent=4, sort_keys=True)))
self.agent_env_var_code = Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY

return telemetry_supported_key_exists

def __events_folder_exists(self):
return self.events_folder_path is not None and os.path.exists(self.events_folder_path)
Expand Down
33 changes: 32 additions & 1 deletion src/extension/tests/Test_ActionHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,38 @@ def mock_os_getenv(name, value=None):

file_read = open(self.runtime.logger.file_logger.log_file_path, "r")
self.assertTrue(file_read is not None)
self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_read.read())
file_contents = file_read.read()
self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_contents)
self.assertTrue(Constants.AgentEnvVarStatusCode.FAILED_TO_GET_AGENT_SUPPORTED_FEATURES in file_contents)
file_read.close()

with self.assertRaises(SystemExit) as sys_exit:
self.action_handler.enable()

os.getenv = backup_os_getenv
self.runtime.telemetry_writer = backup_telemetry_writer
self.action_handler.telemetry_writer = backup_telemetry_writer

def test_telemetry_available_env_var_key_not_exists(self):
# agent env var is not set so telemetry is not supported
backup_os_getenv = os.getenv
backup_telemetry_writer = self.runtime.telemetry_writer

def mock_os_getenv(name, value=None):
return '[]'

# Re-init TelemetryWriter since the env var for compatibility is only checked on init
os.getenv = mock_os_getenv
self.runtime.telemetry_writer = TelemetryWriter(self.runtime.logger, self.runtime.env_layer)
self.action_handler.telemetry_writer = self.runtime.telemetry_writer

self.assertTrue(self.action_handler.uninstall() == Constants.ExitCode.Okay)

file_read = open(self.runtime.logger.file_logger.log_file_path, "r")
self.assertTrue(file_read is not None)
file_contents = file_read.read()
self.assertTrue(Constants.TELEMETRY_AT_AGENT_COMPATIBLE_MSG in file_contents)
self.assertTrue(Constants.AgentEnvVarStatusCode.FAILED_TO_GET_TELEMETRY_KEY in file_contents)
file_read.close()

with self.assertRaises(SystemExit) as sys_exit:
Expand Down

0 comments on commit 7ff4fd2

Please sign in to comment.