Skip to content

Commit

Permalink
Merge pull request #161 from Azure/garamrak-addTimeTakenLogs
Browse files Browse the repository at this point in the history
Add logs to get time taken by assessment job and install updates job
  • Loading branch information
GAURAVRAMRAKHYANI authored Feb 15, 2023
2 parents d192d35 + 6ca8ce0 commit 006d225
Show file tree
Hide file tree
Showing 10 changed files with 387 additions and 8 deletions.
22 changes: 22 additions & 0 deletions src/core/src/bootstrap/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,28 @@ class ConfigSettings(EnumBackport):
# patch versions for healthstore when there is no maintenance run id
PATCH_VERSION_UNKNOWN = "UNKNOWN"

# Strings used in perf logs
class PerfLogTrackerParams:
TASK = "task"
TASK_STATUS = "task_status"
PACKAGE_MANAGER = "package_manager"
NUMBER_OF_TRIALS = "number_of_trials"
ERROR_MSG = "error_msg"
INSTALLED_PATCH_COUNT = "installed_patch_count"
PATCH_OPERATION_SUCCESSFUL = "patch_operation_successful"
MAINTENANCE_WINDOW = "maintenance_window"
PERC_MAINTENANCE_WINDOW_USED = "perc_maintenance_window_used"
MAINTENANCE_WINDOW_EXCEEDED = "maintenance_window_exceeded"
START_TIME = "start_time"
END_TIME = "end_time"
TIME_TAKEN = "time_taken"
MACHINE_INFO = "machine_info"
MESSAGE = "message"

class TaskStatus(EnumBackport):
SUCCEEDED = "succeeded"
FAILED = "failed"

# Patch Modes for Configure Patching
class PatchModes(EnumBackport):
IMAGE_DEFAULT = "ImageDefault"
Expand Down
22 changes: 22 additions & 0 deletions src/core/src/core_logic/MaintenanceWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,25 @@ def is_package_install_time_available(self, remaining_time_in_minutes=None):
else:
self.composite_logger.log_warning("Time Remaining: " + str(timedelta(seconds=int(remaining_time_in_minutes * 60))) + ", Cutoff time: " + str(timedelta(minutes=cutoff_time_in_minutes)) + " [Out of time!]")
return False

def get_percentage_maintenance_window_used(self):
"""Calculate percentage of maintenance window used"""
try:
current_time = self.env_layer.datetime.datetime_utcnow()
start_time = self.env_layer.datetime.utc_to_standard_datetime(self.start_time)
if current_time < start_time:
raise Exception("Start time {0} is greater than current time {1}".format(str(start_time), str(current_time)))
dur = datetime.datetime.strptime(self.duration, "%H:%M:%S")
dura = timedelta(hours=dur.hour, minutes=dur.minute, seconds=dur.second)
total_time_in_minutes = self.env_layer.datetime.total_minutes_from_time_delta(dura)
elapsed_time_in_minutes = self.env_layer.datetime.total_minutes_from_time_delta(current_time - start_time)
percent_maintenance_window_used = (elapsed_time_in_minutes / total_time_in_minutes) * 100
except Exception as error:
error_msg = "Error calculating percentage of maintenance window used."
self.composite_logger.log_error("\n" + error_msg)
self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
if Constants.ERROR_ADDED_TO_STATUS not in repr(error):
error.args = (error.args, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS))
raise

return percent_maintenance_window_used
15 changes: 15 additions & 0 deletions src/core/src/core_logic/PatchAssessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import shutil
import time
from core.src.bootstrap.Constants import Constants
from core.src.core_logic.Stopwatch import Stopwatch


class PatchAssessor(object):
Expand All @@ -34,6 +35,7 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ
self.status_handler = status_handler
self.lifecycle_manager = lifecycle_manager
self.package_manager = package_manager
self.package_manager_name = self.package_manager.__class__.__name__
self.assessment_state_file_path = os.path.join(self.execution_config.config_folder, Constants.ASSESSMENT_STATE_FILE)

def start_assessment(self):
Expand All @@ -49,6 +51,9 @@ def start_assessment(self):
self.composite_logger.log('\nStarting patch assessment...')
self.write_assessment_state() # success / failure does not matter, only that an attempt started

self.stopwatch = Stopwatch(self.env_layer, self.telemetry_writer, self.composite_logger)
self.stopwatch.start()

self.status_handler.set_assessment_substatus_json(status=Constants.STATUS_TRANSITIONING)
self.composite_logger.log("\nMachine Id: " + self.env_layer.platform.node())
self.composite_logger.log("Activity Id: " + self.execution_config.activity_id)
Expand All @@ -57,11 +62,13 @@ def start_assessment(self):
self.composite_logger.log("\n\nGetting available patches...")
self.package_manager.refresh_repo()
self.status_handler.reset_assessment_data()
number_of_tries = 0

for i in range(0, Constants.MAX_ASSESSMENT_RETRY_COUNT):
try:
if self.lifecycle_manager is not None:
self.lifecycle_manager.lifecycle_status_check() # may terminate the code abruptly, as designed
number_of_tries = number_of_tries + 1
packages, package_versions = self.package_manager.get_all_updates()
self.telemetry_writer.write_event("Full assessment: " + str(packages), Constants.TelemetryEventLevel.Verbose)
self.status_handler.set_package_assessment_status(packages, package_versions)
Expand All @@ -81,15 +88,23 @@ def start_assessment(self):
else:
error_msg = 'Error retrieving available patches: ' + repr(error)
self.composite_logger.log_error(error_msg)
self.write_assessment_perf_logs(number_of_tries, Constants.TaskStatus.FAILED, error_msg)
self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
if Constants.ERROR_ADDED_TO_STATUS not in repr(error):
error.args = (error.args, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS))
self.status_handler.set_assessment_substatus_json(status=Constants.STATUS_ERROR)
raise

self.write_assessment_perf_logs(number_of_tries, Constants.TaskStatus.SUCCEEDED, "")
self.composite_logger.log("\nPatch assessment completed.\n")
return True

def write_assessment_perf_logs(self, number_of_tries, task_status, error_msg):
assessment_perf_log = {Constants.PerfLogTrackerParams.TASK: Constants.ASSESSMENT, Constants.PerfLogTrackerParams.TASK_STATUS: str(task_status),
Constants.PerfLogTrackerParams.ERROR_MSG: error_msg, Constants.PerfLogTrackerParams.PACKAGE_MANAGER: self.package_manager_name,
Constants.PerfLogTrackerParams.NUMBER_OF_TRIALS: str(number_of_tries)}
self.stopwatch.stop_and_write_telemetry(str(assessment_perf_log))

def raise_if_telemetry_unsupported(self):
if self.lifecycle_manager.get_vm_cloud_type() == Constants.VMCloudType.ARC and self.execution_config.operation not in [Constants.ASSESSMENT, Constants.INSTALLATION]:
self.composite_logger.log("Skipping telemetry compatibility check for Arc cloud type when operation is not manual")
Expand Down
27 changes: 25 additions & 2 deletions src/core/src/core_logic/PatchInstaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import os
import time
from core.src.bootstrap.Constants import Constants

from core.src.core_logic.Stopwatch import Stopwatch

class PatchInstaller(object):
"""" Wrapper class for a single patch installation operation """
Expand All @@ -33,6 +33,7 @@ def __init__(self, env_layer, execution_config, composite_logger, telemetry_writ
self.lifecycle_manager = lifecycle_manager

self.package_manager = package_manager
self.package_manager_name = self.package_manager.__class__.__name__
self.package_filter = package_filter
self.maintenance_window = maintenance_window
self.reboot_manager = reboot_manager
Expand All @@ -51,6 +52,9 @@ def start_installation(self, simulate=False):

self.composite_logger.log('\nStarting patch installation...')

self.stopwatch = Stopwatch(self.env_layer, self.telemetry_writer, self.composite_logger)
self.stopwatch.start()

self.composite_logger.log("\nMachine Id: " + self.env_layer.platform.node())
self.composite_logger.log("Activity Id: " + self.execution_config.activity_id)
self.composite_logger.log("Operation request time: " + self.execution_config.start_time + ", Maintenance Window Duration: " + self.execution_config.duration)
Expand All @@ -72,22 +76,26 @@ def start_installation(self, simulate=False):

# Install Updates
installed_update_count, update_run_successful, maintenance_window_exceeded = self.install_updates(maintenance_window, package_manager, simulate)

retry_count = 1
# Repeat patch installation if flagged as required and time is available
if not maintenance_window_exceeded and package_manager.get_package_manager_setting(Constants.PACKAGE_MGR_SETTING_REPEAT_PATCH_OPERATION, False):
self.composite_logger.log("\nInstalled update count (first round): " + str(installed_update_count))
self.composite_logger.log("\nPatch installation run will be repeated as the package manager recommended it --------------------------------------------->")
package_manager.set_package_manager_setting(Constants.PACKAGE_MGR_SETTING_REPEAT_PATCH_OPERATION, False) # Resetting
new_installed_update_count, update_run_successful, maintenance_window_exceeded = self.install_updates(maintenance_window, package_manager, simulate)
installed_update_count += new_installed_update_count
retry_count = retry_count + 1

if package_manager.get_package_manager_setting(Constants.PACKAGE_MGR_SETTING_REPEAT_PATCH_OPERATION, False): # We should not see this again
error_msg = "Unexpected repeated package manager update occurred. Please re-run the update deployment."
self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.PACKAGE_MANAGER_FAILURE)
self.write_installer_perf_logs(update_run_successful, installed_update_count, retry_count, maintenance_window, maintenance_window_exceeded, Constants.TaskStatus.FAILED, error_msg)
raise Exception(error_msg, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS))

self.composite_logger.log("\nInstalled update count: " + str(installed_update_count) + " (including dependencies)")

self.write_installer_perf_logs(update_run_successful, installed_update_count, retry_count, maintenance_window, maintenance_window_exceeded, Constants.TaskStatus.SUCCEEDED, "")

# Reboot as per setting and environment state
reboot_manager.start_reboot_if_required_and_time_available(maintenance_window.get_remaining_time_in_minutes(None, False))
maintenance_window_exceeded = maintenance_window_exceeded or reboot_manager.maintenance_window_exceeded_flag
Expand All @@ -98,6 +106,21 @@ def start_installation(self, simulate=False):

return overall_patch_installation_successful

def write_installer_perf_logs(self, patch_operation_successful, installed_patch_count, number_of_rounds, maintenance_window, maintenance_window_exceeded, task_status, error_msg):
perc_maintenance_window_used = -1

try:
perc_maintenance_window_used = maintenance_window.get_percentage_maintenance_window_used()
except Exception as error:
self.composite_logger.log_debug("Error in writing patch installation performance logs. Error is: " + repr(error))

patch_installation_perf_log = {Constants.PerfLogTrackerParams.TASK: Constants.INSTALLATION, Constants.PerfLogTrackerParams.TASK_STATUS: str(task_status), Constants.PerfLogTrackerParams.ERROR_MSG: error_msg,
Constants.PerfLogTrackerParams.PACKAGE_MANAGER: self.package_manager_name, Constants.PerfLogTrackerParams.PATCH_OPERATION_SUCCESSFUL: str(patch_operation_successful),
Constants.PerfLogTrackerParams.INSTALLED_PATCH_COUNT: str(installed_patch_count), Constants.PerfLogTrackerParams.NUMBER_OF_TRIALS: str(number_of_rounds),
Constants.PerfLogTrackerParams.MAINTENANCE_WINDOW: str(maintenance_window.duration), Constants.PerfLogTrackerParams.PERC_MAINTENANCE_WINDOW_USED: str(perc_maintenance_window_used),
Constants.PerfLogTrackerParams.MAINTENANCE_WINDOW_EXCEEDED: str(maintenance_window_exceeded)}
self.stopwatch.stop_and_write_telemetry(str(patch_installation_perf_log))

def raise_if_telemetry_unsupported(self):
if self.lifecycle_manager.get_vm_cloud_type() == Constants.VMCloudType.ARC and self.execution_config.operation not in [Constants.ASSESSMENT, Constants.INSTALLATION]:
self.composite_logger.log("Skipping telemetry compatibility check for Arc cloud type when operation is not manual")
Expand Down
70 changes: 70 additions & 0 deletions src/core/src/core_logic/Stopwatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2023 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.7+

from core.src.bootstrap.Constants import Constants

class Stopwatch(object):
"""Implements the stopwatch logic"""

class StopwatchException(Constants.EnumBackport):
# Stopwatch exception strings
STARTED_ALREADY = "Stopwatch is already started"
NOT_STARTED = "Stopwatch is not started"
STOPPED_ALREADY = "Stopwatch is already stoppped"

def __init__(self, env_layer, telemetry_writer, composite_logger):
self.env_layer = env_layer
self.telemetry_writer = telemetry_writer
self.composite_logger = composite_logger
self.start_time = None
self.end_time = None
self.time_taken = None
self.task_details = None

def __del__(self):
# if start_time is None that means Stopwatch is not started and hence no need to log
# call stop only if end_time is None otherwise stop() is already called.
if self.start_time is not None and self.end_time is None:
self.stop()
self.set_task_details("")
self.composite_logger.log("Stopwatch details before instance is destroyed: " + str(self.task_details))

def start(self):
if self.start_time is not None:
self.composite_logger.log_debug(str(Stopwatch.StopwatchException.STARTED_ALREADY))
self.start_time = self.env_layer.datetime.datetime_utcnow()
self.end_time = None
self.time_taken = None
self.task_details = None

def stop(self):
if self.end_time is not None:
self.composite_logger.log_debug(str(Stopwatch.StopwatchException.STOPPED_ALREADY))
self.end_time = self.env_layer.datetime.datetime_utcnow()
if self.start_time is None:
self.composite_logger.log_debug(str(Stopwatch.StopwatchException.NOT_STARTED))
self.start_time = self.end_time

self.time_taken = self.env_layer.datetime.total_minutes_from_time_delta(self.end_time - self.start_time)

def stop_and_write_telemetry(self, message):
self.stop()
self.set_task_details(message)
self.composite_logger.log("Stopwatch details: " + str(self.task_details))

def set_task_details(self, message):
self.task_details = {Constants.PerfLogTrackerParams.START_TIME: str(self.start_time), Constants.PerfLogTrackerParams.END_TIME: str(self.end_time), Constants.PerfLogTrackerParams.TIME_TAKEN: str(self.time_taken),
Constants.PerfLogTrackerParams.MACHINE_INFO: self.telemetry_writer.machine_info, Constants.PerfLogTrackerParams.MESSAGE: str(message)}
8 changes: 4 additions & 4 deletions src/core/src/service_interfaces/TelemetryWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
class TelemetryWriter(object):
"""Class for writing telemetry data to data transports"""


def __init__(self, env_layer, composite_logger, events_folder_path, telemetry_supported):
self.env_layer = env_layer
self.composite_logger = composite_logger
Expand All @@ -45,7 +46,7 @@ def __init__(self, env_layer, composite_logger, events_folder_path, telemetry_su
self.__is_telemetry_supported = telemetry_supported and self.events_folder_path is not None

self.write_event('Started Linux patch core operation.', Constants.TelemetryEventLevel.Informational)
self.write_machine_config_info()
self.set_machine_config_info()

def write_config_info(self, config_info, config_type='unknown'):
# Configuration info
Expand Down Expand Up @@ -73,16 +74,15 @@ def write_package_info(self, package_name, package_ver, package_size, install_du
self.write_event(message, Constants.TelemetryEventLevel.Informational)

# Composed payload
def write_machine_config_info(self):
def set_machine_config_info(self):
# Machine info - sent only once at the start of the run
machine_info = {
self.machine_info = {
'platform_name': str(self.env_layer.platform.linux_distribution()[0]),
'platform_version': str(self.env_layer.platform.linux_distribution()[1]),
'machine_cpu': self.get_machine_processor(),
'machine_arch': str(self.env_layer.platform.machine()),
'disk_type': self.get_disk_type()
}
return self.write_config_info(machine_info, 'machine_config')

def write_execution_error(self, cmd, code, output):
# Expected to log any errors from a cmd execution, including package manager execution errors
Expand Down
28 changes: 28 additions & 0 deletions src/core/tests/Test_MaintenanceWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,33 @@ def test_check_available_time_after_duration_complete(self):
self.assertEqual(runtime.maintenance_window.is_package_install_time_available(), False)
runtime.stop()

def test_get_percentage_maintenance_window_used(self):
argument_composer = ArgumentComposer()
argument_composer.start_time = (datetime.datetime.utcnow() - datetime.timedelta(hours=0, minutes=18)).strftime("%Y-%m-%dT%H:%M:%S.9999Z")
argument_composer.maximum_duration = "PT3H"
runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True)
perc_maintenance_window_used = runtime.maintenance_window.get_percentage_maintenance_window_used()
# 18 minutes of maintenance window used out of 3 hours (180 minutes). So, it should be 10%.
# The value should be slightly greater than 10 as it takes some time to trigger the method get_percentage_maintenance_window_used
self.assertGreaterEqual(perc_maintenance_window_used, 10)
self.assertLessEqual(perc_maintenance_window_used, 11)
runtime.stop()

def test_get_percentage_maintenance_window_used_Fail(self):
argument_composer = ArgumentComposer()
# ZeroDivisionError should be thrown as duration is 0
argument_composer.maximum_duration = "PT0H"
runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True)
self.assertRaises(Exception, runtime.maintenance_window.get_percentage_maintenance_window_used)
runtime.stop()

def test_get_percentage_maintenance_window_used_start_time_greater_exception(self):
argument_composer = ArgumentComposer()
# Setting start time 1 hour later than current time
argument_composer.start_time = (datetime.datetime.utcnow() + datetime.timedelta(hours=1)).strftime("%Y-%m-%dT%H:%M:%S.9999Z")
runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True)
self.assertRaises(Exception, runtime.maintenance_window.get_percentage_maintenance_window_used)
runtime.stop()

if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 006d225

Please sign in to comment.