From 89abfbe76bc95c11da248c2e1fbcefd27fcb4cb7 Mon Sep 17 00:00:00 2001 From: rafsalas19 <70273488+rafsalas19@users.noreply.github.com> Date: Wed, 29 Nov 2023 12:15:40 -0500 Subject: [PATCH] Adjusting gpu sample rate and providing functionality to adjust the rate (#72) Co-authored-by: Ubuntu --- moneo.py | 9 +++++++++ src/worker/exporters/nvidia_exporter.py | 17 ++++++++++++----- src/worker/start.sh | 15 +++++++++++---- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/moneo.py b/moneo.py index 7afb675..0045690 100644 --- a/moneo.py +++ b/moneo.py @@ -179,6 +179,9 @@ def deploy_worker(self, hosts_file, max_threads=16): # noqa: C901 else: cmd = cmd + ' false' cmd = cmd + " \"\"" + # gpu sample rate + cmd = cmd + " " + str(args.gpu_sample_rate) + print(cmd) if self.args.custom_metrics_file_path: print('-Custom exporter enabled-') logging.info('Custom exporter enabled') @@ -402,6 +405,12 @@ def parallel_ssh_check(): '--custom_metrics_file_path', type=str, help='The path of the custom metrics file.') + parser.add_argument( + '--gpu_sample_rate', + type=int, + choices=[1, 2, 3, 10], + help='Number of samples per minute for GPU monitoring. Valid options are 1,2,3,10', default=2) + args = parser.parse_args() logging.basicConfig( diff --git a/src/worker/exporters/nvidia_exporter.py b/src/worker/exporters/nvidia_exporter.py index 25fe17c..c90901f 100644 --- a/src/worker/exporters/nvidia_exporter.py +++ b/src/worker/exporters/nvidia_exporter.py @@ -134,13 +134,14 @@ def __init__(self): self, fieldIds=dcgm_config['publishFieldIds'], ignoreList=dcgm_config['ignoreList'], - # updateFrequency=(dcgm_config['prometheusPublishInterval'] * - # 1000000) / 2, - updateFrequency=100000, + updateFrequency=int(1000000 / dcgm_config['prometheusPublishInterval']), maxKeepAge=1800.0, fieldGroupName='dcgm_exporter_{}'.format(os.getpid()), hostname=dcgm_config['dcgmHostName'], ) + logging.info( + 'DCGM sample interval: {} microseconds' + .format(int(1000000 / dcgm_config['prometheusPublishInterval']))) self.InitConnection() self.InitGauges() signal.signal(signal.SIGUSR1, self.jobID_update_flag) @@ -309,7 +310,13 @@ def parse_dcgm_cli(): action='store_true', help='Enable profile metrics (Tensor Core,FP16,FP32,FP64 activity).' 'Addition of profile metrics encurs additional overhead on computer nodes.') - + parser.add_argument( + '-s', + '--sample_per_min', + type=int, + default=2, + choices=[1, 2, 3, 10], + help='Samples per minute. Default 2') args = dcgm_client_cli_parser.run_parser(parser) # add profiling metrics if flag enabled if (args.profiler_metrics): @@ -327,7 +334,7 @@ def parse_dcgm_cli(): else: dcgm_config['dcgmHostName'] = args.hostname dcgm_config['prometheusPort'] = args.publish_port - dcgm_config['prometheusPublishInterval'] = args.interval + dcgm_config['prometheusPublishInterval'] = int(args.sample_per_min) dcgm_config['publishFieldIds'] = field_ids dcgm_config['sendUuid'] = True dcgm_config['jobId'] = None diff --git a/src/worker/start.sh b/src/worker/start.sh index ff7aadf..d427f01 100755 --- a/src/worker/start.sh +++ b/src/worker/start.sh @@ -8,19 +8,26 @@ START_PUBLISHER=$2 PUBLISHER_AUTH=${3:-""} -CUTSOM_METRICS_PATH=${4:-""} +GPU_SAMPLE_RATE=$4 + +CUTSOM_METRICS_PATH=${5:-""} #shutdown previous instances $WORK_DIR/shutdown.sh false # start exporters if [ -e "/dev/nvidiactl" ]; then + if [ -z $GPU_SAMPLE_RATE ]; then + GPU_SAMPLE_RATE=2 + fi + nohup nv-hostengine /dev/null 2>&1 & + if [ $PROF_METRICS = true ]; then - nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -m /dev/null 2>&1 & + nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -m -s $GPU_SAMPLE_RATE /dev/null 2>&1 & else - nohup python3 $WORK_DIR/exporters/nvidia_exporter.py /dev/null 2>&1 & + nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -s $GPU_SAMPLE_RATE /dev/null 2>&1 & fi elif [ -e '/dev/kfd' ]; then @@ -35,7 +42,7 @@ then nohup python3 $WORK_DIR/exporters/custom_exporter.py --custom_metrics_file_path $CUTSOM_METRICS_PATH /dev/null 2>&1 & fi -if [ -n "$START_PUBLISHER" ] +if [ -n "$START_PUBLISHER" ] && [ "$START_PUBLISHER" != "false" ]; then if [[ $START_PUBLISHER == "geneva" || $START_PUBLISHER == "azure_monitor" || $START_PUBLISHER == "managed_prometheus" ]] then