Skip to content

Commit

Permalink
Adjusting gpu sample rate and providing functionality to adjust the rate (#72)
Browse files Browse the repository at this point in the history

Co-authored-by: Ubuntu <azhpcuser@ndv47a0ea000004.tues0csbwrlutgwvusiyrhkngc.jx.internal.cloudapp.net>
  • Loading branch information
rafsalas19 and Ubuntu authored Nov 29, 2023
1 parent 7c29b96 commit 89abfbe
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 9 deletions.
9 changes: 9 additions & 0 deletions moneo.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ def deploy_worker(self, hosts_file, max_threads=16): # noqa: C901
else:
cmd = cmd + ' false'
cmd = cmd + " \"\""
# gpu sample rate
cmd = cmd + " " + str(args.gpu_sample_rate)
print(cmd)
if self.args.custom_metrics_file_path:
print('-Custom exporter enabled-')
logging.info('Custom exporter enabled')
Expand Down Expand Up @@ -402,6 +405,12 @@ def parallel_ssh_check():
'--custom_metrics_file_path',
type=str,
help='The path of the custom metrics file.')
parser.add_argument(
'--gpu_sample_rate',
type=int,
choices=[1, 2, 3, 10],
help='Number of samples per minute for GPU monitoring. Valid options are 1,2,3,10', default=2)

args = parser.parse_args()

logging.basicConfig(
Expand Down
17 changes: 12 additions & 5 deletions src/worker/exporters/nvidia_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,14 @@ def __init__(self):
self,
fieldIds=dcgm_config['publishFieldIds'],
ignoreList=dcgm_config['ignoreList'],
# updateFrequency=(dcgm_config['prometheusPublishInterval'] *
# 1000000) / 2,
updateFrequency=100000,
updateFrequency=int(1000000 / dcgm_config['prometheusPublishInterval']),
maxKeepAge=1800.0,
fieldGroupName='dcgm_exporter_{}'.format(os.getpid()),
hostname=dcgm_config['dcgmHostName'],
)
logging.info(
'DCGM sample interval: {} microseconds'
.format(int(1000000 / dcgm_config['prometheusPublishInterval'])))
self.InitConnection()
self.InitGauges()
signal.signal(signal.SIGUSR1, self.jobID_update_flag)
Expand Down Expand Up @@ -309,7 +310,13 @@ def parse_dcgm_cli():
action='store_true',
help='Enable profile metrics (Tensor Core,FP16,FP32,FP64 activity).'
'Addition of profile metrics encurs additional overhead on computer nodes.')

parser.add_argument(
'-s',
'--sample_per_min',
type=int,
default=2,
choices=[1, 2, 3, 10],
help='Samples per minute. Default 2')
args = dcgm_client_cli_parser.run_parser(parser)
# add profiling metrics if flag enabled
if (args.profiler_metrics):
Expand All @@ -327,7 +334,7 @@ def parse_dcgm_cli():
else:
dcgm_config['dcgmHostName'] = args.hostname
dcgm_config['prometheusPort'] = args.publish_port
dcgm_config['prometheusPublishInterval'] = args.interval
dcgm_config['prometheusPublishInterval'] = int(args.sample_per_min)
dcgm_config['publishFieldIds'] = field_ids
dcgm_config['sendUuid'] = True
dcgm_config['jobId'] = None
Expand Down
15 changes: 11 additions & 4 deletions src/worker/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,26 @@ START_PUBLISHER=$2

PUBLISHER_AUTH=${3:-""}

CUTSOM_METRICS_PATH=${4:-""}
GPU_SAMPLE_RATE=$4

CUTSOM_METRICS_PATH=${5:-""}
#shutdown previous instances
$WORK_DIR/shutdown.sh false

# start exporters
if [ -e "/dev/nvidiactl" ];
then
if [ -z $GPU_SAMPLE_RATE ]; then
GPU_SAMPLE_RATE=2
fi

nohup nv-hostengine </dev/null >/dev/null 2>&1 &

if [ $PROF_METRICS = true ];
then
nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -m </dev/null >/dev/null 2>&1 &
nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -m -s $GPU_SAMPLE_RATE </dev/null >/dev/null 2>&1 &
else
nohup python3 $WORK_DIR/exporters/nvidia_exporter.py </dev/null >/dev/null 2>&1 &
nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -s $GPU_SAMPLE_RATE </dev/null >/dev/null 2>&1 &
fi
elif [ -e '/dev/kfd' ];
then
Expand All @@ -35,7 +42,7 @@ then
nohup python3 $WORK_DIR/exporters/custom_exporter.py --custom_metrics_file_path $CUTSOM_METRICS_PATH </dev/null >/dev/null 2>&1 &
fi

if [ -n "$START_PUBLISHER" ]
if [ -n "$START_PUBLISHER" ] && [ "$START_PUBLISHER" != "false" ];
then
if [[ $START_PUBLISHER == "geneva" || $START_PUBLISHER == "azure_monitor" || $START_PUBLISHER == "managed_prometheus" ]]
then
Expand Down

0 comments on commit 89abfbe

Please sign in to comment.