diff --git a/README.md b/README.md index 5c80526..5e22390 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,20 @@ Get the code: Note: If you are using an [Azure Ubuntu HPC-AI](https://github.com/Azure/azhpc-images) VM image you can find the Moneo in this path: /opt/azurehpc/tools/Moneo +### Configuration File ### + +The [moneo_config.json](./moneo_config.json) file can be used to specify certain deployment settings prior to Moneo deployment. + +There are 4 groups of configurations: + + 1. exporter_conf - This applies to all deployments. See the following settings: + - gpu_sample_interval - Number of samples per minute for the Nvidia GPU exporter. Choices are [1, 2, 30, 60, 120, 600], with 60 samples per minute being the default. + - gpu_profiling - Switches on additional profile metrics (Tensor, FP16, FP32, and FP64). Choices are "true"/"false" (given as quoted strings, as in the shipped moneo_config.json), with "false" as the default. + - Note: These settings may have an impact on performance. Default settings were chosen to minimize impact. + 2. prom_config - This group of settings applies to the Headless deployment method. Refer to [Headless Deployment Guide](./docs/HeadlessDeployment.md) for usage. + 3. geneva_config - Applies to Geneva deployment. Refer to [Geneva deployment](./docs/GenevaAgent.MD) for usage. + 4. publisher_config - Applies to both Geneva and Azure Monitor agent deployment methods; see [Geneva deployment](./docs/GenevaAgent.MD) or [Azure Monitor Agent deployment](./docs/AzureMonitorAgent.md) for usage. + ### Prefered Moneo Deployment ### The prefered way to deploy Moneo is the headless method using Azure Managaed Grafana and Prometheus resources. @@ -185,7 +199,7 @@ Note: For more options check the Moneo help menu 1. For Managed Grafana (headless) deployment - Verify that the user managed identity is assigned to the VM resource. - - Verify the prerequisite configure file (`Moneo/src/worker/publisher/config/managed_prom_config.json`) is configured correctly on each worker node. 
+ - Verify the prerequisite configure file (`Moneo/moneo_config.json`) is configured correctly on each worker node. - On the worker nodes verify functionality of prometheus agent remote write: - Check prometheus docker with `sudo docker logs prometheus | grep 'Done replaying WAL'` It will have the result like this: diff --git a/dockerfile/moneo-exporter-nvidia.dockerfile b/dockerfile/moneo-exporter-nvidia.dockerfile index 7e96f37..7cab8ca 100644 --- a/dockerfile/moneo-exporter-nvidia.dockerfile +++ b/dockerfile/moneo-exporter-nvidia.dockerfile @@ -6,8 +6,6 @@ ARG BRANCH_OR_TAG=main ENV DCGM_VERSION=3.1.1 ENV OFED_VERSION=23.07-0.5.1.2 -ENV PROFILING false -ENV GPU_SAMPLE_RATE 2 # Install dependencies RUN apt-get update -y \ @@ -43,6 +41,9 @@ RUN cd /tmp && \ RUN git config --global advice.detachedHead false RUN git clone --branch ${BRANCH_OR_TAG} https://github.com/Azure/Moneo.git +# Set up tmp space for Moneo +RUN mkdir -p /tmp/moneo-worker + # Install DCGM WORKDIR Moneo/src/worker RUN sudo bash install/nvidia.sh @@ -50,4 +51,4 @@ RUN sudo bash install/nvidia.sh # Set EntryPoint COPY dockerfile/moneo-exporter-nvidia_entrypoint.sh . 
RUN chmod +x moneo-exporter-nvidia_entrypoint.sh -CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} ${GPU_SAMPLE_RATE} +CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh diff --git a/dockerfile/moneo-exporter-nvidia_entrypoint.sh b/dockerfile/moneo-exporter-nvidia_entrypoint.sh index 425c893..00baefa 100755 --- a/dockerfile/moneo-exporter-nvidia_entrypoint.sh +++ b/dockerfile/moneo-exporter-nvidia_entrypoint.sh @@ -1,22 +1,16 @@ #!/bin/bash set -e -enable_profiling=$1 -gpu_sample_rate=$2 -ethernet_dev_name=$3 +ethernet_dev_name=$1 # Start NVIDIA, Net and Node Exporter echo "Starting NVIDIA, Net and Node Exporter" -if [ $enable_profiling = true ]; then - python3 exporters/nvidia_exporter.py -m -s $gpu_sample_rate & -else - python3 exporters/nvidia_exporter.py -s $gpu_sample_rate & -fi +python3 exporters/nvidia_exporter.py & python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband & -if [-n $ethernet_dev_name]; then +if [ -n "$ethernet_dev_name" ]; then python3 exporters/node_exporter.py -e $ethernet_dev_name & else python3 exporters/node_exporter.py & diff --git a/docs/AzureMonitorAgent.md b/docs/AzureMonitorAgent.md index a84804a..0d34b1b 100644 --- a/docs/AzureMonitorAgent.md +++ b/docs/AzureMonitorAgent.md @@ -8,7 +8,7 @@ Prequisites: 1. An Azure Monitor Metrics (Application Insights) resource, please enable alerting on custom metric dimensions by refering this [document](https://learn.microsoft.com/en-us/azure/azure-monitor/app/pre-aggregated-metrics-log-metrics#custom-metrics-dimensions-and-pre-aggregation) to restore the metrics dimentions.(Lead to a extra cost) 2. PSSH installed on manager nodes. 3. Ensure passwordless ssh is installed in you environment. -4. Config publisher config file in `Moneo/src/worker/publisher/config/publisher_config.json`. +4. Config publisher config file in `Moneo/moneo_config.json`. Note: You can obtain your connection string from the Application Insights pages you created in the Azure portal. 
``` { diff --git a/docs/HeadlessDeployment.md b/docs/HeadlessDeployment.md index 1e25892..da0f88c 100644 --- a/docs/HeadlessDeployment.md +++ b/docs/HeadlessDeployment.md @@ -36,13 +36,13 @@ Follow steps outlined in [Infrastructure deployment](../deploy_managed_infra/REA 3. Skip to step 5. Note: This step can be performed in parallel using pssh. Reference step 4 for start and stop commands. -3. Modify the managed prometheus config file in `Moneo/src/worker/publisher/config/managed_prom_config.json`. +3. Modify the managed prometheus config file in `Moneo/moneo_config.json`. - Reference the user managed identity created during infrastructure deployment to get the "identity client id" - Reference the Managed Prometheus resource created during infrastructure deployment to get the "metrics ingestion endpoint" - The config file modifcations must be distributed to the Moneo directories on all workers. ```json - { + "prom_config": { "IDENTITY_CLIENT_ID": "", "INGESTION_ENDPOINT": "" } diff --git a/docs/ManagedPrometheusAgent.md b/docs/ManagedPrometheusAgent.md index 521801d..c99bdee 100644 --- a/docs/ManagedPrometheusAgent.md +++ b/docs/ManagedPrometheusAgent.md @@ -18,11 +18,11 @@ This guide will provide step-by-step instructions on how to to publish your exp - Click add at the bottom of the open blade. 3. PSSH installed on manager nodes. 4. Ensure passwordless ssh is installed in you environment. -5. Config managed prometheus config file in `Moneo/src/worker/publisher/config/managed_prom_config.json`. +5. Config managed prometheus config file in `Moneo/moneo_config.json`. Note: You can obtain your IDENTITY_CLIENT_ID in your indentity resource page and your metrics ingestion endpoint from the AWM pages you created in the Azure portal. 
``` json - { + "prom_config": { "IDENTITY_CLIENT_ID": "", "INGESTION_ENDPOINT": "" } diff --git a/linux_service/README.md b/linux_service/README.md index 049e319..3ec68c7 100644 --- a/linux_service/README.md +++ b/linux_service/README.md @@ -45,17 +45,18 @@ Configuration/Installation is only required once. After that is complete the Lin 1. Configuration and installation of the Linux service is done with the following command: ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"``` - - Note: If using Azure monitor or Geneva add an extra argument "./start_moneo_services.sh azure_monitor" or "./configure_service.sh geneva" respectively. + - Note: If using the Azure AI/HPC VM market place image, this step is already completed for managed prometheus deployment + - Note: If using Azure monitor or Geneva add an extra argument "./configure_service.sh azure_monitor" or "./configure_service.sh geneva" respectively. - Note: Geneva authentication is user managed identity "umi" by default, you can choose to change to "cert" method by modifiying [the start script](./configure_service.sh) "PUBLISHER_AUTH" variable. 2. For Azure Monitor or Managed Prometheus methods if you have not yet modified the configuration files reference the following: - For Azure Managed Prometheus: - - modify [managed_prom_config.json](../src/worker/publisher/config) and copy the file to the compute nodes. - - ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/src/worker/publisher/config/managed_prom_config.json /opt/azurehpc/tools/Moneo/src/worker/publisher/config``` + - modify [moneo_config.json](../moneo_config.json) and copy the file to the compute nodes. + - ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/moneo_config.json /opt/azurehpc/tools/Moneo``` - Lastly check that that the managed user identity used to set up Managed Prometheus (Azure role assignments) is assigned to your VMSS. 
- For Azure Monitor: - modify the connection string of "azure_monitor_agent_config" section and copy the file to the compute nodes. - - ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/src/worker/publisher/config/publisher_config.json /opt/azurehpc/tools/Moneo/src/worker/publisher/config``` + - ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/moneo_config.json /opt/azurehpc/tools/Moneo``` ### Launch Services ### @@ -84,10 +85,10 @@ Stopping services is the same command for all methods. Assuming configuration files have been updated and user managed ID applied if necessary (Managed Prometheus) reference these commands for the work flow: -- Configuration/Install: +- Configuration/Install (not needed for market place image, using managed Prometheus): ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"``` - Extra Configure step for AZ Monitor and/or Managed Prometheus - ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/src/worker/publisher/config/ /opt/azurehpc/tools/Moneo/src/worker/publisher/config``` + ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/moneo_config.json /opt/azurehpc/tools/Moneo``` - Start ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh"``` Note: diff --git a/linux_service/start_moneo_services.sh b/linux_service/start_moneo_services.sh index 5d05dfd..0f8a942 100755 --- a/linux_service/start_moneo_services.sh +++ b/linux_service/start_moneo_services.sh @@ -61,6 +61,9 @@ function proc_check(){ echo "All Services Running" exit 0 } +# stop nvidia exporter in the event there was a config change +sudo systemctl stop moneo@nvidia_exporter.service 2> /dev/null +sleep 2 # wait a bit for the exporter to stop $MONEO_PATH/linux_service/moneo_prestart.sh $MONEO_PATH 2> /dev/null diff --git a/moneo.py b/moneo.py index 2c4aac3..5e8b732 100644 --- a/moneo.py +++ b/moneo.py @@ -150,12 +150,6 @@ def deploy_worker(self, hosts_file, 
max_threads=16): # noqa: C901 print('-Starting metric exporters on workers-') logging.info('Starting metric exporters on workers') cmd = '/tmp/moneo-worker/start.sh' - if self.args.profiler_metrics: - print('-Profiling enabled-') - logging.info('Profiling enabled') - cmd = cmd + ' true' - else: - cmd = cmd + ' false' if self.args.launch_publisher: agent = self.args.launch_publisher if agent == 'geneva' and not self.args.publisher_auth: @@ -183,8 +177,8 @@ def deploy_worker(self, hosts_file, max_threads=16): # noqa: C901 else: cmd = cmd + ' false' cmd = cmd + " \"\"" - # gpu sample rate + ethernet device - cmd = cmd + " " + str(args.gpu_sample_rate) + " " + args.ethernet_device + # ethernet device + cmd = cmd + " " + args.ethernet_device if self.args.custom_metrics_file_path: print('-Custom exporter enabled-') logging.info('Custom exporter enabled') @@ -201,6 +195,10 @@ def deploy_work_docker(self, hosts_file, max_threads=16): logging.info('Deploying docker container to workers') out = pscp(copy_path, destination_dir, hosts_file, user=self.args.user) logging.info(out) + copy_path = './moneo_config.json' + destination_dir = '/tmp/moneo-worker' + out = pscp(copy_path, destination_dir, hosts_file, user=self.args.user) + logging.info(out) out = pssh(cmd='/tmp/moneo-worker/deploy_docker.sh', hosts_file=hosts_file, max_threads=max_threads, user=self.args.user) logging.info(out) @@ -356,13 +354,6 @@ def parallel_ssh_check(): default='full', nargs="?", help='Type of deployment/shutdown. Choices: {manager,workers,full}. Default: full.') - parser.add_argument( - '-p', - '--profiler_metrics', - action='store_true', - default=False, - help='Enable profile metrics (Tensor Core,FP16,FP32,FP64 activity).' 
- 'Addition of profile metrics encurs additional overhead on computer nodes.') parser.add_argument( '-r', '--container', @@ -408,11 +399,6 @@ def parallel_ssh_check(): '--custom_metrics_file_path', type=str, help='The path of the custom metrics file.') - parser.add_argument( - '--gpu_sample_rate', - type=int, - choices=[1, 2, 30, 60, 120, 600], - help='Number of samples per minute for GPU monitoring. Valid options are 1,2,3,10', default=60) parser.add_argument( '--ethernet_device', type=str, diff --git a/moneo_config.json b/moneo_config.json index 04537b8..1ef6252 100644 --- a/moneo_config.json +++ b/moneo_config.json @@ -1,5 +1,8 @@ - { + "exporter_conf":{ + "gpu_sample_interval": "60", + "gpu_profiling": "false" + }, "prom_config":{ "IDENTITY_CLIENT_ID": "", "INGESTION_ENDPOINT": "" diff --git a/src/worker/deploy_docker.sh b/src/worker/deploy_docker.sh index 26e1f3a..8793544 100755 --- a/src/worker/deploy_docker.sh +++ b/src/worker/deploy_docker.sh @@ -2,15 +2,14 @@ IMAGE=azmoneo/moneo-exporter:nvidia CONT_NAME=moneo-exporter-nvidia -PROFILING=$1 if [ -e "/dev/nvidiactl" ]; then docker pull $IMAGE docker rm --force $CONT_NAME && \ docker run --name=$CONT_NAME --net=host --restart=unless-stopped \ - -e PROFILING=$PROFILING --rm --runtime=nvidia \ - --cap-add SYS_ADMIN -v /sys:/hostsys/ -itd $IMAGE + --rm --runtime=nvidia \ + --cap-add SYS_ADMIN -v /sys:/hostsys/ -v /tmp/moneo-worker/moneo_config.json:/tmp/moneo-worker/moneo_config.json -itd $IMAGE else echo 'No Nvidia devices found Docker deployment canceled' diff --git a/src/worker/exporters/nvidia_exporter.py b/src/worker/exporters/nvidia_exporter.py index afaf478..42f846e 100644 --- a/src/worker/exporters/nvidia_exporter.py +++ b/src/worker/exporters/nvidia_exporter.py @@ -3,7 +3,7 @@ import time import signal import logging - +import json import prometheus_client sys.path.append('/usr/local/dcgm/bindings/python3') @@ -276,15 +276,40 @@ def Loop(self): pass +def get_custom_config(): + try: + with 
open('/tmp/moneo-worker/moneo_config.json') as f: + mon_config = json.load(f) + + sample_per_min = int(mon_config['exporter_conf']['gpu_sample_interval']) + sample_intervals = [1, 2, 30, 60, 120, 600] + + if sample_per_min not in sample_intervals: + mon_config['exporter_conf']['gpu_sample_interval'] = 60 + else: + mon_config['exporter_conf']['gpu_sample_interval'] = sample_per_min + + if (mon_config['exporter_conf']['gpu_profiling']).lower() == "true": + mon_config['exporter_conf']['gpu_profiling'] = True + else: + mon_config['exporter_conf']['gpu_profiling'] = False + return mon_config + except Exception: + mon_config = {'exporter_conf': {'gpu_sample_interval': 60, 'gpu_profiling': False}} + return mon_config + + def init_config(): global dcgm_config + mon_config = get_custom_config() dcgm_config = { 'exit': False, 'ignoreList': [], 'dcgmHostName': None, 'prometheusPort': None, - 'prometheusPublishInterval': None, + 'prometheusPublishInterval': mon_config['exporter_conf']['gpu_sample_interval'], 'publishFieldIds': None, + 'profilerMetrics': mon_config['exporter_conf']['gpu_profiling'], 'last_value': {} } @@ -304,22 +329,9 @@ def parse_dcgm_cli(): publish_port=8000, log_level='INFO', ) - parser.add_argument( - '-m', - '--profiler_metrics', - action='store_true', - help='Enable profile metrics (Tensor Core,FP16,FP32,FP64 activity).' - 'Addition of profile metrics encurs additional overhead on computer nodes.') - parser.add_argument( - '-s', - '--sample_per_min', - type=int, - default=60, - choices=[1, 2, 30, 60, 120, 600], - help='Samples per minute. 
Default 60') args = dcgm_client_cli_parser.run_parser(parser) # add profiling metrics if flag enabled - if (args.profiler_metrics): + if (dcgm_config['profilerMetrics']): args.field_ids.extend(DCGM_PROF_FIELDS) field_ids = dcgm_client_cli_parser.get_field_ids(args) numeric_log_level = dcgm_client_cli_parser.get_log_level(args) @@ -334,11 +346,9 @@ def parse_dcgm_cli(): else: dcgm_config['dcgmHostName'] = args.hostname dcgm_config['prometheusPort'] = args.publish_port - dcgm_config['prometheusPublishInterval'] = int(args.sample_per_min) dcgm_config['publishFieldIds'] = field_ids dcgm_config['sendUuid'] = True dcgm_config['jobId'] = None - dcgm_config['profilerMetrics'] = args.profiler_metrics logging.basicConfig( level=numeric_log_level, filemode=filemode, diff --git a/src/worker/publisher/config/geneva_config.json b/src/worker/publisher/config/geneva_config.json deleted file mode 100644 index de18ce9..0000000 --- a/src/worker/publisher/config/geneva_config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "AccountName": "", - "MDMEndPoint": "", - "UmiObjectId": "" -} diff --git a/src/worker/publisher/config/managed_prom_config.json b/src/worker/publisher/config/managed_prom_config.json deleted file mode 100644 index ea5e43b..0000000 --- a/src/worker/publisher/config/managed_prom_config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "IDENTITY_CLIENT_ID": "", - "INGESTION_ENDPOINT": "" -} diff --git a/src/worker/publisher/config/publisher_config.json b/src/worker/publisher/config/publisher_config.json deleted file mode 100644 index b56df1f..0000000 --- a/src/worker/publisher/config/publisher_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "common_config": { - "metrics_ports": "8000,8001,8002", - "metrics_namespace": "", - "interval": "20" - }, - "geneva_agent_config": { - "metrics_account": "" - }, - "azure_monitor_agent_config": { - "connection_string": "" - } -} diff --git a/src/worker/publisher/metrics_publisher.py b/src/worker/publisher/metrics_publisher.py index 919adb3..ae012d8 
100644 --- a/src/worker/publisher/metrics_publisher.py +++ b/src/worker/publisher/metrics_publisher.py @@ -94,7 +94,7 @@ def get_publisher_metrics_config(): Returns: config(dict): The geneva metrics configuration """ - with open('/tmp/moneo-worker/publisher/config/moneor_config.json') as f: + with open('/tmp/moneo-worker/moneo_config.json') as f: config = json.load(f) return config diff --git a/src/worker/start.sh b/src/worker/start.sh index 1335f7a..79cf936 100755 --- a/src/worker/start.sh +++ b/src/worker/start.sh @@ -2,35 +2,22 @@ WORK_DIR=$(dirname "${BASH_SOURCE[0]}") -PROF_METRICS=$1 +START_PUBLISHER=$1 -START_PUBLISHER=$2 +PUBLISHER_AUTH=${2:-""} -PUBLISHER_AUTH=${3:-""} +ETH_DEV=${3:-""} -GPU_SAMPLE_RATE=$4 +CUTSOM_METRICS_PATH=${4:-""} -ETH_DEV=${5:-""} - -CUTSOM_METRICS_PATH=${6:-""} #shutdown previous instances $WORK_DIR/shutdown.sh false # start exporters if [ -e "/dev/nvidiactl" ]; then - if [ -z $GPU_SAMPLE_RATE ]; then - GPU_SAMPLE_RATE=2 - fi - nohup nv-hostengine /dev/null 2>&1 & - - if [ $PROF_METRICS = true ]; - then - nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -m -s $GPU_SAMPLE_RATE /dev/null 2>&1 & - else - nohup python3 $WORK_DIR/exporters/nvidia_exporter.py -s $GPU_SAMPLE_RATE /dev/null 2>&1 & - fi + nohup python3 $WORK_DIR/exporters/nvidia_exporter.py /dev/null 2>&1 & elif [ -e '/dev/kfd' ]; then nohup /opt/rocm/rdc/bin/rdcd -u /dev/null 2>&1 &