diff --git a/README.md b/README.md index a8aeb1e..bafec1c 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ Note: For more options check the Moneo help menu - Slurm epilog/prolog integration: [Slurm example](./examples/slurm/README.md) - To deploy moneo-worker inside container: [Moneo-exporter](./docs/Moneo-exporter.md) - To integrate Moneo with Azure App Insights dashboard see: [Azure Monitor](./docs/AzureMonitorAgent.md) +- For Geneva ingestion (internal Microsoft) see: [Geneva](./docs/GenevaAgent.MD) ## Known Issues ## diff --git a/deploy_managed_infra/managed_infra_template.json b/deploy_managed_infra/managed_infra_template.json index fbd2895..b512118 100644 --- a/deploy_managed_infra/managed_infra_template.json +++ b/deploy_managed_infra/managed_infra_template.json @@ -99,92 +99,92 @@ { "record": "average_dcgm_gpu_temp", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_gpu_temp", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_gpu_temp", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)" }, { "record": "average_dcgm_memory_temp", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_memory_temp) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_memory_temp", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_memory_temp) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_memory_temp", "enabled": true, - "expression": 
"min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_memory_temp) by (instance, subscription, cluster, job_id)" }, { "record": "average_dcgm_sm_clock", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_sm_clock) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_sm_clock", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_sm_clock) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_sm_clock", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_sm_clock) by (instance, subscription, cluster, job_id)" }, { "record": "average_dcgm_memory_clock", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_memory_clock) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_memory_clock", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_memory_clock) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_memory_clock", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_memory_clock) by (instance, subscription, cluster, job_id)" }, { "record": "average_dcgm_gpu_utilization", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_gpu_utilization", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)" }, { "record": 
"min_dcgm_gpu_utilization", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)" }, { "record": "average_dcgm_mem_copy_utilization", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_mem_copy_utilization", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_mem_copy_utilization", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)" } ], "interval": "PT1M" @@ -213,7 +213,7 @@ { "record": "max_dcgm_power_usage", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_power_usage) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_power_usage", @@ -223,47 +223,47 @@ { "record": "average_dcgm_total_energy_consumption", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(dcgm_total_energy_consumption) by (instance, subscription, cluster, job_id)" }, { "record": "max_dcgm_total_energy_consumption", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(dcgm_total_energy_consumption) by (instance, subscription, cluster, job_id)" }, { "record": "min_dcgm_total_energy_consumption", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(dcgm_total_energy_consumption) by 
(instance, subscription, cluster, job_id)" }, { "record": "average_ib_port_xmit_data", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(ib_port_xmit_data) by (instance, subscription, cluster, job_id)" }, { "record": "max_ib_port_xmit_data", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(ib_port_xmit_data) by (instance, subscription, cluster, job_id)" }, { "record": "min_ib_port_xmit_data", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(ib_port_xmit_data) by (instance, subscription, cluster, job_id)" }, { "record": "average_ib_port_rcv_data", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "avg(ib_port_rcv_data) by (instance, subscription, cluster, job_id)" }, { "record": "max_ib_port_rcv_data", "enabled": true, - "expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "max(ib_port_rcv_data) by (instance, subscription, cluster, job_id)" }, { "record": "min_ib_port_rcv_data", "enabled": true, - "expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)" + "expression": "min(ib_port_rcv_data) by (instance, subscription, cluster, job_id)" } ], "interval": "PT1M" diff --git a/docs/GenevaAgent.MD b/docs/GenevaAgent.MD index 35b1e53..0d78c33 100644 --- a/docs/GenevaAgent.MD +++ b/docs/GenevaAgent.MD @@ -1,11 +1,12 @@ -Geneva Agent User Guide -===== -Description ------ +# Geneva Agent User Guide # + +## Description ## + This guide will walk you through how to share your exporter metrics with Azure using the Geneva Agent. Note: Currently this feature is private preview and is turned off by default. 
As of now only internal Microsoft Azure subscriptions are whitelisted, and we recommend you use azure monitor instead, Detailed please refer to this doc: [Azure Monitor Agent](AzureMonitorAgent.md) -Prequisites: +## Prerequisites ## + 1. Prepare Authentication for geneva agent: a. User Managed Identity (umi) @@ -19,14 +20,18 @@ Prequisites: b. Certificate (cert) - Create Private key pem and public key pem files by: + ```bash openssl genrsa 2048 > mdm-key.pem openssl req -x509 -new -key gcskey.pem -out mdm-cert.pem ``` + - Print out and get the certicate thumbprint: + ``` openssl x509 -in mdm-cert.pem -noout -sha1 -fingerprint ``` + - Replace above key-cert pairs with `src/worker/publisher/config/mdm-key.pem` and `src/worker/publisher/config/mdm-cert.pem` - Register the thumbprint on geneva portal with a metricsPublisher role. 2. Currently the only supported OS's are Ubuntu 20.04+ and Mariner. @@ -34,8 +39,8 @@ Prequisites: 4. PSSH installed on manager nodes. 5. Ensure passwordless ssh is installed in you environment. -Steps ------ +## Steps ## + 1. Ensure that all prequisites are met. 2. deploy Moneo - Full deployment with umi auth: diff --git a/linux_service/README.md b/linux_service/README.md index ab666db..16f090e 100644 --- a/linux_service/README.md +++ b/linux_service/README.md @@ -18,6 +18,8 @@ Three launch methods provided: 3. Launch exporters and an [Azure Monitor](../docs/AzureMonitorAgent.md) publisher. - Before launch you must modify the "azure_monitor_agent_config" section of [publisher_config](../src/worker/publisher/config/publisher_config.json) file with the Azure Monitor workspace connection string. +There is one additional method for internal Microsoft use that exports to Geneva. This method is similar to the Azure Monitor method but uses a Geneva agent container to export. Reference the [Moneo Geneva Docs](../docs/GenevaAgent.MD) and ensure all prerequisites are met. + This guide will walk you through how to set up Linux services for Moneo exporters. 
## Prerequisites ## @@ -39,11 +41,12 @@ Below are the prereqs needed: ### Configuration and Installation ### -Configuration/Installation is only required once. Afte that is complete the Linux services can be started and stopped as desired. +Configuration/Installation is only required once. After that is complete the Linux services can be started and stopped as desired. 1. Configuration and installation of the Linux service is done with the following command: ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"``` - - If You will only be launching the exporters without AZ monitor or Managed Prometheus Continue to the Launch Services section else continue. + - Note: If using Azure Monitor or Geneva, add an extra argument: "./configure_service.sh azure_monitor" or "./configure_service.sh geneva" respectively. + - Note: Geneva authentication is user managed identity "umi" by default; you can change to the "cert" method by modifying the "PUBLISHER_AUTH" variable in [the start script](./start_moneo_services.sh). 2. For Azure Monitor or Managed Prometheus methods if you have not yet modified the configuration files reference the following: - For Azure Managed Prometheus: @@ -58,9 +61,11 @@ Configuration/Installation is only required once. Afte that is complete the Linu The [start_moneo_services.sh](./start_moneo_services.sh) script is used to start the Linux services once configuration/installation is complete. 
-#### Exporters with Azure Monitor #### +#### Exporters with Azure Monitor or Geneva (internal Msft) #### -```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh true"``` +```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh azure_monitor"``` + or +```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh geneva"``` #### Exporters with Managed Prometheus #### @@ -75,12 +80,13 @@ Stopping services is the same command for all methods. Assuming configuration files have been updated and user managed ID applied if necessary (Managed Prometheus) reference these commands for the work flow: -- Configuration/Install: +- Configuration/Install: ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"``` - Extra Configure step for AZ Monitor and/or Managed Prometheus ```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/src/worker/publisher/config/ /opt/azurehpc/tools/Moneo/src/worker/publisher/config``` - Start ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh"``` + Note: When using Azure Monitor or Geneva, append "azure_monitor" or "geneva" respectively to the Configuration/Install and Start commands. - Stop ```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/stop_moneo_services.sh"``` diff --git a/linux_service/configure_service.sh b/linux_service/configure_service.sh index 4c64aa5..73b5510 100755 --- a/linux_service/configure_service.sh +++ b/linux_service/configure_service.sh @@ -1,5 +1,20 @@ #!/bin/bash +# Managed Prometheus deployment: ./configure_service.sh +# Azure Monitor: ./configure_service.sh azure_monitor +# Geneva (internal msft): ./configure_service.sh geneva + +PublisherMethod=$1 + +if [[ -n $PublisherMethod ]]; then + if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then + echo "PublisherMethod is valid: $PublisherMethod" + else + echo "PublisherMethod $PublisherMethod is not 
one of the valid choices {azure_monitor, geneva}." + exit 1 + fi +fi + MONEO_PATH=/opt/azurehpc/tools/Moneo if [[ ! -d "$MONEO_PATH" ]]; @@ -9,11 +24,16 @@ then fi # replace the moneo path place holder with actaul moneo path and Move service file to systemd directory -sed "s##$MONEO_PATH#g" $MONEO_PATH/linux_service/moneo@.service > /etc/systemd/system/moneo@.service +cp $MONEO_PATH/linux_service/moneo@.service /etc/systemd/system/moneo@.service echo "configuring publisher service" -sed "s##$MONEO_PATH#g;" $MONEO_PATH/linux_service/moneo_publisher.service > /etc/systemd/system/moneo_publisher.service - -$MONEO_PATH/src/worker/install/install.sh azure_monitor +if [[ "$PublisherMethod" == "geneva" ]]; then + # writes to the same file location as Azure monitor + cp $MONEO_PATH/linux_service/geneva_publisher.service /etc/systemd/system/moneo_publisher.service + $MONEO_PATH/src/worker/install/install.sh geneva +else + cp $MONEO_PATH/linux_service/moneo_publisher.service /etc/systemd/system/moneo_publisher.service + $MONEO_PATH/src/worker/install/install.sh azure_monitor +fi systemctl daemon-reload diff --git a/linux_service/geneva_publisher.service b/linux_service/geneva_publisher.service new file mode 100644 index 0000000..ee659d3 --- /dev/null +++ b/linux_service/geneva_publisher.service @@ -0,0 +1,13 @@ +[Unit] +Description=Moneo exporter service geneva +After=network.target + +[Service] +Type=simple +Restart=no +ExecStart=/usr/bin/python3 /tmp/moneo-worker/publisher/metrics_publisher.py geneva +User=root + + +[Install] +WantedBy=multi-user.target diff --git a/linux_service/moneo_publisher.service b/linux_service/moneo_publisher.service index 5bd019e..47bbe2d 100644 --- a/linux_service/moneo_publisher.service +++ b/linux_service/moneo_publisher.service @@ -1,5 +1,5 @@ [Unit] -Description=Moneo exporter service for %I +Description=Moneo exporter service for Azure Monitor After=network.target [Service] diff --git a/linux_service/start_moneo_services.sh 
b/linux_service/start_moneo_services.sh index 54f52f4..b5a35c3 100755 --- a/linux_service/start_moneo_services.sh +++ b/linux_service/start_moneo_services.sh @@ -1,11 +1,15 @@ #!/bin/bash -WITH_AZ_MON=$1 -# Two modes of operation managed prometheus (default) and az monitor (WITH_AZ_MON) enabled w/ WITH_AZ_MON=true -# User can modify this if they need to change -MONEO_PATH=/opt/azurehpc/tools/Moneo +# Usage: +# Managed Prometheus deployment: ./start_moneo_services.sh +# Azure Monitor: ./start_moneo_services.sh azure_monitor +# Geneva (internal msft): ./start_moneo_services.sh geneva +PublisherMethod=$1 +# Modify as necessary +MONEO_PATH=/opt/azurehpc/tools/Moneo +PUBLISHER_AUTH=umi # other choice is cert if [[ ! -d "$MONEO_PATH" ]]; then @@ -20,7 +24,13 @@ if lspci | grep -iq NVIDIA ; then procs+=("nvidia_exporter") fi -if [[ -n $WITH_AZ_MON && $WITH_AZ_MON = true ]]; then +if [[ -n $PublisherMethod ]]; then + if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then + echo "PublisherMethod is valid: $PublisherMethod" + else + echo "PublisherMethod is not one of the valid choices." 
+ exit 1 + fi procs+=("metrics_publisher") fi @@ -59,7 +69,10 @@ systemctl start moneo@node_exporter.service systemctl start moneo@net_exporter.service systemctl start moneo@nvidia_exporter.service -if [[ -n $WITH_AZ_MON && $WITH_AZ_MON = true ]]; then +if [[ -n $PublisherMethod ]]; then + if [ "$PublisherMethod" == "geneva" ]; then + $MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config + fi sleep 5 # wait a bit for the exporters to start systemctl enable moneo_publisher.service systemctl start moneo_publisher.service diff --git a/linux_service/stop_moneo_services.sh b/linux_service/stop_moneo_services.sh index 19982dc..c56ef8b 100755 --- a/linux_service/stop_moneo_services.sh +++ b/linux_service/stop_moneo_services.sh @@ -12,6 +12,10 @@ systemctl disable moneo_publisher.service if [[ $(docker ps -a | grep prometheus) ]]; then echo "Stopping Prometheus containers" - docker stop prometheus - docker rm prometheus + docker stop prometheus genevamdmagent + docker rm prometheus genevamdmagent +elif [[ $(docker ps -a | grep genevamdmagent) ]]; then + docker stop genevamdmagent + docker rm genevamdmagent + fi diff --git a/src/worker/install/install.sh b/src/worker/install/install.sh index f922518..785ed8f 100755 --- a/src/worker/install/install.sh +++ b/src/worker/install/install.sh @@ -13,7 +13,22 @@ else source $(dirname "${BASH_SOURCE[0]}")/common.sh fi -python3 -m pip uninstall opentelemetry-sdk azure-monitor-opentelemetry opentelemetry-exporter-otlp opentelemetry-exporter-otlp-proto-grpc opentelemetry-exporter-otlp-proto-http -y +# uninstall to deal with Azure monitor and Geneva differences +python3 -m pip uninstall \ +azure-monitor-opentelemetry-exporter \ +opentelemetry-instrumentation \ +opentelemetry-api \ +opentelemetry-sdk \ +azure-monitor-opentelemetry \ +opentelemetry-exporter-otlp \ +opentelemetry-exporter-otlp-proto-grpc \ +opentelemetry-exporter-otlp-proto-http \ +opentelemetry-instrumentation-django \ 
+opentelemetry-instrumentation-flask \ +opentelemetry-instrumentation-requests \ +opentelemetry-instrumentation-wsgi \ +opentelemetry-instrumentation-dbapi \ +opentelemetry-instrumentation-psycopg2 -y if [ -n "$PUBLISHER_INSTALL" ]; then