Skip to content

Commit

Permalink
Geneva, docs, and fixes (#69)
Browse files Browse the repository at this point in the history
* modify services for geneva

* add stopping of Geneva container

* wrapping up some additional changes before release

* remove geneva tmp files

* addressed PR comments

* addressing PR comments

---------

Co-authored-by: Ubuntu <rafsalas@a100vm.dnkq5svzo1wedbjjy0q5ykz5bb.bx.internal.cloudapp.net>
Co-authored-by: Ubuntu <rafsalas@genevatestvm.1dqaosvq5e2u1cyy22xmqzfure.jx.internal.cloudapp.net>
  • Loading branch information
3 people authored Sep 22, 2023
1 parent 5f9877a commit 754ac94
Show file tree
Hide file tree
Showing 10 changed files with 131 additions and 54 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ Note: For more options check the Moneo help menu
- Slurm epilog/prolog integration: [Slurm example](./examples/slurm/README.md)
- To deploy moneo-worker inside container: [Moneo-exporter](./docs/Moneo-exporter.md)
- To integrate Moneo with Azure App Insights dashboard see: [Azure Monitor](./docs/AzureMonitorAgent.md)
- For Geneva ingestion (internal Microsoft) see: [Geneva](./docs/GenevaAgent.MD)

## Known Issues ##

Expand Down
56 changes: 28 additions & 28 deletions deploy_managed_infra/managed_infra_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -99,92 +99,92 @@
{
"record": "average_dcgm_gpu_temp",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_gpu_temp",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_gpu_temp",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_gpu_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_dcgm_memory_temp",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_memory_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_memory_temp",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_memory_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_memory_temp",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_memory_temp) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_dcgm_sm_clock",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_sm_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_sm_clock",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_sm_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_sm_clock",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_sm_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_dcgm_memory_clock",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_memory_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_memory_clock",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_memory_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_memory_clock",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_memory_clock) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_dcgm_gpu_utilization",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_gpu_utilization",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_gpu_utilization",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_gpu_utilization) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_dcgm_mem_copy_utilization",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_mem_copy_utilization",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_mem_copy_utilization",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_mem_copy_utilization) by (instance, subscription, cluster, job_id)"
}
],
"interval": "PT1M"
Expand Down Expand Up @@ -213,7 +213,7 @@
{
"record": "max_dcgm_power_usage",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_power_usage",
Expand All @@ -223,47 +223,47 @@
{
"record": "average_dcgm_total_energy_consumption",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(dcgm_total_energy_consumption) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_dcgm_total_energy_consumption",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(dcgm_total_energy_consumption) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_dcgm_total_energy_consumption",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(dcgm_total_energy_consumption) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_ib_port_xmit_data",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(ib_port_xmit_data) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_ib_port_xmit_data",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(ib_port_xmit_data) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_ib_port_xmit_data",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(ib_port_xmit_data) by (instance, subscription, cluster, job_id)"
},
{
"record": "average_ib_port_rcv_data",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "avg(ib_port_rcv_data) by (instance, subscription, cluster, job_id)"
},
{
"record": "max_ib_port_rcv_data",
"enabled": true,
"expression": "avg(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "max(ib_port_rcv_data) by (instance, subscription, cluster, job_id)"
},
{
"record": "min_ib_port_rcv_data",
"enabled": true,
"expression": "min(dcgm_power_usage) by (instance, subscription, cluster, job_id)"
"expression": "min(ib_port_rcv_data) by (instance, subscription, cluster, job_id)"
}
],
"interval": "PT1M"
Expand Down
19 changes: 12 additions & 7 deletions docs/GenevaAgent.MD
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
Geneva Agent User Guide
=====
Description
-----
# Geneva Agent User Guide #

## Description ##

This guide will walk you through how to share your exporter metrics with Azure using the Geneva Agent.
Note: Currently this feature is in private preview and is turned off by default. As of now, only internal Microsoft Azure subscriptions are whitelisted, and we recommend you use Azure Monitor instead. For details, please refer to this doc: [Azure Monitor Agent](AzureMonitorAgent.md)

Prequisites:
## Prerequisites ##

1. Prepare Authentication for geneva agent:

a. User Managed Identity (umi)
Expand All @@ -19,23 +20,27 @@ Prequisites:

b. Certificate (cert)
- Create Private key pem and public key pem files by:

```bash
openssl genrsa 2048 > mdm-key.pem
openssl req -x509 -new -key mdm-key.pem -out mdm-cert.pem
```

- Print out and get the certificate thumbprint:

```
openssl x509 -in mdm-cert.pem -noout -sha1 -fingerprint
```

- Replace `src/worker/publisher/config/mdm-key.pem` and `src/worker/publisher/config/mdm-cert.pem` with the key/cert pair generated above.
- Register the thumbprint on geneva portal with a metricsPublisher role.
2. Currently the only supported OS's are Ubuntu 20.04+ and Mariner.
3. python3-dev installed on all nodes.
4. PSSH installed on manager nodes.
5. Ensure passwordless ssh is set up in your environment.
Steps
-----
## Steps ##
1. Ensure that all prerequisites are met.
2. Deploy Moneo
- Full deployment with umi auth:
Expand Down
16 changes: 11 additions & 5 deletions linux_service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Three launch methods provided:
3. Launch exporters and an [Azure Monitor](../docs/AzureMonitorAgent.md) publisher.
- Before launch you must modify the "azure_monitor_agent_config" section of [publisher_config](../src/worker/publisher/config/publisher_config.json) file with the Azure Monitor workspace connection string.

There is one additional method for internal Msft use that exports to Geneva. This method is similar to the Azure Monitor method but uses a Geneva agent container to export. Reference the [Moneo Geneva Docs](../docs/GenevaAgent.MD). Ensure all prerequisites are met.

This guide will walk you through how to set up Linux services for Moneo exporters.

## Prerequisites ##
Expand All @@ -39,11 +41,12 @@ Below are the prereqs needed:

### Configuration and Installation ###

Configuration/Installation is only required once. Afte that is complete the Linux services can be started and stopped as desired.
Configuration/Installation is only required once. After that is complete the Linux services can be started and stopped as desired.

1. Configuration and installation of the Linux service is done with the following command:
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"```
- If you will only be launching the exporters without Azure Monitor or Managed Prometheus, continue to the Launch Services section; otherwise continue to the next step.
- Note: If using Azure Monitor or Geneva, add an extra argument: "./configure_service.sh azure_monitor" or "./configure_service.sh geneva" respectively.
- Note: Geneva authentication is user managed identity "umi" by default; you can switch to the "cert" method by modifying the "PUBLISHER_AUTH" variable in [the start script](./start_moneo_services.sh).

2. For Azure Monitor or Managed Prometheus methods if you have not yet modified the configuration files reference the following:
- For Azure Managed Prometheus:
Expand All @@ -58,9 +61,11 @@ Configuration/Installation is only required once. Afte that is complete the Linu

The [start_moneo_services.sh](./start_moneo_services.sh) script is used to start the Linux services once configuration/installation is complete.

#### Exporters with Azure Monitor ####
#### Exporters with Azure Monitor or Geneva (internal Msft) ####

```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh true"```
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh azure_monitor"```
or
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh geneva"```

#### Exporters with Managed Prometheus ####

Expand All @@ -75,12 +80,13 @@ Stopping services is the same command for all methods.

Assuming configuration files have been updated and user managed ID applied if necessary (Managed Prometheus) reference these commands for the work flow:

- Configuration/Install:
- Configuration/Install:
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/configure_service.sh"```
- Extra Configure step for AZ Monitor and/or Managed Prometheus
```parallel-scp -h hostfile /opt/azurehpc/tools/Moneo/src/worker/publisher/config/<Respective config file> /opt/azurehpc/tools/Moneo/src/worker/publisher/config```
- Start
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/start_moneo_services.sh"```
Note:
- Stop
```parallel-ssh -i -t 0 -h hostfile "sudo /opt/azurehpc/tools/Moneo/linux_service/stop_moneo_services.sh"```

Expand Down
28 changes: 24 additions & 4 deletions linux_service/configure_service.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
#!/bin/bash

# Managed Prometheus deployment: ./configure_service.sh
# Azure Monitor: ./configure_service.sh azure_monitor
# Geneva (internal msft): ./configure_service.sh geneva

PublisherMethod=$1

if [[ -n $PublisherMethod ]]; then
if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then
echo "PublisherMethod is valid: $PublisherMethod"
else
echo "PublisherMethod $PublisherMethod is not one of the valid choices {azure_monitor, geneva}."
exit 1
fi
fi

MONEO_PATH=/opt/azurehpc/tools/Moneo

if [[ ! -d "$MONEO_PATH" ]];
Expand All @@ -9,11 +24,16 @@ then
fi

# replace the Moneo path placeholder with the actual Moneo path and move the service file to the systemd directory
sed "s#<Moneo_Path>#$MONEO_PATH#g" $MONEO_PATH/linux_service/moneo@.service > /etc/systemd/system/moneo@.service
cp $MONEO_PATH/linux_service/moneo@.service /etc/systemd/system/moneo@.service

echo "configuring publisher service"
sed "s#<Moneo_Path>#$MONEO_PATH#g;" $MONEO_PATH/linux_service/moneo_publisher.service > /etc/systemd/system/moneo_publisher.service

$MONEO_PATH/src/worker/install/install.sh azure_monitor
if [[ "$PublisherMethod" == "geneva" ]]; then
# writes to the same file location as Azure monitor
cp $MONEO_PATH/linux_service/geneva_publisher.service /etc/systemd/system/moneo_publisher.service
$MONEO_PATH/src/worker/install/install.sh geneva
else
cp $MONEO_PATH/linux_service/moneo_publisher.service /etc/systemd/system/moneo_publisher.service
$MONEO_PATH/src/worker/install/install.sh azure_monitor
fi

systemctl daemon-reload
13 changes: 13 additions & 0 deletions linux_service/geneva_publisher.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# systemd unit that runs the Moneo metrics publisher in "geneva" mode.
[Unit]
Description=Moneo exporter service geneva
# Order this unit after network.target.
After=network.target

[Service]
Type=simple
# Do not restart the publisher automatically when its process exits.
Restart=no
# Runs metrics_publisher.py from /tmp/moneo-worker with the "geneva" argument.
ExecStart=/usr/bin/python3 /tmp/moneo-worker/publisher/metrics_publisher.py geneva
User=root


[Install]
WantedBy=multi-user.target
2 changes: 1 addition & 1 deletion linux_service/moneo_publisher.service
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Moneo exporter service for %I
Description=Moneo exporter service for Azure Monitor
After=network.target

[Service]
Expand Down
25 changes: 19 additions & 6 deletions linux_service/start_moneo_services.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/bin/bash
WITH_AZ_MON=$1

# Two modes of operation managed prometheus (default) and az monitor (WITH_AZ_MON) enabled w/ WITH_AZ_MON=true

# User can modify this if they need to change
MONEO_PATH=/opt/azurehpc/tools/Moneo
# Usage:
# Managed Prometheus deployment: ./start_moneo_services.sh
# Azure Monitor: ./start_moneo_services.sh azure_monitor
# Geneva (internal msft): ./start_moneo_services.sh geneva
PublisherMethod=$1

# Modify as necessary
MONEO_PATH=/opt/azurehpc/tools/Moneo
PUBLISHER_AUTH=umi # other choice is cert

if [[ ! -d "$MONEO_PATH" ]];
then
Expand All @@ -20,7 +24,13 @@ if lspci | grep -iq NVIDIA ; then
procs+=("nvidia_exporter")
fi

if [[ -n $WITH_AZ_MON && $WITH_AZ_MON = true ]]; then
if [[ -n $PublisherMethod ]]; then
if [ "$PublisherMethod" == "geneva" ] || [ "$PublisherMethod" == "azure_monitor" ]; then
echo "PublisherMethod is valid: $PublisherMethod"
else
echo "PublisherMethod is not one of the valid choices."
exit 1
fi
procs+=("metrics_publisher")
fi

Expand Down Expand Up @@ -59,7 +69,10 @@ systemctl start moneo@node_exporter.service
systemctl start moneo@net_exporter.service
systemctl start moneo@nvidia_exporter.service

if [[ -n $WITH_AZ_MON && $WITH_AZ_MON = true ]]; then
if [[ -n $PublisherMethod ]]; then
if [ "$PublisherMethod" == "geneva" ]; then
$MONEO_PATH/src/worker/start_geneva.sh $PUBLISHER_AUTH /tmp/moneo-worker/publisher/config
fi
sleep 5 # wait a bit for the exporters to start
systemctl enable moneo_publisher.service
systemctl start moneo_publisher.service
Expand Down
Loading

0 comments on commit 754ac94

Please sign in to comment.