Skip to content

Commit

Permalink
Update Moneo Exporter (#73)
Browse files Browse the repository at this point in the history
* update moneo exporter to have node exporter


* remove useless line

* make work_dir configurable in start_managed_prometheus.sh

* change to unless-stopped policy

---------

Co-authored-by: Ubuntu <yangwang1@yangwang1-00000A.fv005psvkzgurogsijonfbxuhd.jx.internal.cloudapp.net>
  • Loading branch information
RyoYang and Ubuntu authored Jan 2, 2024
1 parent 89abfbe commit 70c0a2d
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
include:
- name: moneo-exporter
dockerfile: moneo-exporter-nvidia
tags: azmoneo/moneo-exporter:nvidia
tags: azmoneo/moneo-exporter:latest
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
45 changes: 30 additions & 15 deletions dockerfile/moneo-exporter-nvidia.dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,44 @@
FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04
FROM nvcr.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04

LABEL maintainer="Moneo"

ARG BRANCH_OR_TAG=main

ENV DCGM_VERSION=3.1.1
ENV OFED_VERSION=23.07-0.5.1.2
ENV PROFILING false
ENV GPU_SAMPLE_RATE 2

# Install dependencies
RUN apt-get update -y \
&& apt-get install -y \
--no-install-recommends \
git \
curl \
sudo \
wget \
libgomp1 \
python3.8 \
RUN apt-get update -y \
&& apt-get install -y \
--no-install-recommends \
numactl \
git \
curl \
sudo \
systemd \
wget \
libgomp1 \
libcap2-bin \
datacenter-gpu-manager \
python3.10 \
python3-pip

# Link python3 to python3.8
RUN cd /usr/bin/ \
&& rm python3 \
&& ln -s /usr/bin/python3.8 python3
# Link python3 to python3.10
RUN cd /usr/bin/ \
&& rm python3 \
&& ln -s /usr/bin/python3.10 python3

RUN python3 -m pip install --upgrade pip

# Install OFED
RUN cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Clone Moneo repository
RUN git config --global advice.detachedHead false
RUN git clone --branch ${BRANCH_OR_TAG} https://github.com/Azure/Moneo.git
Expand All @@ -35,4 +50,4 @@ RUN sudo bash install/nvidia.sh
# Set EntryPoint
COPY dockerfile/moneo-exporter-nvidia_entrypoint.sh .
RUN chmod +x moneo-exporter-nvidia_entrypoint.sh
CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING}
CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} ${GPU_SAMPLE_RATE}
13 changes: 6 additions & 7 deletions dockerfile/moneo-exporter-nvidia_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,19 @@
set -e

enable_profiling=$1
# Start NVIDIA DCGM Daemon
echo "Starting NVIDIA DCGM Daemon"
nv-hostengine
gpu_sample_rate=$2

# Start NVIDIA and Net Exporter
echo "Starting NVIDIA and Net Exporter"
# Start NVIDIA, Net and Node Exporter
echo "Starting NVIDIA, Net and Node Exporter"

if [ $enable_profiling = true ]; then
python3 exporters/nvidia_exporter.py -m &
python3 exporters/nvidia_exporter.py -m -s $gpu_sample_rate &
else
python3 exporters/nvidia_exporter.py &
python3 exporters/nvidia_exporter.py -s $gpu_sample_rate &
fi

python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband &
python3 exporters/node_exporter.py &

wait -n
exit $?
28 changes: 27 additions & 1 deletion docs/Moneo-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,23 @@ docker run
--rm --runtime=nvidia
--net=host
-e PROFILING=<true/false>
-e GPU_SAMPLE_RATE=<gpu_sample_rate:(1,2,10)>
--cap-add SYS_ADMIN
-v /sys:/hostsys
-itd moneo-exporter-nvidia:latest
```
2. Check the port 8000 and 8001 is up, which is the moneo-exporter listening to:
2. Check that ports 8000, 8001, and 8002 are up; these are the ports moneo-exporter listens on:
```bash
root@azureuser:~$ sudo netstat -tulpn | grep LISTEN | grep python3
tcp 0 0 0.0.0.0:8000 0.0.0.0:* LISTEN 94787/python3
tcp 0 0 0.0.0.0:8001 0.0.0.0:* LISTEN 94788/python3
tcp 0 0 0.0.0.0:8002 0.0.0.0:* LISTEN 94789/python3
```
3. Get the prometheus metrics from Moneo-exporter.
```bash
curl localhost:8000
curl localhost:8001
curl localhost:8002
```
You should see Prometheus metrics like the ones below, which means moneo-exporter is working normally.
```bash
Expand Down Expand Up @@ -99,4 +102,27 @@ ib_port_xmit_data{ib_port="mlx5_ib1:1",ib_sys_guid="********",job_id="None"} 0.0
ib_port_xmit_data{ib_port="mlx5_ib6:1",ib_sys_guid="********",job_id="None"} 0.0
ib_port_xmit_data{ib_port="mlx5_ib4:1",ib_sys_guid="********",job_id="None"} 0.0
...
root@azureuser:~$ curl localhost:8002
...
# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 104.0
python_gc_objects_collected_total{generation="1"} 304.0
python_gc_objects_collected_total{generation="2"} 0.0
# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP node_mem_available node_mem_available
# TYPE node_mem_available gauge
node_mem_available{job_id="None"} 1.841545956e+09
# HELP node_mem_util node_mem_util
# TYPE node_mem_util gauge
node_mem_util{job_id="None"} 0.9
# HELP node_xid_error node_xid_error
# TYPE node_xid_error gauge
# HELP node_link_flap node_link_flap
# TYPE node_link_flap gauge
...
```
2 changes: 1 addition & 1 deletion src/worker/deploy_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ if [ -e "/dev/nvidiactl" ]; then
docker pull $IMAGE

docker rm --force $CONT_NAME && \
docker run --name=$CONT_NAME --net=host \
docker run --name=$CONT_NAME --net=host --restart=unless-stopped \
-e PROFILING=$PROFILING --rm --runtime=nvidia \
--cap-add SYS_ADMIN -v /sys:/hostsys/ -itd $IMAGE
else
Expand Down
3 changes: 2 additions & 1 deletion src/worker/start_managed_prometheus.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

INSTANCE_NAME=$(hostname)
WORK_DIR=/tmp/moneo-worker
WORK_DIR="${1:-/tmp/moneo-worker}"
PROM_CONFIG=$WORK_DIR/prometheus.yml
CONFIG_DIR=$WORK_DIR/publisher/config
MANAGED_PROM_CONFIG=$CONFIG_DIR/managed_prom_config.json
Expand Down Expand Up @@ -63,6 +63,7 @@ mkdir -m 777 /mnt/prometheus
docker rm -f prometheus || true
docker run --name prometheus \
-it --net=host -d -p 9090:9090 \
--restart=unless-stopped \
-v /mnt/prometheus:/prometheus \
-v $PROM_CONFIG:/etc/prometheus/prometheus.yml \
prom/prometheus \
Expand Down

0 comments on commit 70c0a2d

Please sign in to comment.