diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 926c6de..82a9f30 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -21,7 +21,7 @@ jobs: include: - name: moneo-exporter dockerfile: moneo-exporter-nvidia - tags: azmoneo/moneo-exporter:nvidia + tags: azmoneo/moneo-exporter:latest steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockerfile/moneo-exporter-nvidia.dockerfile b/dockerfile/moneo-exporter-nvidia.dockerfile index 516fba2..7e96f37 100644 --- a/dockerfile/moneo-exporter-nvidia.dockerfile +++ b/dockerfile/moneo-exporter-nvidia.dockerfile @@ -1,29 +1,44 @@ -FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04 +FROM nvcr.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04 LABEL maintainer="Moneo" ARG BRANCH_OR_TAG=main + +ENV DCGM_VERSION=3.1.1 +ENV OFED_VERSION=23.07-0.5.1.2 -ENV PROFILING false +ENV PROFILING=false +ENV GPU_SAMPLE_RATE=2 # Install dependencies -RUN apt-get update -y \ - && apt-get install -y \ - --no-install-recommends \ - git \ - curl \ - sudo \ - wget \ - libgomp1 \ - python3.8 \ +RUN apt-get update -y \ + && apt-get install -y \ + --no-install-recommends \ + numactl \ + git \ + curl \ + sudo \ + systemd \ + wget \ + libgomp1 \ + libcap2-bin \ + datacenter-gpu-manager \ + python3.10 \ python3-pip -# Link python3 to python3.8 -RUN cd /usr/bin/ \ - && rm python3 \ - && ln -s /usr/bin/python3.8 python3 +# Link python3 to python3.10 +RUN cd /usr/bin/ \ + && rm python3 \ + && ln -s /usr/bin/python3.10 python3 RUN python3 -m pip install --upgrade pip +# Install OFED +RUN cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + # Clone Moneo repository
RUN git config --global advice.detachedHead false RUN git clone --branch ${BRANCH_OR_TAG} https://github.com/Azure/Moneo.git @@ -35,4 +50,4 @@ RUN sudo bash install/nvidia.sh # Set EntryPoint COPY dockerfile/moneo-exporter-nvidia_entrypoint.sh . RUN chmod +x moneo-exporter-nvidia_entrypoint.sh -CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} +CMD /bin/bash moneo-exporter-nvidia_entrypoint.sh ${PROFILING} ${GPU_SAMPLE_RATE} diff --git a/dockerfile/moneo-exporter-nvidia_entrypoint.sh b/dockerfile/moneo-exporter-nvidia_entrypoint.sh index 9bd0dfa..5c5a453 100755 --- a/dockerfile/moneo-exporter-nvidia_entrypoint.sh +++ b/dockerfile/moneo-exporter-nvidia_entrypoint.sh @@ -2,20 +2,19 @@ set -e enable_profiling=$1 -# Start NVIDIA DCGM Daemon -echo "Starting NVIDIA DCGM Daemon" -nv-hostengine +gpu_sample_rate=${2:-2} -# Start NVIDIA and Net Exporter -echo "Starting NVIDIA and Net Exporter" +# Start NVIDIA, Net and Node Exporter +echo "Starting NVIDIA, Net and Node Exporter" if [ $enable_profiling = true ]; then - python3 exporters/nvidia_exporter.py -m & + python3 exporters/nvidia_exporter.py -m -s "$gpu_sample_rate" & else - python3 exporters/nvidia_exporter.py & + python3 exporters/nvidia_exporter.py -s "$gpu_sample_rate" & fi python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband & +python3 exporters/node_exporter.py & wait -n exit $? diff --git a/docs/Moneo-exporter.md b/docs/Moneo-exporter.md index 0afc253..090f9ab 100644 --- a/docs/Moneo-exporter.md +++ b/docs/Moneo-exporter.md @@ -56,20 +56,23 @@ docker run --rm --runtime=nvidia --net=host -e PROFILING= + -e GPU_SAMPLE_RATE= --cap-add SYS_ADMIN -v /sys:/hostsys -itd moneo-exporter-nvidia:latest ``` -2. Check the port 8000 and 8001 is up, which is the moneo-exporter listening to: +2.
Check that ports 8000, 8001, and 8002 are up; these are the ports moneo-exporter listens on: ```bash root@azureuser:~$ sudo netstat -tulpn | grep LISTEN | grep python3 tcp 0 0 0.0.0.0:8000 0.0.0.0:* LISTEN 94787/python3 tcp 0 0 0.0.0.0:8001 0.0.0.0:* LISTEN 94788/python3 +tcp 0 0 0.0.0.0:8002 0.0.0.0:* LISTEN 94789/python3 ``` 3. Get the prometheus metrics from Moneo-exporter. ```bash curl localhost:8000 curl localhost:8001 +curl localhost:8002 ``` You can see the following prometheus metrics just as below, which means moneo-exporter can work normally. ```bash @@ -99,4 +102,27 @@ ib_port_xmit_data{ib_port="mlx5_ib1:1",ib_sys_guid="********",job_id="None"} 0.0 ib_port_xmit_data{ib_port="mlx5_ib6:1",ib_sys_guid="********",job_id="None"} 0.0 ib_port_xmit_data{ib_port="mlx5_ib4:1",ib_sys_guid="********",job_id="None"} 0.0 ... +root@azureuser:~$ curl localhost:8002 +... +# HELP python_gc_objects_collected_total Objects collected during gc +# TYPE python_gc_objects_collected_total counter +python_gc_objects_collected_total{generation="0"} 104.0 +python_gc_objects_collected_total{generation="1"} 304.0 +python_gc_objects_collected_total{generation="2"} 0.0 +# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC +# TYPE python_gc_objects_uncollectable_total counter +python_gc_objects_uncollectable_total{generation="0"} 0.0 +python_gc_objects_uncollectable_total{generation="1"} 0.0 +python_gc_objects_uncollectable_total{generation="2"} 0.0 +# HELP node_mem_available node_mem_available +# TYPE node_mem_available gauge +node_mem_available{job_id="None"} 1.841545956e+09 +# HELP node_mem_util node_mem_util +# TYPE node_mem_util gauge +node_mem_util{job_id="None"} 0.9 +# HELP node_xid_error node_xid_error +# TYPE node_xid_error gauge +# HELP node_link_flap node_link_flap +# TYPE node_link_flap gauge +...
``` \ No newline at end of file diff --git a/src/worker/deploy_docker.sh b/src/worker/deploy_docker.sh index 3adafa8..26e1f3a 100755 --- a/src/worker/deploy_docker.sh +++ b/src/worker/deploy_docker.sh @@ -8,7 +8,7 @@ if [ -e "/dev/nvidiactl" ]; then docker pull $IMAGE docker rm --force $CONT_NAME && \ - docker run --name=$CONT_NAME --net=host \ - -e PROFILING=$PROFILING --rm --runtime=nvidia \ + docker run --name=$CONT_NAME --net=host --restart=unless-stopped \ + -e PROFILING=$PROFILING --runtime=nvidia \ --cap-add SYS_ADMIN -v /sys:/hostsys/ -itd $IMAGE else diff --git a/src/worker/start_managed_prometheus.sh b/src/worker/start_managed_prometheus.sh index 05172cf..4e62435 100755 --- a/src/worker/start_managed_prometheus.sh +++ b/src/worker/start_managed_prometheus.sh @@ -1,7 +1,7 @@ #!/bin/bash INSTANCE_NAME=$(hostname) -WORK_DIR=/tmp/moneo-worker +WORK_DIR="${1:-/tmp/moneo-worker}" PROM_CONFIG=$WORK_DIR/prometheus.yml CONFIG_DIR=$WORK_DIR/publisher/config MANAGED_PROM_CONFIG=$CONFIG_DIR/managed_prom_config.json @@ -63,6 +63,7 @@ mkdir -m 777 /mnt/prometheus docker rm -f prometheus || true docker run --name prometheus \ -it --net=host -d -p 9090:9090 \ + --restart=unless-stopped \ -v /mnt/prometheus:/prometheus \ -v $PROM_CONFIG:/etc/prometheus/prometheus.yml \ prom/prometheus \