From 891fc3ed84db7c40bcb193568d25ae52e67b9e93 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 4 Jun 2024 09:41:45 +0800 Subject: [PATCH] Update Moneo Exporter for MI300 (#81) * update moneo for mi300 * fix comments --- dockerfile/moneo-exporter-amd.dockerfile | 31 ++++++++++ dockerfile/moneo-exporter-amd_entrypoint.sh | 27 +++++++++ src/worker/exporters/amd_exporter.py | 19 +++--- src/worker/install/amd.sh | 65 +++++++++++++-------- 4 files changed, 110 insertions(+), 32 deletions(-) create mode 100644 dockerfile/moneo-exporter-amd.dockerfile create mode 100755 dockerfile/moneo-exporter-amd_entrypoint.sh diff --git a/dockerfile/moneo-exporter-amd.dockerfile b/dockerfile/moneo-exporter-amd.dockerfile new file mode 100644 index 0000000..e131926 --- /dev/null +++ b/dockerfile/moneo-exporter-amd.dockerfile @@ -0,0 +1,31 @@ +# Use ROCm development image +FROM rocm/dev-ubuntu-22.04:6.1.1 + +# Metadata +LABEL maintainer="Moneo" + +# Environment variables +ENV ROCM_VERSION=6.1.1 \ + DEBIAN_FRONTEND=noninteractive + +# Work directory setup +WORKDIR /root/Moneo +COPY . . + +# Installing packages and setting up Python +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + numactl git curl cmake ibverbs-utils sudo systemd wget libgomp1 libcap2-bin python3.10 python3-pip && \ + cd /usr/bin && rm python3 && ln -s python3.10 python3 && \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install prometheus_client psutil + +# # RDC installation +WORKDIR /root/Moneo/src/worker +RUN sudo bash install/amd.sh + +# Set EntryPoint +COPY dockerfile/moneo-exporter-amd_entrypoint.sh . 
+RUN chmod +x moneo-exporter-amd_entrypoint.sh + +# Final CMD +CMD ["/bin/bash", "moneo-exporter-amd_entrypoint.sh"] diff --git a/dockerfile/moneo-exporter-amd_entrypoint.sh b/dockerfile/moneo-exporter-amd_entrypoint.sh new file mode 100755 index 0000000..50112a8 --- /dev/null +++ b/dockerfile/moneo-exporter-amd_entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# Stops previous instances of AMD RDC Daemon and Exporter +bash shutdown.sh + +# Launches AMD RDC Daemon +nohup /opt/rocm/bin/rdcd -u </dev/null >/dev/null 2>&1 & + +# Initiates AMD and Network Exporters +echo "Starting AMD and Network Exporters" + +# Starts AMD Exporter +python3 exporters/amd_exporter.py & +echo "AMD Exporter Started!" + +# Starts Network Exporter with specified InfiniBand sysfs path +python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband & +echo "Network Exporter Started!" + +# Starts Node Exporter +python3 exporters/node_exporter.py & +echo "Node Exporter Started!" + +# Waits for any process to exit and returns the exit status +wait -n +exit $?
diff --git a/src/worker/exporters/amd_exporter.py b/src/worker/exporters/amd_exporter.py index e1122ef..71bebc0 100644 --- a/src/worker/exporters/amd_exporter.py +++ b/src/worker/exporters/amd_exporter.py @@ -10,13 +10,16 @@ sys.path.extend([ '/opt/rocm/libexec/rocm_smi', # ROCm >=5.2 '/opt/rocm/rocm_smi/bindings', # ROCm <5.2 - '/opt/rocm/rdc/python_binding', + '/opt/rdc/python_binding', ]) -from rsmiBindings import rocmsmi, rsmi_status_t +from rsmiBindings import * from RdcReader import RdcReader from rdc_bootstrap import * # noqa: F403 +PRINT_JSON = True +rocmsmi = initRsmiBindings(silent=PRINT_JSON) + RDC_FIELDS = [ # PID # rdc_field_t.RDC_FI_DEV_COMPUTE_PIDS, @@ -51,8 +54,8 @@ # rdc_field_t.RDC_FI_PROF_NVLINK_TX_BYTES, # rdc_field_t.RDC_FI_PROF_NVLINK_RX_BYTES, # PCIe - rdc_field_t.RDC_FI_PCIE_TX, - rdc_field_t.RDC_FI_PCIE_RX, + # rdc_field_t.RDC_FI_PCIE_TX, + # rdc_field_t.RDC_FI_PCIE_RX, ] @@ -82,18 +85,18 @@ def init_connection(self): logging.info('Publishing fields: {}'.format(','.join(field_name_list))) def init_gauges(self): - self.guages = {} + self.gauges = {} for field_id in self.field_ids: field_name = self.rdc_util.field_id_string(field_id).lower() - self.guages[field_id] = prometheus_client.Gauge( + self.gauges[field_id] = prometheus_client.Gauge( 'rdc_{}'.format(field_name), 'rdc_{}'.format(field_name), ['gpu_id', 'gpu_uuid'], ) def handle_field(self, gpu_id, value): - if value.field_id.value in self.guages: - self.guages[value.field_id.value].labels( + if value.field_id.value in self.gauges: + self.gauges[value.field_id.value].labels( gpu_id, rdc_config['device_uuid'][gpu_id], ).set(value.value.l_int) diff --git a/src/worker/install/amd.sh b/src/worker/install/amd.sh index fae0659..0d2d0a5 100644 --- a/src/worker/install/amd.sh +++ b/src/worker/install/amd.sh @@ -7,29 +7,46 @@ source ./$(dirname "${BASH_SOURCE[0]}")/common.sh apt-get install -y automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev 
libc++-dev curl libcap-dev # install grpc -export GRPC_LIB_DIR=/usr/local/lib -git clone -b v1.28.1 https://github.com/grpc/grpc /opt/grpc ||: -cd /opt/grpc -git submodule update --init -mkdir -p cmake/build -cd cmake/build -cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON ../.. -make -j -make install -echo ${GRPC_LIB_DIR} | tee /etc/ld.so.conf.d/grpc.conf +export GRPC_ROOT=/opt/grpc -# install RDC -export RDC_LIB_DIR=/opt/rocm/rdc/lib -git clone https://github.com/RadeonOpenCompute/rdc /opt/rdc ||: -mkdir -p /opt/rdc/build -cd /opt/rdc/build -cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="/usr/local" .. -make -j -make install -cat > /etc/ld.so.conf.d/x86_64-librdc_client.conf <