Skip to content

Commit

Permalink
Update Moneo Exporter for MI300 (#81)
Browse files Browse the repository at this point in the history
* update moneo for mi300

* fix comments
  • Loading branch information
RyoYang authored Jun 4, 2024
1 parent 845ebe2 commit 891fc3e
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 32 deletions.
31 changes: 31 additions & 0 deletions dockerfile/moneo-exporter-amd.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Moneo exporter image for AMD GPUs.
# Builds gRPC and RDC through src/worker/install/amd.sh and starts the
# exporters through moneo-exporter-amd_entrypoint.sh.

# Use ROCm development image
FROM rocm/dev-ubuntu-22.04:6.1.1

# Metadata
LABEL maintainer="Moneo"

# ROCm release this image targets (kept in the runtime environment).
ENV ROCM_VERSION=6.1.1

# Build-time only: keeps apt/debconf from prompting during the RUN steps
# without leaking the setting into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Work directory setup
WORKDIR /root/Moneo

# Installing packages and setting up Python.
# This layer does not depend on the repository contents, so it runs before
# `COPY . .` and stays cached when only source files change.
# `sudo` stays in the package list because install/amd.sh calls `sudo tee`.
# The apt package lists are left in place on purpose: install/amd.sh installs
# additional apt packages in a later step.
RUN apt-get update -y && apt-get install -y --no-install-recommends \
        cmake \
        curl \
        git \
        ibverbs-utils \
        libcap2-bin \
        libgomp1 \
        numactl \
        python3-pip \
        python3.10 \
        sudo \
        systemd \
        wget && \
    ln -sf python3.10 /usr/bin/python3 && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir prometheus_client psutil

# Bring in the repository (the build context root is expected to be the
# repository root, since paths such as dockerfile/... are copied below).
COPY . .

# RDC installation
# The build already runs as root, so the script is invoked directly; going
# through sudo would also reset the environment and drop DEBIAN_FRONTEND.
WORKDIR /root/Moneo/src/worker
RUN bash install/amd.sh

# Set EntryPoint
COPY dockerfile/moneo-exporter-amd_entrypoint.sh .
RUN chmod +x moneo-exporter-amd_entrypoint.sh

# Final CMD
# The relative script path and the relative paths used inside the script
# (shutdown.sh, exporters/...) resolve against the WORKDIR set above.
CMD ["/bin/bash", "moneo-exporter-amd_entrypoint.sh"]
27 changes: 27 additions & 0 deletions dockerfile/moneo-exporter-amd_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Entrypoint for the Moneo AMD exporter container.
# Starts the AMD RDC daemon plus the AMD, network and node exporters in the
# background, then blocks until the first of them exits and exits with that
# process's status. All relative paths resolve against the current working
# directory.
set -e

# Sends SIGTERM to every background job started by this script.
terminate_children() {
    local pids
    pids=$(jobs -p)
    if [ -n "$pids" ]; then
        # shellcheck disable=SC2086  # word splitting of the PID list is intended
        kill $pids 2>/dev/null || true
    fi
}

# This script is the container's main process, and a shell only reacts to
# SIGTERM/SIGINT there when a trap is installed. Without these traps a
# `docker stop` would wait for its timeout and then SIGKILL everything.
trap 'terminate_children; exit 143' TERM
trap 'terminate_children; exit 130' INT

# Stops previous instances of AMD RDC Daemon and Exporter
bash shutdown.sh

# Launches AMD RDC Daemon (detached from stdin/stdout/stderr)
nohup /opt/rocm/bin/rdcd -u </dev/null >/dev/null 2>&1 &

# Initiates AMD and Network Exporters
echo "Starting AMD and Network Exporters"

# Starts AMD Exporter
python3 exporters/amd_exporter.py &
echo "AMD Exporter Started!"

# Starts Network Exporter with specified InfiniBand sysfs path
# NOTE(review): the flag spelling is kept exactly as-is; it has to match the
# option name defined by net_exporter.py - verify before "correcting" it.
python3 exporters/net_exporter.py --inifiband_sysfs=/hostsys/class/infiniband &
echo "Network Exporter Started!"

# Starts Node Exporter
python3 exporters/node_exporter.py &
echo "Node Exporter Started!"

# Waits for any background process to exit and returns its exit status.
# With `set -e`, a non-zero status from `wait -n` ends the script with that
# status right away; the explicit `exit` covers the zero-status case.
wait -n
exit $?
19 changes: 11 additions & 8 deletions src/worker/exporters/amd_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@
sys.path.extend([
'/opt/rocm/libexec/rocm_smi', # ROCm >=5.2
'/opt/rocm/rocm_smi/bindings', # ROCm <5.2
'/opt/rocm/rdc/python_binding',
'/opt/rdc/python_binding',
])

from rsmiBindings import rocmsmi, rsmi_status_t
from rsmiBindings import *
from RdcReader import RdcReader
from rdc_bootstrap import * # noqa: F403

PRINT_JSON = True
rocmsmi = initRsmiBindings(silent=PRINT_JSON)

RDC_FIELDS = [
# PID
# rdc_field_t.RDC_FI_DEV_COMPUTE_PIDS,
Expand Down Expand Up @@ -51,8 +54,8 @@
# rdc_field_t.RDC_FI_PROF_NVLINK_TX_BYTES,
# rdc_field_t.RDC_FI_PROF_NVLINK_RX_BYTES,
# PCIe
rdc_field_t.RDC_FI_PCIE_TX,
rdc_field_t.RDC_FI_PCIE_RX,
# rdc_field_t.RDC_FI_PCIE_TX,
# rdc_field_t.RDC_FI_PCIE_RX,
]


Expand Down Expand Up @@ -82,18 +85,18 @@ def init_connection(self):
logging.info('Publishing fields: {}'.format(','.join(field_name_list)))

def init_gauges(self):
self.guages = {}
self.gauges = {}
for field_id in self.field_ids:
field_name = self.rdc_util.field_id_string(field_id).lower()
self.guages[field_id] = prometheus_client.Gauge(
self.gauges[field_id] = prometheus_client.Gauge(
'rdc_{}'.format(field_name),
'rdc_{}'.format(field_name),
['gpu_id', 'gpu_uuid'],
)

def handle_field(self, gpu_id, value):
if value.field_id.value in self.guages:
self.guages[value.field_id.value].labels(
if value.field_id.value in self.gauges:
self.gauges[value.field_id.value].labels(
gpu_id,
rdc_config['device_uuid'][gpu_id],
).set(value.value.l_int)
Expand Down
65 changes: 41 additions & 24 deletions src/worker/install/amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,46 @@ source ./$(dirname "${BASH_SOURCE[0]}")/common.sh
apt-get install -y automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev libc++-dev curl libcap-dev

# install grpc
export GRPC_LIB_DIR=/usr/local/lib
git clone -b v1.28.1 https://github.com/grpc/grpc /opt/grpc ||:
cd /opt/grpc
git submodule update --init
mkdir -p cmake/build
cd cmake/build
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON ../..
make -j
make install
echo ${GRPC_LIB_DIR} | tee /etc/ld.so.conf.d/grpc.conf
export GRPC_ROOT=/opt/grpc

# install RDC
export RDC_LIB_DIR=/opt/rocm/rdc/lib
git clone https://github.com/RadeonOpenCompute/rdc /opt/rdc ||:
mkdir -p /opt/rdc/build
cd /opt/rdc/build
cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="/usr/local" ..
make -j
make install
cat > /etc/ld.so.conf.d/x86_64-librdc_client.conf <<EOF
${GRPC_LIB_DIR}
${GRPC_LIB_DIR}64
${RDC_LIB_DIR}
${RDC_LIB_DIR}64
EOF
# Check if the directory exists and is not empty
if [ -d "$GRPC_ROOT" ] && [ "$(ls -A $GRPC_ROOT)" ]; then
cd "$GRPC_ROOT"
git pull
else
git clone -b v1.61.0 https://github.com/grpc/grpc --depth=1 --shallow-submodules --recurse-submodules "$GRPC_ROOT"
cd "$GRPC_ROOT"
fi
cmake -B build \
-DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_INSTALL_PREFIX="$GRPC_ROOT" \
-DCMAKE_INSTALL_LIBDIR=lib \
-DCMAKE_BUILD_TYPE=Release
make -C build -j $(nproc)
make -C build install
echo "$GRPC_ROOT" | sudo tee /etc/ld.so.conf.d/grpc.conf

# install rdc
export RDC_ROOT=/opt/rdc
# Check if the directory exists and is not empty
if [ -d "$RDC_ROOT" ] && [ "$(ls -A $RDC_ROOT)" ]; then
cd "$RDC_ROOT"
git pull
else
git clone --depth 1 --branch rocm-6.1.1 https://github.com/RadeonOpenCompute/rdc "$RDC_ROOT"
cd "$RDC_ROOT"
fi

# default installation location is /opt/rocm, specify with -DROCM_DIR or -DCMAKE_INSTALL_PREFIX
cmake -B build -DGRPC_ROOT="$GRPC_ROOT" -DROCM_DIR="/opt/rocm" -DCMAKE_INSTALL_PREFIX="/opt/rocm"
make -C build -j $(nproc)
make -C build install

# Update ldconfig
export RDC_LIB_DIR=/opt/rocm/lib/rdc
export GRPC_LIB_DIR=/opt/grpc/lib
echo -e "${GRPC_LIB_DIR}\n${GRPC_LIB_DIR}64" | sudo tee /etc/ld.so.conf.d/x86_64-librdc_client.conf
echo -e "${RDC_LIB_DIR}\n${RDC_LIB_DIR}64" | sudo tee -a /etc/ld.so.conf.d/x86_64-librdc_client.conf
ldconfig

0 comments on commit 891fc3e

Please sign in to comment.