Skip to content

Commit

Permalink
Monitor - Upgrade pyrsmi to amdsmi python library. (#601)
Browse files Browse the repository at this point in the history
**Description**
Upgrade to the amdsmi Python library, since pyrsmi will be retired, as
suggested by AMD:

AMD SMI Python Library:
https://github.com/ROCm/amdsmi/tree/develop/py-interface
pyrsmi: https://github.com/RadeonOpenCompute/pyrsmi
  • Loading branch information
guoshzhao authored and abuccts committed Jan 3, 2024
1 parent b32b34c commit b150dcc
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 27 deletions.
4 changes: 4 additions & 0 deletions dockerfile/rocm5.7.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ RUN cd /opt/ && \
.. && \
make -j${NUM_MAKE_JOBS}

# Install AMD SMI Python Library
RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip install --user .

ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
Expand Down
78 changes: 51 additions & 27 deletions superbench/common/utils/device_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml
import amdsmi as rocml


class DeviceManager:
Expand Down Expand Up @@ -150,7 +150,7 @@ def get_device_compute_capability(self):
try:
cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
except Exception as err:
logger.error('Get device compute capability failed: {}'.format(str(err)))
logger.warning('Get device compute capability failed: {}'.format(str(err)))
return None
return cap

Expand All @@ -166,7 +166,7 @@ def get_device_utilization(self, idx):
try:
util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err)))
logger.warning('Get device utilization failed: {}'.format(str(err)))
return None
return util.gpu

Expand All @@ -182,7 +182,7 @@ def get_device_temperature(self, idx):
try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err:
logger.error('Get device temperature failed: {}'.format(str(err)))
logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp

Expand All @@ -198,7 +198,7 @@ def get_device_power(self, idx):
try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
except Exception as err:
logger.error('Get device power failed: {}'.format(str(err)))
logger.warning('Get device power failed: {}'.format(str(err)))
return None
return int(int(power) / 1000)

Expand All @@ -214,7 +214,7 @@ def get_device_power_limit(self, idx):
try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
except Exception as err:
logger.error('Get device power limitation failed: {}'.format(str(err)))
logger.warning('Get device power limitation failed: {}'.format(str(err)))
return None
return int(int(powerlimit) / 1000)

Expand All @@ -231,7 +231,7 @@ def get_device_memory(self, idx):
try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err)))
logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None
return mem.used, mem.total

Expand Down Expand Up @@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError:
pass
except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err)))
logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None

try:
Expand All @@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError:
pass
except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err)))
logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None

return corrected_ecc, uncorrected_ecc
Expand All @@ -326,20 +326,21 @@ class AmdDeviceManager(DeviceManager):
"""Device management module for AMD."""
def __init__(self):
"""Constructor."""
rocml.smi_initialize()
rocml.amdsmi_init()
self._device_handlers = rocml.amdsmi_get_processor_handles()
super().__init__()

def __del__(self):
"""Destructor."""
rocml.smi_shutdown()
rocml.amdsmi_shut_down()

def get_device_count(self):
"""Get the number of device.
Return:
count (int): count of device.
"""
return rocml.smi_get_device_count()
return len(self._device_handlers)

def get_device_utilization(self, idx):
"""Get the utilization of device.
Expand All @@ -351,11 +352,11 @@ def get_device_utilization(self, idx):
util (int): the utilization of device, None means failed to get the data.
"""
try:
util = rocml.smi_get_device_utilization(idx)
engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx])
except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err)))
logger.warning('Get device utilization failed: {}'.format(str(err)))
return None
return util
return engine_usage['gfx_activity']

def get_device_temperature(self, idx):
"""Get the temperature of device, unit: celsius.
Expand All @@ -366,8 +367,16 @@ def get_device_temperature(self, idx):
Return:
temp (int): the temperature of device, None means failed to get the data.
"""
# Currently no API provided in rocml.
return None
try:
temp = rocml.amdsmi_get_temp_metric(
self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
)
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None
return temp

def get_device_power(self, idx):
"""Get the realtime power of device, unit: watt.
Expand All @@ -379,11 +388,11 @@ def get_device_power(self, idx):
temp (int): the realtime power of device, None means failed to get the data.
"""
try:
power = rocml.smi_get_device_average_power(idx)
power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err:
logger.error('Get device power failed: {}'.format(str(err)))
logger.warning('Get device power failed: {}'.format(str(err)))
return None
return int(int(power) / 1000)
return int(power_measure['average_socket_power'])

def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt.
Expand All @@ -394,8 +403,12 @@ def get_device_power_limit(self, idx):
Return:
temp (int): the power management limit of device, None means failed to get the data.
"""
# Currently no API provided in rocml.
return None
try:
power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
except Exception as err:
logger.warning('Get device power limit failed: {}'.format(str(err)))
return None
return int(power_measure['power_limit'])

def get_device_memory(self, idx):
"""Get the memory information of device, unit: byte.
Expand All @@ -408,10 +421,10 @@ def get_device_memory(self, idx):
total (int): the total device memory in bytes, None means failed to get the data.
"""
try:
mem_used = rocml.smi_get_device_memory_used(idx)
mem_total = rocml.smi_get_device_memory_total(idx)
mem_used = rocml.amdsmi_get_gpu_memory_usage(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
mem_total = rocml.amdsmi_get_gpu_memory_total(self._device_handlers[idx], rocml.AmdSmiMemoryType.VRAM)
except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err)))
logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None
return mem_used, mem_total

Expand All @@ -425,8 +438,19 @@ def get_device_ecc_error(self, idx):
corrected_ecc (int) : the count of single bit ecc error.
uncorrected_ecc (int): the count of double bit ecc error.
"""
# Currently no API provided in rocml.
return None, None
corrected_ecc = 0
uncorrected_ecc = 0
for block in rocml.AmdSmiGpuBlock:
try:
ecc_count = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], block)
corrected_ecc += ecc_count['correctable_count']
uncorrected_ecc += ecc_count['uncorrectable_count']
except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
pass
except Exception as err:
logger.info('Get device ECC information failed: {}'.format(str(err)))

return corrected_ecc, uncorrected_ecc


device_manager: Optional[DeviceManager] = DeviceManager()
Expand Down

0 comments on commit b150dcc

Please sign in to comment.