diff --git a/setup.py b/setup.py
index a05dcfda4..027add3eb 100644
--- a/setup.py
+++ b/setup.py
@@ -183,7 +183,7 @@ def run(self):
             **x,
             'develop': x['dev'] + x['test'],
             'cpuworker': x['torch'],
-            'amdworker': x['torch'] + x['ort'],
+            'amdworker': x['torch'] + x['ort'] + x['amd'],
             'nvworker': x['torch'] + x['ort'] + x['nvidia'],
         }
     )(
@@ -217,6 +217,7 @@ def run(self):
                 'onnxruntime-gpu; python_version>="3.10"',
             ],
             'nvidia': ['py3nvml>=0.2.6'],
+            'amd': ['pyrsmi>=1.0.1'],
         }
     ),
     include_package_data=True,
diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py
index 2a6a8a889..09398cac0 100644
--- a/superbench/common/utils/device_manager.py
+++ b/superbench/common/utils/device_manager.py
@@ -3,24 +3,138 @@
 
 """Device Managerment Library Utility."""
 
-import py3nvml.py3nvml as nvml
+from typing import Optional
 
 from superbench.common.utils import logger
 from superbench.common.utils import process
+from superbench.common.devices import GPU
+
+gpu = GPU()
+if gpu.vendor in ('nvidia', 'nvidia-graphics'):
+    import py3nvml.py3nvml as nvml
+elif gpu.vendor in ('amd', 'amd-graphics'):
+    from pyrsmi import rocml
 
 
 class DeviceManager:
-    """Device management module."""
+    """Device management base module."""
     def __init__(self):
         """Constructor."""
-        nvml.nvmlInit()
         self._device_count = self.get_device_count()
+
+    def get_device_count(self):
+        """Get the number of device.
+
+        Return:
+            count (int): count of device.
+        """
+        return 0
+
+    def get_device_compute_capability(self):
+        """Get the compute capability of device.
+
+        Return:
+            cap (float): the compute capability of device, None means failed to get the data.
+        """
+        return None
+
+    def get_device_utilization(self, idx):
+        """Get the utilization of device.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            util (int): the utilization of device, None means failed to get the data.
+        """
+        return None
+
+    def get_device_temperature(self, idx):
+        """Get the temperature of device, unit: celsius.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            temp (int): the temperature of device, None means failed to get the data.
+        """
+        return None
+
+    def get_device_power(self, idx):
+        """Get the realtime power of device, unit: watt.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            power (int): the realtime power of device, None means failed to get the data.
+        """
+        return None
+
+    def get_device_power_limit(self, idx):
+        """Get the power management limit of device, unit: watt.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            limit (int): the power management limit of device, None means failed to get the data.
+        """
+        return None
+
+    def get_device_memory(self, idx):
+        """Get the memory information of device, unit: byte.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            used (int): the used device memory in bytes, None means failed to get the data.
+            total (int): the total device memory in bytes, None means failed to get the data.
+        """
+        return None, None
+
+    def get_device_row_remapped_info(self, idx):
+        """Get the row remapped information of device.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            remapped_metrics (dict): the row remapped information, None means failed to get the data.
+        """
+        return None
+
+    def get_device_ecc_error(self, idx):
+        """Get the ecc error information of device.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            corrected_ecc (int) : the count of single bit ecc error.
+            uncorrected_ecc (int): the count of double bit ecc error.
+        """
+        return None, None
+
+
+class NvidiaDeviceManager(DeviceManager):
+    """Device management module for Nvidia."""
+    def __init__(self):
+        """Constructor."""
+        nvml.nvmlInit()
+        super().__init__()
+
         self._device_handlers = list()
         for i in range(self._device_count):
             self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
 
+    def __del__(self):
+        """Destructor."""
+        nvml.nvmlShutdown()
+
     def get_device_count(self):
-        """Get the compute capability of device.
+        """Get the number of device.
 
         Return:
             count (int): count of device.
@@ -79,7 +193,7 @@ def get_device_power(self, idx):
             idx (int): device index.
 
         Return:
-            temp (float): the realtime power of device, None means failed to get the data.
+            power (int): the realtime power of device, None means failed to get the data.
         """
         try:
             power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
@@ -95,7 +209,7 @@ def get_device_power_limit(self, idx):
             idx (int): device index.
 
         Return:
-            temp (float): the power management limit of device, None means failed to get the data.
+            limit (int): the power management limit of device, None means failed to get the data.
         """
         try:
             powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
@@ -111,8 +225,8 @@ def get_device_memory(self, idx):
             idx (int): device index.
 
         Return:
-            used (float): the used device memory, None means failed to get the data.
-            total (float): the total device memory, None means failed to get the data.
+            used (int): the used device memory in bytes, None means failed to get the data.
+            total (int): the total device memory in bytes, None means failed to get the data.
         """
         try:
             mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
@@ -208,4 +322,115 @@ def get_device_ecc_error(self, idx):
         return corrected_ecc, uncorrected_ecc
 
 
-device_manager = DeviceManager()
+class AmdDeviceManager(DeviceManager):
+    """Device management module for AMD."""
+    def __init__(self):
+        """Constructor."""
+        rocml.smi_initialize()
+        super().__init__()
+
+    def __del__(self):
+        """Destructor."""
+        rocml.smi_shutdown()
+
+    def get_device_count(self):
+        """Get the number of device.
+
+        Return:
+            count (int): count of device.
+        """
+        return rocml.smi_get_device_count()
+
+    def get_device_utilization(self, idx):
+        """Get the utilization of device.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            util (int): the utilization of device, None means failed to get the data.
+        """
+        try:
+            util = rocml.smi_get_device_utilization(idx)
+        except Exception as err:
+            logger.error('Get device utilization failed: {}'.format(str(err)))
+            return None
+        return util
+
+    def get_device_temperature(self, idx):
+        """Get the temperature of device, unit: celsius.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            temp (int): the temperature of device, None means failed to get the data.
+        """
+        # Currently no API provided in rocml.
+        return None
+
+    def get_device_power(self, idx):
+        """Get the realtime power of device, unit: watt.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            power (int): the realtime power of device, None means failed to get the data.
+        """
+        try:
+            power = rocml.smi_get_device_average_power(idx)
+        except Exception as err:
+            logger.error('Get device power failed: {}'.format(str(err)))
+            return None
+        return int(int(power) / 1000)
+
+    def get_device_power_limit(self, idx):
+        """Get the power management limit of device, unit: watt.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            limit (int): the power management limit of device, None means failed to get the data.
+        """
+        # Currently no API provided in rocml.
+        return None
+
+    def get_device_memory(self, idx):
+        """Get the memory information of device, unit: byte.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            used (int): the used device memory in bytes, None means failed to get the data.
+            total (int): the total device memory in bytes, None means failed to get the data.
+        """
+        try:
+            mem_used = rocml.smi_get_device_memory_used(idx)
+            mem_total = rocml.smi_get_device_memory_total(idx)
+        except Exception as err:
+            logger.error('Get device memory failed: {}'.format(str(err)))
+            return None, None
+        return mem_used, mem_total
+
+    def get_device_ecc_error(self, idx):
+        """Get the ecc error information of device.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            corrected_ecc (int) : the count of single bit ecc error.
+            uncorrected_ecc (int): the count of double bit ecc error.
+        """
+        # Currently no API provided in rocml.
+        return None, None
+
+
+device_manager: Optional[DeviceManager] = DeviceManager()
+if gpu.vendor in ('nvidia', 'nvidia-graphics'):
+    device_manager = NvidiaDeviceManager()
+elif gpu.vendor in ('amd', 'amd-graphics'):
+    device_manager = AmdDeviceManager()
diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index ca2b78093..bfff5cb7c 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -218,14 +218,14 @@ def exec(self):
 
             monitor = None
             if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
-                if self.__get_platform() == Platform.CUDA:
+                if self.__get_platform() is not Platform.CPU:
                     monitor = Monitor(
                         None, int(self._sb_monitor_config.sample_duration or 10),
                         int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
                     )
                     monitor.start()
                 else:
-                    logger.warning('Monitor can not support ROCM/CPU platform.')
+                    logger.warning('Monitor can not support CPU platform.')
 
             benchmark_real_name = benchmark_name.split(':')[0]
             for framework in benchmark_config.frameworks or [Framework.NONE.value]: