From ffbd529bb49714bea4e43c0ab9284d82dd7b9d1c Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 10:28:00 +0800 Subject: [PATCH 1/8] add monitor support for amd device. --- setup.py | 3 +- superbench/common/utils/device_manager.py | 234 +++++++++++++++++++++- 2 files changed, 228 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 23c796833..0ab901645 100644 --- a/setup.py +++ b/setup.py @@ -183,7 +183,7 @@ def run(self): **x, 'develop': x['dev'] + x['test'], 'cpuworker': x['torch'], - 'amdworker': x['torch'] + x['ort'], + 'amdworker': x['torch'] + x['ort'] + x['amd'], 'nvworker': x['torch'] + x['ort'] + x['nvidia'], } )( @@ -216,6 +216,7 @@ def run(self): 'onnxruntime-gpu==1.10.0', ], 'nvidia': ['py3nvml>=0.2.6'], + 'amd': ['pyrsmi>=1.0.2'], } ), include_package_data=True, diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 2a6a8a889..cef32cab4 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -7,20 +7,130 @@ from superbench.common.utils import logger from superbench.common.utils import process +from superbench.common.devices import GPU + +gpu = GPU() +if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': + import py3nvml.py3nvml as nvml +elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': + from pyrsmi import rocml class DeviceManager: - """Device management module.""" + """Device management base module.""" def __init__(self): """Constructor.""" - nvml.nvmlInit() self._device_count = self.get_device_count() + + def get_device_count(self): + """Get the number of device. + + Return: + count (int): count of device. + """ + return 0 + + def get_device_compute_capability(self): + """Get the compute capability of device. + + Return: + cap (float): the compute capability of device, None means failed to get the data. + """ + return None + + def get_device_utilization(self, idx): + """Get the utilization of device. + + Args: + idx (int): device index. + + Return: + util (int): the utilization of device, None means failed to get the data. + """ + return None + + def get_device_temperature(self, idx): + """Get the temperature of device, unit: celsius. + + Args: + idx (int): device index. + + Return: + temp (int): the temperature of device, None means failed to get the data. + """ + return None + + def get_device_power(self, idx): + """Get the realtime power of device, unit: watt. + + Args: + idx (int): device index. + + Return: + temp (int): the realtime power of device, None means failed to get the data. + """ + return None + + def get_device_power_limit(self, idx): + """Get the power management limit of device, unit: watt. + + Args: + idx (int): device index. + + Return: + temp (int): the power management limit of device, None means failed to get the data. + """ + return None + + def get_device_memory(self, idx): + """Get the memory information of device, unit: byte. + + Args: + idx (int): device index. + + Return: + used (int): the used device memory in bytes, None means failed to get the data. + total (int): the total device memory in bytes, None means failed to get the data. + """ + return None, None + + def get_device_row_remapped_info(self, idx): + """Get the row remapped information of device. + + Args: + idx (int): device index. + + Return: + remapped_metrics (dict): the row remapped information, None means failed to get the data. + """ + return None + + def get_device_ecc_error(self, idx): + """Get the ecc error information of device. + + Args: + idx (int): device index. + + Return: + corrected_ecc (int) : the count of single bit ecc error. + uncorrected_ecc (int): the count of double bit ecc error. + """ + return None, None + + +class NvidiaDeviceManager(DeviceManager): + """Device management module for Nvidia.""" + def __init__(self): + """Constructor.""" + nvml.nvmlInit() + super().__init__() + self._device_handlers = list() for i in range(self._device_count): self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i)) def get_device_count(self): - """Get the compute capability of device. + """Get the number of device. Return: count (int): count of device. @@ -79,7 +189,7 @@ def get_device_power(self, idx): idx (int): device index. Return: - temp (float): the realtime power of device, None means failed to get the data. + temp (int): the realtime power of device, None means failed to get the data. """ try: power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) @@ -95,7 +205,7 @@ def get_device_power_limit(self, idx): idx (int): device index. Return: - temp (float): the power management limit of device, None means failed to get the data. + temp (int): the power management limit of device, None means failed to get the data. """ try: powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) @@ -111,8 +221,8 @@ def get_device_memory(self, idx): idx (int): device index. Return: - used (float): the used device memory, None means failed to get the data. - total (float): the total device memory, None means failed to get the data. + used (int): the used device memory in bytes, None means failed to get the data. + total (int): the total device memory in bytes, None means failed to get the data. """ try: mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) @@ -208,4 +318,112 @@ def get_device_ecc_error(self, idx): return corrected_ecc, uncorrected_ecc -device_manager = DeviceManager() +class AmdDeviceManager(DeviceManager): + """Device management module for AMD.""" + def __init__(self): + """Constructor.""" + rocml.smi_initialize() + super().__init__() + + def get_device_count(self): + """Get the number of device. + + Return: + count (int): count of device. + """ + return rocml.smi_get_device_count() + + def get_device_utilization(self, idx): + """Get the utilization of device. + + Args: + idx (int): device index. + + Return: + util (int): the utilization of device, None means failed to get the data. + """ + try: + util = rocml.smi_get_device_utilization(idx) + except Exception as err: + logger.error('Get device utilization failed: {}'.format(str(err))) + return None + return util + + def get_device_temperature(self, idx): + """Get the temperature of device, unit: celsius. + + Args: + idx (int): device index. + + Return: + temp (int): the temperature of device, None means failed to get the data. + """ + # Currently no API provided in rocml. + return None + + def get_device_power(self, idx): + """Get the realtime power of device, unit: watt. + + Args: + idx (int): device index. + + Return: + temp (int): the realtime power of device, None means failed to get the data. + """ + try: + power = rocml.smi_get_device_average_power(idx) + except Exception as err: + logger.error('Get device power failed: {}'.format(str(err))) + return None + return int(int(power) / 1000) + + def get_device_power_limit(self, idx): + """Get the power management limit of device, unit: watt. + + Args: + idx (int): device index. + + Return: + temp (int): the power management limit of device, None means failed to get the data. + """ + # Currently no API provided in rocml. + return None + + def get_device_memory(self, idx): + """Get the memory information of device, unit: byte. + + Args: + idx (int): device index. + + Return: + used (int): the used device memory in bytes, None means failed to get the data. + total (int): the total device memory in bytes, None means failed to get the data. + """ + try: + mem_used = rocml.smi_get_device_memory_used(idx) + mem_total = rocml.smi_get_device_memory_total(idx) + except Exception as err: + logger.error('Get device memory failed: {}'.format(str(err))) + return None, None + return mem_used, mem_total + + def get_device_ecc_error(self, idx): + """Get the ecc error information of device. + + Args: + idx (int): device index. + + Return: + corrected_ecc (int) : the count of single bit ecc error. + uncorrected_ecc (int): the count of double bit ecc error. + """ + # Currently no API provided in rocml. + return None, None + + +device_namager = DeviceManager() + +if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': + device_manager = NvidiaDeviceManager() +elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': + device_manager = AmdDeviceManager() From 04279e4bcdc112135b8e739b02025efa9950c16d Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 10:46:26 +0800 Subject: [PATCH 2/8] fix --- superbench/common/utils/device_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index cef32cab4..133bc6fd3 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -3,8 +3,6 @@ """Device Managerment Library Utility.""" -import py3nvml.py3nvml as nvml - from superbench.common.utils import logger from superbench.common.utils import process from superbench.common.devices import GPU From 42a715d31e2db7f9d06909512317d1b76eae9a5d Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 10:58:19 +0800 Subject: [PATCH 3/8] add destructor. --- superbench/common/utils/device_manager.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 133bc6fd3..8af749a10 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -127,6 +127,10 @@ def __init__(self): for i in range(self._device_count): self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i)) + def __del__(self): + """Destructor.""" + nvml.nvmlShutdown() + def get_device_count(self): """Get the number of device. @@ -323,6 +327,10 @@ def __init__(self): rocml.smi_initialize() super().__init__() + def __del__(self): + """Destructor.""" + rocml.smi_shutdown() + def get_device_count(self): """Get the number of device. From 86c82c175a45de36f1cd4dde24e6dd392c08fdd6 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 11:46:59 +0800 Subject: [PATCH 4/8] fix lint --- superbench/common/utils/device_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index 8af749a10..09398cac0 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -3,6 +3,8 @@ """Device Managerment Library Utility.""" +from typing import Optional + from superbench.common.utils import logger from superbench.common.utils import process from superbench.common.devices import GPU @@ -427,8 +429,7 @@ def get_device_ecc_error(self, idx): return None, None -device_namager = DeviceManager() - +device_manager: Optional[DeviceManager] = DeviceManager() if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': device_manager = NvidiaDeviceManager() elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': From f7a96dedeb89e9be5d722cbe1b8cf6840e945232 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 12:13:38 +0800 Subject: [PATCH 5/8] change pyrsmi version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0ab901645..faebc57dc 100644 --- a/setup.py +++ b/setup.py @@ -216,7 +216,7 @@ def run(self): 'onnxruntime-gpu==1.10.0', ], 'nvidia': ['py3nvml>=0.2.6'], - 'amd': ['pyrsmi>=1.0.2'], + 'amd': ['pyrsmi>=1.0.1'], } ), include_package_data=True, From 4edb27b6a338ab8f331adbf6b41e542c2077893d Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 14:53:09 +0800 Subject: [PATCH 6/8] enable in executor --- superbench/executor/executor.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index ca2b78093..572efbe64 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -218,14 +218,11 @@ def exec(self): monitor = None if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable: - if self.__get_platform() == Platform.CUDA: - monitor = Monitor( - None, int(self._sb_monitor_config.sample_duration or 10), - int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name) - ) - monitor.start() - else: - logger.warning('Monitor can not support ROCM/CPU platform.') + monitor = Monitor( + None, int(self._sb_monitor_config.sample_duration or 10), + int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name) + ) + monitor.start() benchmark_real_name = benchmark_name.split(':')[0] for framework in benchmark_config.frameworks or [Framework.NONE.value]: From 000d29e92f7168586ce724a7146d811f27f23739 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 23 Nov 2023 20:16:57 +0800 Subject: [PATCH 7/8] reduce the time for UT. --- superbench/config/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 1a6af7dc5..2ea39ca6f 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -3,7 +3,7 @@ version: v0.9 superbench: enable: null monitor: - enable: true + enable: false sample_duration: 1 sample_interval: 10 var: From 31c1be053f5edfedfd7cfd85804a59db8eab4392 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Sun, 26 Nov 2023 18:20:47 +0800 Subject: [PATCH 8/8] revert some changes --- superbench/config/default.yaml | 2 +- superbench/executor/executor.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 2ea39ca6f..1a6af7dc5 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -3,7 +3,7 @@ version: v0.9 superbench: enable: null monitor: - enable: false + enable: true sample_duration: 1 sample_interval: 10 var: diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index 572efbe64..bfff5cb7c 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -218,11 +218,14 @@ def exec(self): monitor = None if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable: - monitor = Monitor( - None, int(self._sb_monitor_config.sample_duration or 10), - int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name) - ) - monitor.start() + if self.__get_platform() is not Platform.CPU: + monitor = Monitor( + None, int(self._sb_monitor_config.sample_duration or 10), + int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name) + ) + monitor.start() + else: + logger.warning('Monitor can not support CPU platform.') benchmark_real_name = benchmark_name.split(':')[0] for framework in benchmark_config.frameworks or [Framework.NONE.value]: