From bbc1cda1c3105b4bbdf1951e029491bd7151dc08 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 8 Jan 2024 12:50:50 -0500 Subject: [PATCH] Replace GPUtil with pynvml for benchmark reports (#1451) + Replace `GPUtil` with `pynvml` to collect GPU stats for benchmark reports. + Remove `GPUtil` from dependency YAML files. `pynvml` is already installed via `nvtabular`. Closes #1446 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1451 --- ci/conda/recipes/morpheus/meta.yaml | 4 +- .../all_cuda-118_arch-x86_64.yaml | 1 - .../dev_cuda-118_arch-x86_64.yaml | 1 - dependencies.yaml | 3 +- docker/conda/environments/cuda11.8_dev.yml | 3 +- .../morpheus/benchmarks/conftest.py | 51 +++++++++++++----- tests/benchmarks/conftest.py | 52 ++++++++++++++----- 7 files changed, 79 insertions(+), 36 deletions(-) diff --git a/ci/conda/recipes/morpheus/meta.yaml b/ci/conda/recipes/morpheus/meta.yaml index 365cecd448..039920e272 100644 --- a/ci/conda/recipes/morpheus/meta.yaml +++ b/ci/conda/recipes/morpheus/meta.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -110,7 +110,7 @@ outputs: - {{ pin_compatible('cudatoolkit', min_pin='x.x', max_pin='x') }} test: requires: - - gputil + - pynvml - pytest - pytest-cov - pytest-benchmark diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d2557e605b..884973ebf0 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -46,7 +46,6 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs -- gputil - grpcio - gxx_linux-64=11.2 - huggingface_hub=0.10.1 diff --git a/conda/environments/dev_cuda-118_arch-x86_64.yaml b/conda/environments/dev_cuda-118_arch-x86_64.yaml index c0d127a07f..b13d2e8d64 100644 --- a/conda/environments/dev_cuda-118_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-118_arch-x86_64.yaml @@ -38,7 +38,6 @@ dependencies: - flake8 - gcc_linux-64=11.2 - git-lfs -- gputil - grpcio - gxx_linux-64=11.2 - include-what-you-use=0.20 diff --git a/dependencies.yaml b/dependencies.yaml index fd035b733d..6b31678d78 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -235,7 +235,6 @@ dependencies: - dill - elasticsearch==8.9.0 - feedparser=6.0.10 - - gputil - grpcio - mlflow>=2.2.1,<3 - nb_conda_kernels diff --git a/docker/conda/environments/cuda11.8_dev.yml b/docker/conda/environments/cuda11.8_dev.yml index 876330634c..5ee09141b0 100644 --- a/docker/conda/environments/cuda11.8_dev.yml +++ b/docker/conda/environments/cuda11.8_dev.yml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -57,7 +57,6 @@ dependencies: - git>=2.35.3 # Needed for wildcards on safe.directory - glog=0.6 - gmock>=1.13.0 - - gputil - grpcio - gtest>=1.13.0 - gxx_linux-64=11.2 diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py index eaaadf9236..63c0dccc0a 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,7 +17,8 @@ import json from os import path -import GPUtil +from pynvml.smi import NVSMI_QUERY_GPU +from pynvml.smi import nvidia_smi from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF @@ -32,18 +33,40 @@ def pytest_benchmark_update_json(config, benchmarks, output_json): # pylint:dis curr_dir = path.dirname(path.abspath(__file__)) - gpus = GPUtil.getGPUs() - - for i, gpu in enumerate(gpus): - # output_json["machine_info"]["gpu_" + str(i)] = gpu.name - output_json["machine_info"]["gpu_" + str(i)] = {} - output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id - output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name - output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%" - output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB" - output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB" - output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C" - output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid + query_opts = NVSMI_QUERY_GPU.copy() + nvsmi = nvidia_smi.getInstance() + device_query = nvsmi.DeviceQuery([ + query_opts["driver_version"], + query_opts["count"], + query_opts["index"], + query_opts["gpu_name"], + query_opts["gpu_uuid"], + query_opts["memory.total"], + query_opts["memory.used"], + query_opts["memory.free"], + query_opts["utilization.gpu"], + query_opts["utilization.memory"], + query_opts["temperature.gpu"] + ]) + + output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"] + + for gpu in device_query["gpu"]: + gpu_num = gpu["minor_number"] + output_json["machine_info"]["gpu_" + gpu_num] = {} + output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num + output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"] + output_json["machine_info"][ + "gpu_" + 
gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}" + output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"] for bench in output_json['benchmarks']: diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index ff83e4b9a3..f877612bb6 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,25 +19,49 @@ import typing from unittest import mock -import GPUtil import pytest +from pynvml.smi import NVSMI_QUERY_GPU +from pynvml.smi import nvidia_smi from test_bench_e2e_pipelines import E2E_TEST_CONFIGS # pylint: disable=unused-argument def pytest_benchmark_update_json(config, benchmarks, output_json): - gpus = GPUtil.getGPUs() - - for i, gpu in enumerate(gpus): - # output_json["machine_info"]["gpu_" + str(i)] = gpu.name - output_json["machine_info"]["gpu_" + str(i)] = {} - output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id - output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name - output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%" - output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB" - output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB" - output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C" - output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid + + query_opts = NVSMI_QUERY_GPU.copy() + nvsmi = nvidia_smi.getInstance() + device_query = nvsmi.DeviceQuery([ + query_opts["driver_version"], + query_opts["count"], + query_opts["index"], + query_opts["gpu_name"], + query_opts["gpu_uuid"], + query_opts["memory.total"], + query_opts["memory.used"], + query_opts["memory.free"], + query_opts["utilization.gpu"], + query_opts["utilization.memory"], + query_opts["temperature.gpu"] + ]) + + output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"] + + for gpu in device_query["gpu"]: + gpu_num = gpu["minor_number"] + output_json["machine_info"]["gpu_" + gpu_num] = {} + output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num + output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"] + output_json["machine_info"][ + "gpu_" + gpu_num]["utilization"] = 
f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}" + output_json["machine_info"][ + "gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}" + output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"] for bench in output_json['benchmarks']: if bench["name"] not in E2E_TEST_CONFIGS: