Skip to content

Commit

Permalink
Merge pull request #191 from rapidsai/branch-0.10
Browse files Browse the repository at this point in the history
Forward merge v0.10 => v0.11
  • Loading branch information
raydouglass authored Apr 15, 2024
2 parents 7a2a56a + 05567f7 commit 578b58b
Show file tree
Hide file tree
Showing 28 changed files with 646 additions and 425 deletions.
4 changes: 4 additions & 0 deletions .github/ops-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This file controls which features from the `ops-bot` repository below are enabled.
# - https://github.com/rapidsai/ops-bot

forward_merger: true
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rapids-dependency-file-generator \
--file_key checks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

rapids-mamba-retry env create --force -f env.yaml -n checks
rapids-mamba-retry env create --yes -f env.yaml -n checks
conda activate checks

rapids-logger "Run pre-commit checks - Python backend"
Expand Down
4 changes: 2 additions & 2 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ rapids-dependency-file-generator \
--file_key test_python \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

rapids-mamba-retry env create --force -f env.yaml -n test
rapids-mamba-retry env create --yes -f env.yaml -n test

# Temporarily allow unbound variables for conda activation.
set +u
conda activate test
set -u

# rapids-logger "Downloading artifacts from previous jobs"
rapids-logger "Downloading artifacts from previous jobs"
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

rapids-print-env
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/all_arch-any.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ dependencies:
- psutil
- pynvml
- pytest
- pytest-asyncio
- pytest-jupyter[server]>=0.6.0
- python>=3.8
- websockets
name: all_arch-any
2 changes: 2 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,5 @@ dependencies:
packages:
- pytest
- pytest-jupyter[server]>=0.6.0
- pytest-asyncio
- websockets
11 changes: 4 additions & 7 deletions jupyterlab_nvdashboard/apps/cpu.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import json
import psutil
import time
import tornado
from jupyter_server.base.handlers import APIHandler
from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler


class CPUResourceHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class CPUResourceWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
now = time.time()
stats = {
"time": now * 1000,
Expand All @@ -18,5 +16,4 @@ def get(self):
"network_read": psutil.net_io_counters().bytes_recv,
"network_write": psutil.net_io_counters().bytes_sent,
}
self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))
41 changes: 16 additions & 25 deletions jupyterlab_nvdashboard/apps/gpu.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import json
from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler
import pynvml
import time
import tornado
from jupyter_server.base.handlers import APIHandler

try:
pynvml.nvmlInit()
Expand Down Expand Up @@ -41,19 +40,17 @@
pci_gen = None


class GPUUtilizationHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUUtilizationWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
gpu_utilization = [
pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
for i in range(ngpus)
]
self.finish(json.dumps({"gpu_utilization": gpu_utilization}))
self.write_message(json.dumps({"gpu_utilization": gpu_utilization}))


class GPUUsageHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUUsageWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
memory_usage = [
pynvml.nvmlDeviceGetMemoryInfo(handle).used
for handle in gpu_handles
Expand All @@ -64,16 +61,15 @@ def get(self):
for handle in gpu_handles
]

self.finish(
self.write_message(
json.dumps(
{"memory_usage": memory_usage, "total_memory": total_memory}
)
)


class GPUResourceHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUResourceWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
now = time.time()
stats = {
"time": now * 1000,
Expand Down Expand Up @@ -118,15 +114,13 @@ def get(self):
stats["gpu_memory_total"] = round(
(stats["gpu_memory_total"] / gpu_mem_sum) * 100, 2
)
self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))


class NVLinkThroughputHandler(APIHandler):
class NVLinkThroughputWebSocketHandler(CustomWebSocketHandler):
prev_throughput = None

@tornado.web.authenticated
def get(self):
def send_data(self):
throughput = [
pynvml.nvmlDeviceGetFieldValues(
handle,
Expand Down Expand Up @@ -162,9 +156,8 @@ def get(self):
# Store the current throughput for the next request
self.prev_throughput = throughput

self.set_header("Content-Type", "application/json")
# Send the change in throughput as part of the response
self.write(
self.write_message(
json.dumps(
{
"nvlink_rx": [
Expand All @@ -191,9 +184,8 @@ def get(self):
)


class PCIStatsHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class PCIStatsWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
# Use device-0 to get "upper bound"
pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
pci_bw = {
Expand Down Expand Up @@ -231,5 +223,4 @@ def get(self):
"max_rxtx_tp": max_rxtx_tp,
}

self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))
31 changes: 31 additions & 0 deletions jupyterlab_nvdashboard/apps/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from tornado.websocket import WebSocketHandler
import tornado
import json


class CustomWebSocketHandler(WebSocketHandler):
    """Base WebSocket handler that pushes data to the client periodically.

    Subclasses override :meth:`send_data` to write one payload per tick.
    A client may send a JSON object with the optional keys
    ``updateFrequency`` (new callback interval, passed to
    ``PeriodicCallback`` as its ``callback_time``) and ``isPaused``
    (truthy to leave the stream stopped).
    """

    # NOTE(review): no authentication check is performed in this class;
    # confirm that access control is enforced elsewhere.

    # Interval, in milliseconds, used until the client requests another.
    DEFAULT_UPDATE_FREQUENCY = 1000

    def open(self, *args, **kwargs):
        """Greet the client and start the periodic ``send_data`` callback.

        Positional and keyword arguments (URL pattern groups forwarded by
        Tornado) are accepted and ignored.
        """
        self.write_message(json.dumps({"status": "connected"}))
        self.set_nodelay(True)
        self._update_frequency = self.DEFAULT_UPDATE_FREQUENCY
        self.callback = tornado.ioloop.PeriodicCallback(
            self.send_data, self._update_frequency
        )
        self.callback.start()

    def on_message(self, message):
        """Apply a client request to retune or pause the data stream.

        The message is validated *before* the running callback is touched,
        so a malformed message cannot leave the stream stopped. Messages
        that are not JSON objects, or whose ``updateFrequency`` is not a
        positive number, are ignored. A missing ``updateFrequency`` keeps
        the current interval; a missing ``isPaused`` means "not paused".
        """
        try:
            message_data = json.loads(message)
        except (TypeError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            return
        if not isinstance(message_data, dict):
            return

        current_frequency = getattr(
            self, "_update_frequency", self.DEFAULT_UPDATE_FREQUENCY
        )
        new_frequency = message_data.get("updateFrequency", current_frequency)
        is_number = isinstance(new_frequency, (int, float)) and not isinstance(
            new_frequency, bool
        )
        # ``not (x > 0)`` also rejects NaN, which ``x <= 0`` would let by.
        if not is_number or not (new_frequency > 0):
            return

        # Replace the periodic callback with one using the new interval.
        previous = getattr(self, "callback", None)
        if previous is not None:
            previous.stop()
        self._update_frequency = new_frequency
        self.callback = tornado.ioloop.PeriodicCallback(
            self.send_data, new_frequency
        )
        if not message_data.get("isPaused", False):
            self.callback.start()

    def on_close(self):
        """Stop the periodic callback once the connection is closed."""
        callback = getattr(self, "callback", None)
        if callback is not None and callback.is_running():
            callback.stop()

    def send_data(self):
        """Send one payload to the client; overridden by subclasses."""
        pass
12 changes: 6 additions & 6 deletions jupyterlab_nvdashboard/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ def setup_handlers(web_app):
base_url, URL_PATH, "nvlink_throughput"
)
handlers += [
(route_pattern_gpu_util, apps.gpu.GPUUtilizationHandler),
(route_pattern_gpu_usage, apps.gpu.GPUUsageHandler),
(route_pattern_gpu_resource, apps.gpu.GPUResourceHandler),
(route_pattern_pci_stats, apps.gpu.PCIStatsHandler),
(route_pattern_gpu_util, apps.gpu.GPUUtilizationWebSocketHandler),
(route_pattern_gpu_usage, apps.gpu.GPUUsageWebSocketHandler),
(route_pattern_gpu_resource, apps.gpu.GPUResourceWebSocketHandler),
(route_pattern_pci_stats, apps.gpu.PCIStatsWebSocketHandler),
(
route_pattern_nvlink_throughput,
apps.gpu.NVLinkThroughputHandler,
apps.gpu.NVLinkThroughputWebSocketHandler,
),
]

Expand All @@ -41,7 +41,7 @@ def setup_handlers(web_app):
)

handlers += [
(route_pattern_cpu_resource, apps.cpu.CPUResourceHandler),
(route_pattern_cpu_resource, apps.cpu.CPUResourceWebSocketHandler),
]

web_app.add_handlers(host_pattern, handlers)
2 changes: 2 additions & 0 deletions jupyterlab_nvdashboard/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def pytest_configure(config):
    """Register the ``asyncio`` marker in pytest's ``markers`` ini value."""
    marker_line = "asyncio: mark test as asyncio"
    config.addinivalue_line("markers", marker_line)
37 changes: 37 additions & 0 deletions jupyterlab_nvdashboard/tests/test_cpu_handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import json
import pytest
from unittest.mock import MagicMock, patch

from jupyterlab_nvdashboard.apps.cpu import CPUResourceWebSocketHandler


@pytest.fixture
def mock_handler(monkeypatch):
    """Swap ``write_message`` for a ``MagicMock`` and return that mock."""
    write_message_mock = MagicMock()
    target = (
        "jupyterlab_nvdashboard.apps.cpu.CustomWebSocketHandler.write_message"
    )
    monkeypatch.setattr(target, write_message_mock)
    return write_message_mock


@pytest.fixture
def handler_args():
    """Yield patched ``(Application, HTTPServerRequest)`` stand-ins."""
    application_patch = patch("tornado.web.Application")
    request_patch = patch("tornado.httputil.HTTPServerRequest")
    # Both patches stay active for the lifetime of the requesting test.
    with application_patch as mock_application, request_patch as mock_request:
        yield mock_application, mock_request


def test_cpu_resource_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload carrying every CPU metric key."""
    application, request = handler_args
    CPUResourceWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])

    expected_keys = (
        "time",
        "cpu_utilization",
        "memory_usage",
        "disk_read",
        "disk_write",
        "network_read",
        "network_write",
    )
    for key in expected_keys:
        assert key in payload
80 changes: 80 additions & 0 deletions jupyterlab_nvdashboard/tests/test_gpu_handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import json
import pytest
from unittest.mock import MagicMock, patch

from jupyterlab_nvdashboard.apps.gpu import (
GPUUtilizationWebSocketHandler,
GPUUsageWebSocketHandler,
GPUResourceWebSocketHandler,
NVLinkThroughputWebSocketHandler,
PCIStatsWebSocketHandler,
)


@pytest.fixture
def mock_handler(monkeypatch):
    """Swap ``write_message`` for a ``MagicMock`` and return that mock."""
    write_message_mock = MagicMock()
    target = (
        "jupyterlab_nvdashboard.apps.gpu.CustomWebSocketHandler.write_message"
    )
    monkeypatch.setattr(target, write_message_mock)
    return write_message_mock


@pytest.fixture
def handler_args():
    """Yield patched ``(Application, HTTPServerRequest)`` stand-ins."""
    application_patch = patch("tornado.web.Application")
    request_patch = patch("tornado.httputil.HTTPServerRequest")
    # Both patches stay active for the lifetime of the requesting test.
    with application_patch as mock_application, request_patch as mock_request:
        yield mock_application, mock_request


def test_gpu_utilization_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload with ``gpu_utilization``."""
    application, request = handler_args
    GPUUtilizationWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])
    assert "gpu_utilization" in payload


def test_gpu_usage_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload with the memory usage keys."""
    application, request = handler_args
    GPUUsageWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])
    for key in ("memory_usage", "total_memory"):
        assert key in payload


def test_gpu_resource_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload with every GPU resource key."""
    application, request = handler_args
    GPUResourceWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])

    expected_keys = (
        "time",
        "gpu_utilization_total",
        "gpu_memory_total",
        "rx_total",
        "tx_total",
        "gpu_memory_individual",
        "gpu_utilization_individual",
    )
    for key in expected_keys:
        assert key in payload


def test_nvlink_throughput_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload with the NVLink keys."""
    application, request = handler_args
    NVLinkThroughputWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])
    for key in ("nvlink_rx", "nvlink_tx", "max_rxtx_bw"):
        assert key in payload


def test_pci_stats_handler(mock_handler, handler_args):
    """``send_data`` writes a JSON payload with the PCI statistics keys."""
    application, request = handler_args
    PCIStatsWebSocketHandler(application, request).send_data()

    # The first positional argument of write_message is the JSON string.
    positional, _ = mock_handler.call_args
    payload = json.loads(positional[0])
    for key in ("pci_tx", "pci_rx", "max_rxtx_tp"):
        assert key in payload
Loading

0 comments on commit 578b58b

Please sign in to comment.