Skip to content

Commit

Permalink
Print warning if gpu metrics cannot be reported (#219)
Browse files Browse the repository at this point in the history
  • Loading branch information
aniezurawski authored Apr 6, 2020
1 parent 0a84c1d commit 1e3e105
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
14 changes: 13 additions & 1 deletion neptune/internal/hardware/gpu/gpu_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,19 @@
# limitations under the License.
#

import logging
import time

from py3nvml.py3nvml import NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, \
nvmlDeviceGetUtilizationRates, nvmlInit

_logger = logging.getLogger(__name__)

class GPUMonitor(object):

nvml_error_time = 0
nvml_error_period = 30

def get_card_count(self):
return self.__nvml_get_or_else(nvmlDeviceGetCount, default=0)

Expand Down Expand Up @@ -51,5 +59,9 @@ def __nvml_get_or_else(self, getter, default=None):
try:
nvmlInit()
return getter()
except NVMLError:
except NVMLError as e:
timestamp = time.time()
if timestamp - GPUMonitor.nvml_error_time > GPUMonitor.nvml_error_period:
_logger.warning("NVMLError: %s - GPU usage metrics may not be reported.", e)
GPUMonitor.nvml_error_time = timestamp
return default
7 changes: 4 additions & 3 deletions neptune/internal/hardware/metrics/reports/metric_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,16 @@ def report(self, timestamp):
return [
MetricReport(
metric=metric,
values=[self.__metric_value_for_gauge(gauge, timestamp) for gauge in metric.gauges]
values=[x for x in [self.__metric_value_for_gauge(gauge, timestamp) for gauge in metric.gauges] if x]
)
for metric in self.__metrics
]

def __metric_value_for_gauge(self, gauge, timestamp):
    """Build a MetricValue for a single gauge reading, or None if there is none.

    The gauge is read exactly once. A reading of ``None`` means the gauge
    could not produce a value, and ``None`` is returned so the caller can
    skip it.

    :param gauge: object exposing ``name()`` and ``value()``.
    :param timestamp: time of the reading; ``running_time`` is computed
        relative to ``self.__reference_timestamp``.
    :return: a ``MetricValue`` for the reading, or ``None`` when the gauge
        returned ``None``.
    """
    value = gauge.value()
    # Compare against None explicitly: a plain truthiness test would also
    # discard legitimate zero readings (e.g. 0% utilization).
    if value is None:
        return None
    return MetricValue(
        timestamp=timestamp,
        running_time=timestamp - self.__reference_timestamp,
        gauge_name=gauge.name(),
        value=value
    )

0 comments on commit 1e3e105

Please sign in to comment.