Skip to content

Commit

Permalink
Merge pull request #29 from mmcdermott/23_repeatable_benchmark
Browse files Browse the repository at this point in the history
Major revisions to the utility and generality of the benchmarking code.
  • Loading branch information
mmcdermott authored Sep 12, 2024
2 parents d239733 + 0f1dbb5 commit 1b41e1b
Show file tree
Hide file tree
Showing 31 changed files with 949 additions and 564 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/benchmark.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: Benchmark Performance
on:
push:
branches: [main]
pull_request:
branches: [main, "release/*", "dev"]

permissions:
contents: write
deployments: write

jobs:
benchmark:
name: Run benchmark
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.11

- name: Install packages
run: |
pip install -e .[tests,benchmarks]
pip install sample_dataset_builder/
- name: Run benchmark
run: |
pytest benchmark/run.py
- name: Store benchmark result
uses: benchmark-action/github-action-benchmark@v1
with:
name: Benchmark
tool: "customSmallerIsBetter"
output-file-path: benchmark/outputs/output_32_512_5.json
# Uses the workflow's GITHUB_TOKEN. If pushes made with it do not trigger a GitHub Pages build, switch to a personal access token; see https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096
github-token: ${{ secrets.GITHUB_TOKEN }}
auto-push: true
# Show alert with commit comment on detecting possible performance regression
alert-threshold: "200%"
comment-on-alert: true
fail-on-alert: true
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
#----------------------------------------------
- name: Run tests
run: |
pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=performance_tests
pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=benchmark --ignore=sample_dataset_builder
- name: Upload coverage to Codecov
uses: codecov/[email protected]
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,8 @@ cython_debug/
#.idea/

performance_tests_outputs/
sample_dataset/*.log
sample_dataset/memray_stats.json
sample_dataset/.memray
sample_dataset/.hydra/*
benchmark/outputs
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
default_language_version:
python: python3

exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
exclude: "sample_dataset/dataset.pkl"

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down Expand Up @@ -38,6 +38,7 @@ repos:
rev: v2.2.0
hooks:
- id: autoflake
args: [--in-place, --remove-all-unused-imports]

# python upgrading syntax to newer version
- repo: https://github.com/asottile/pyupgrade
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,10 +322,12 @@ which leverage Hugging Face's `safetensors` library for internal manipulation.

## Performance Testing

Run `python performance_tests/test_times.py` for a comparison across several strategies of using these data.
Performance over time on various aspects of an approximate pytorch dataset using this repo can be seen at
[https://mmcdermott.github.io/nested_ragged_tensors/dev/bench/](https://mmcdermott.github.io/nested_ragged_tensors/dev/bench/)

For example, to use a configuration of nested events and codes that is similar to the MIMIC-IV dataset, you
can run the below code (note this takes a lot of memory for the "dense" view of the data).
In older commits (see the GitHub history for more details), you could also run `python performance_tests/test_times.py` for a comparison across several strategies of using these data. A
re-introduction of this feature in a more user-friendly format is planned for the future, in concert with the
tracking over time of the performance of this package documented at the above link.

```bash
python performance_tests/test_times.py dataset_spec=mimic dataset_spec.num_patients=1250 dataset_spec.max_events_per_item=256 batch_size=64
Expand Down
6 changes: 6 additions & 0 deletions benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Performance Benchmark

This directory contains a repeatable performance benchmark of the nested ragged tensor (NRT) code, run on a sample dataset.

If a sample dataset does not exist, you can generate it using the `sample_dataset_builder` package included in
this repo.
File renamed without changes.
150 changes: 150 additions & 0 deletions benchmark/benchmarkable_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import logging
import os
import sys
from abc import ABC, abstractmethod
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any

import torch
from memray import Tracker
from mixins import TimeableMixin
from torch.utils.data import DataLoader, Dataset

from sample_dataset_builder import SAMPLE_DATASET_T

logger = logging.getLogger(__name__)

import json
import subprocess


def get_memray_stats(memray_tracker_fp: Path, memray_stats_fp: Path) -> dict:
    """Summarize a memray capture file and return the parsed JSON statistics.

    Runs ``memray stats <tracker> --json -o <stats> -f`` as a subprocess and then loads the JSON report that
    the command writes to ``memray_stats_fp``.

    Args:
        memray_tracker_fp: Path to the memray capture file to summarize.
        memray_stats_fp: Path the JSON report is written to and then read back from.

    Returns:
        The decoded contents of the JSON report.

    Raises:
        subprocess.CalledProcessError: If the ``memray stats`` command exits with a non-zero status.
        FileNotFoundError: If the ``memray`` executable cannot be found.
        ValueError: If the report cannot be read or parsed.
    """
    # Pass an argument list and skip the shell so that paths containing spaces or shell metacharacters are
    # handed to memray verbatim instead of being split or interpreted.
    memray_stats_cmd = [
        "memray",
        "stats",
        str(memray_tracker_fp),
        "--json",
        "-o",
        str(memray_stats_fp),
        "-f",
    ]
    subprocess.run(memray_stats_cmd, check=True, capture_output=True)
    try:
        return json.loads(memray_stats_fp.read_text())
    except Exception as e:
        raise ValueError(f"Failed to parse memray stats file at {memray_stats_fp}") from e


@contextmanager
def TrackMemoryIn(memory_stats: dict):
    """Context manager that records memray statistics for its body into ``memory_stats``.

    The body runs under a ``memray.Tracker`` (with ``follow_fork=True``) that writes its capture into a
    temporary directory. When the body exits -- normally or through an exception -- the capture is summarized
    with ``get_memray_stats`` and the resulting mapping is merged into ``memory_stats`` in place.

    Args:
        memory_stats: Dictionary to update with the collected statistics.

    Yields:
        The same ``memory_stats`` dictionary that was passed in.
    """
    with TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        capture_fp = scratch_dir / ".memray"
        report_fp = scratch_dir / "memray_stats.json"

        try:
            with Tracker(capture_fp, follow_fork=True):
                yield memory_stats
        finally:
            # Summarize inside the temporary directory's lifetime; the capture file is deleted afterwards.
            collected = get_memray_stats(capture_fp, report_fp)
            memory_stats.update(collected)


class BenchmarkableDataset(Dataset, TimeableMixin, ABC):
    """Abstract base class for datasets whose storage and iteration costs are benchmarked.

    Subclasses provide `_build`, `write`, `read`, `__getitem__` and `collate`. `read` must set `self.N`, the
    number of items, because `__len__` returns it. Memory statistics gathered through `TrackMemoryIn` are
    stored in `self.memory_stats` under the keys ``"read"`` (set on construction) and ``"benchmark"`` (set by
    `benchmark`).
    """

    def __init__(
        self,
        data_dir: Path,
        max_seq_len: int | None = None,
        task_bounds: list[tuple[int, int, int]] | None = None,
    ):
        """Store the configuration and load the dataset from `data_dir`.

        Args:
            data_dir: Directory handed to `read`.
            max_seq_len: Stored on the instance for subclasses to use; not interpreted here.
            task_bounds: Stored on the instance for subclasses to use; not interpreted here.

        Raises:
            AttributeError: If `read` did not set the attribute ``N``.
        """
        super().__init__()
        self.memory_stats = {}

        # TODO(mmd): Need to handle min seq length too.
        self.max_seq_len = max_seq_len
        self.task_bounds = task_bounds
        self.init_from_disk(data_dir)
        if not hasattr(self, "N"):
            raise AttributeError("Dataset must have attribute 'N' after reading data.")

    @classmethod
    @contextmanager
    def TemporaryDataset(cls, data: SAMPLE_DATASET_T, root_dir: Path):
        """Build and write `data` into a temporary directory that exists for the duration of the context.

        Args:
            data: The raw sample data to convert with `_build` and persist with `write`.
            root_dir: Path whose resolved form is used as the temporary directory's name prefix.

        Yields:
            A pair ``(cnstr_kwargs, (prep_times, disk_size, prep_mem_stats))``: the constructor keyword
            arguments returned by `_prep`, the build/write durations, the total size in bytes of the files
            written, and the build/write memory statistics.
        """
        # NOTE(review): the resolved `root_dir` is passed as a name *prefix*, not as `dir=`, so the temporary
        # directory is presumably created alongside `root_dir` rather than inside it -- confirm this is
        # intended before changing it.
        with TemporaryDirectory(prefix=str(root_dir.resolve())) as tmpdir:
            tmp_path = Path(tmpdir)

            cnstr_kwargs, prep_times, prep_mem_stats = cls._prep(data, tmp_path)

            # Sum the size of every file found beneath the temporary directory.
            disk_size = sum((Path(d) / f).stat().st_size for d, _, files in os.walk(tmp_path) for f in files)
            yield cnstr_kwargs, (prep_times, disk_size, prep_mem_stats)

    @classmethod
    @abstractmethod
    def _build(cls, data: SAMPLE_DATASET_T) -> Any:
        """Convert the raw sample data into the object that `write` persists."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def write(cls, data: Any, fp: Path):
        """Persist the object returned by `_build` to `fp`."""
        raise NotImplementedError

    def init_from_disk(self, data_dir: Path):
        """Call `read` on `data_dir` while recording memory statistics under ``memory_stats["read"]``."""
        self.memory_stats["read"] = {}
        with TrackMemoryIn(self.memory_stats["read"]):
            self.read(data_dir)

    @abstractmethod
    @TimeableMixin.TimeAs
    def read(self, fp: Path):
        """Load the dataset from `fp`; implementations must set ``self.N``."""
        raise NotImplementedError

    @classmethod
    def _prep(
        cls, data: SAMPLE_DATASET_T, data_dir: Path
    ) -> tuple[dict[str, Any], dict[str, timedelta], dict[str, dict]]:
        """Build the dataset object from `data` and write it to `data_dir`, measuring both steps.

        Args:
            data: The raw sample data passed to `_build`.
            data_dir: Directory to write into; created (with parents) if it does not exist.

        Returns:
            A triple of: keyword arguments to construct a dataset over `data_dir`; the ``"build"`` and
            ``"write"`` wall-clock durations; and the ``"build"`` and ``"write"`` memory statistics.
        """
        memory_stats = {"build": {}, "write": {}}
        with TrackMemoryIn(memory_stats["build"]):
            st = datetime.now()
            data_obj = cls._build(data)
            build_time = datetime.now() - st

        data_dir.mkdir(parents=True, exist_ok=True)
        with TrackMemoryIn(memory_stats["write"]):
            st = datetime.now()
            cls.write(data_obj, data_dir)
            write_time = datetime.now() - st

        return {"data_dir": data_dir}, {"build": build_time, "write": write_time}, memory_stats

    def __len__(self) -> int:
        """Return ``self.N``, which `read` is required to set."""
        return self.N

    @abstractmethod
    @TimeableMixin.TimeAs
    def __getitem__(self, i):
        """Return the item at index `i`."""
        raise NotImplementedError

    @abstractmethod
    @TimeableMixin.TimeAs
    def collate(self, batch: list[dict]) -> dict:
        """Combine a list of items into a single batch dictionary."""
        raise NotImplementedError

    def dataloader(self, *args, **kwargs) -> DataLoader:
        """Return a `DataLoader` over this dataset that uses `self.collate` as its ``collate_fn``."""
        return DataLoader(self, *args, collate_fn=self.collate, **kwargs)

    @staticmethod
    def tensor_size(a: torch.Tensor) -> int:
        """Return ``sys.getsizeof(a)`` plus the number of elements times the per-element byte size."""
        return sys.getsizeof(a) + torch.numel(a) * a.element_size()

    @TimeableMixin.TimeAs
    def benchmark(self, batch_size: int, num_epochs: int = 1) -> tuple[dict[str, list[int]], list[timedelta]]:
        """Iterate over the dataset in shuffled batches, recording batch sizes and epoch durations.

        Memory statistics for the whole loop are stored under ``memory_stats["benchmark"]``.

        Args:
            batch_size: Batch size handed to the dataloader.
            num_epochs: Number of full passes over the dataloader.

        Returns:
            A pair ``(sizes, epoch_durations)``: for each batch key, the `tensor_size` of its value in every
            batch seen; and the wall-clock duration of each epoch.
        """
        # Fix the torch seed so the shuffled batch order is repeatable across runs.
        torch.manual_seed(1)

        dataloader = self.dataloader(batch_size=batch_size, shuffle=True)

        sizes = defaultdict(list)
        epoch_durations = []

        self.memory_stats["benchmark"] = {}
        with TrackMemoryIn(self.memory_stats["benchmark"]):
            for _ in range(num_epochs):
                epoch_start = datetime.now()
                for B in dataloader:
                    for k, v in B.items():
                        sizes[k].append(BenchmarkableDataset.tensor_size(v))
                epoch_durations.append(datetime.now() - epoch_start)

        return sizes, epoch_durations
78 changes: 78 additions & 0 deletions benchmark/nrt_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pickle
from functools import cached_property
from pathlib import Path

import numpy as np
import torch
from mixins import TimeableMixin
from torch.utils.data import default_collate

from nested_ragged_tensors.ragged_numpy import JointNestedRaggedTensorDict
from sample_dataset_builder import SAMPLE_DATASET_T

from .benchmarkable_dataset import BenchmarkableDataset


class NRTDataset(BenchmarkableDataset):
    """Benchmarkable dataset that stores dynamic data as a `JointNestedRaggedTensorDict`.

    Keys of the raw data starting with ``"static_"`` are stored per item in a pickled list of dictionaries;
    all remaining keys are stored together in a `JointNestedRaggedTensorDict` saved as ``dynamics.nrt``.
    """

    @classmethod
    def _build(cls, data: SAMPLE_DATASET_T) -> tuple[list[dict] | None, JointNestedRaggedTensorDict]:
        """Split `data` into per-item static dictionaries and a ragged tensor dict of the remaining keys.

        The input mapping is left unmodified so the same `data` can be reused by other callers.

        Args:
            data: Mapping from key to per-item values; keys starting with ``"static_"`` are treated as static.

        Returns:
            A pair ``(static_data, dynamic_data)``. ``static_data`` is a list with one dictionary per item, or
            ``None`` when `data` has no static keys.
        """
        static_data = None
        dynamic = {}
        for k in data:
            values = data[k]
            if not k.startswith("static_"):
                dynamic[k] = values
                continue
            if static_data is None:
                # The first static key seen determines how many per-item dictionaries exist.
                static_data = [{} for _ in range(len(values))]
            for i, value in enumerate(values):
                static_data[i][k] = value

        return (static_data, JointNestedRaggedTensorDict(dynamic))

    @cached_property
    def index(self):
        """Return ``(item, start, end)`` triples: `task_bounds` if set and non-empty, else one per item."""
        if hasattr(self, "task_bounds") and self.task_bounds:
            return self.task_bounds
        else:
            return [(i, None, None) for i in range(len(self.dynamic_data))]

    @TimeableMixin.TimeAs
    def read(self, read_dir: Path):
        """Load ``dynamics.nrt`` and ``static_data.pkl`` from `read_dir` and set ``self.N``."""
        self.dynamic_data = JointNestedRaggedTensorDict.load(read_dir / "dynamics.nrt")
        # NOTE: pickle executes arbitrary code on load; only read files produced by `write`.
        with open(read_dir / "static_data.pkl", "rb") as f:
            self.static_data = pickle.load(f)
        self.N = len(self.index)

    @classmethod
    def write(cls, data: tuple[dict, JointNestedRaggedTensorDict], data_dir: Path):
        """Write the ``(static_data, dynamic_data)`` pair from `_build` into `data_dir`."""
        static_data, dynamic_data = data

        dynamics_fp = data_dir / "dynamics.nrt"
        dynamic_data.save(dynamics_fp)

        static_data_fp = data_dir / "static_data.pkl"
        with open(static_data_fp, "wb") as f:
            pickle.dump(static_data, f)

    @TimeableMixin.TimeAs
    def __getitem__(self, i):
        """Return ``(static_data, dynamic_data)`` for index entry `i`.

        The dynamic data is first sliced to the entry's ``[start:end]`` bounds (when either is set) and then,
        if it is longer than `max_seq_len`, cropped to a randomly positioned window of that length.
        """
        i, start, end = self.index[i]
        dynamic_data = self.dynamic_data[i]
        # `_build` yields None when the source data had no static keys; fall back to an empty mapping.
        static_data = self.static_data[i] if self.static_data is not None else {}

        if start is not None or end is not None:
            dynamic_data = dynamic_data[start:end]

        if self.max_seq_len is not None:
            L = len(dynamic_data)
            if L > self.max_seq_len:
                # `randint`'s upper bound is exclusive; the +1 makes the final window (the one ending at the
                # last element) reachable.
                start = np.random.randint(0, L - self.max_seq_len + 1)
                dynamic_data = dynamic_data[start : start + self.max_seq_len]

        return (static_data, dynamic_data)

    @TimeableMixin.TimeAs
    def collate(self, batch: list[tuple[dict, JointNestedRaggedTensorDict]]) -> dict:
        """Stack the dynamic data densely, default-collate the static data, and merge both into one dict."""
        dynamics = [d for _, d in batch]
        collated_dynamics = JointNestedRaggedTensorDict.vstack(dynamics).to_dense()
        collated_dynamics = {k: torch.from_numpy(v) for k, v in collated_dynamics.items()}
        collated_static_data = default_collate([s for s, _ in batch])
        return {**collated_static_data, **collated_dynamics}
Loading

0 comments on commit 1b41e1b

Please sign in to comment.