Merge pull request #29 from mmcdermott/23_repeatable_benchmark

Major revisions to the utility and generality of the benchmarking code.
mmcdermott · Sep 12, 2024 · 1b41e1b · 1b41e1b
2 parents d239733 + 0f1dbb5
commit 1b41e1b
Show file tree

Hide file tree

Showing 31 changed files with 949 additions and 564 deletions.
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
@@ -0,0 +1,43 @@
+name: Benchmark Performance
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main, "release/*", "dev"]
+
+permissions:
+  contents: write
+  deployments: write
+
+jobs:
+  benchmark:
+    name: Run benchmark
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+
+      - name: Install packages
+        run: |
+          pip install -e .[tests,benchmarks]
+          pip install sample_dataset_builder/
+
+      - name: Run benchmark
+        run: |
+          pytest benchmark/run.py
+
+      - name: Store benchmark result
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: Benchmark
+          tool: "customSmallerIsBetter"
+          output-file-path: benchmark/outputs/output_32_512_5.json
+          # NOTE: this currently uses the default GITHUB_TOKEN. If pushes made with it do not trigger a GitHub Pages build, use a personal access token instead; see https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          # Show alert with commit comment on detecting possible performance regression
+          alert-threshold: "200%"
+          comment-on-alert: true
+          fail-on-alert: true
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -34,7 +34,7 @@ jobs:
       #----------------------------------------------
       - name: Run tests
         run: |
-          pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=performance_tests
+          pytest -v --doctest-modules --cov=src --junitxml=junit.xml -s --ignore=benchmark --ignore=sample_dataset_builder
 
       - name: Upload coverage to Codecov
         uses: codecov/[email protected]

diff --git a/.gitignore b/.gitignore
@@ -160,3 +160,8 @@ cython_debug/
 #.idea/
 
 performance_tests_outputs/
+sample_dataset/*.log
+sample_dataset/memray_stats.json
+sample_dataset/.memray
+sample_dataset/.hydra/*
+benchmark/outputs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 default_language_version:
   python: python3
 
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+exclude: "sample_dataset/dataset.pkl"
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -38,6 +38,7 @@ repos:
     rev: v2.2.0
     hooks:
       - id: autoflake
+        args: [--in-place, --remove-all-unused-imports]
 
   # python upgrading syntax to newer version
   - repo: https://github.com/asottile/pyupgrade

diff --git a/README.md b/README.md
@@ -322,10 +322,12 @@ which leverage Hugging Face's `safetensors` library for internal manipulation.
 
 ## Performance Testing
 
-Run `python performance_tests/test_times.py` for a comparison across several strategies of using these data.
+Benchmark results over time, covering various aspects of an approximate PyTorch dataset built with this repo, can be seen at
+[https://mmcdermott.github.io/nested_ragged_tensors/dev/bench/](https://mmcdermott.github.io/nested_ragged_tensors/dev/bench/).
 
-For example, to use a configuration of nested events and codes that is similar to the MIMIC-IV dataset, you
-can run the below code (note this takes a lot of memory for the "dense" view of the data).
+In older commits (see the GitHub history for more details), you could also run `python performance_tests/test_times.py` to compare several strategies for using these data. A
+more user-friendly version of this feature is planned for the future, in concert with the performance
+tracking over time documented at the above link. For reference, an example invocation from those older commits was:
 
 ```python
 python performance_tests/test_times.py dataset_spec=mimic dataset_spec.num_patients=1250 dataset_spec.max_events_per_item=256 batch_size=64

diff --git a/benchmark/README.md b/benchmark/README.md
@@ -0,0 +1,6 @@
+# Performance Benchmark
+
+This directory contains a repeatable performance benchmark, run on a sample dataset, for the nested ragged tensor (NRT) code.
+
+If a sample dataset does not exist, you can generate it using the `sample_dataset_builder` package included in
+this repo.
diff --git a/performance_tests/__init__.py → benchmark/__init__.py b/performance_tests/__init__.py → benchmark/__init__.py
diff --git a/benchmark/benchmarkable_dataset.py b/benchmark/benchmarkable_dataset.py
@@ -0,0 +1,150 @@
+import logging
+import os
+import sys
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from contextlib import contextmanager
+from datetime import datetime, timedelta
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any
+
+import torch
+from memray import Tracker
+from mixins import TimeableMixin
+from torch.utils.data import DataLoader, Dataset
+
+from sample_dataset_builder import SAMPLE_DATASET_T
+
+logger = logging.getLogger(__name__)
+
+import json
+import subprocess
+
+
+def get_memray_stats(memray_tracker_fp: Path, memray_stats_fp: Path) -> dict:
+    memray_stats_cmd = f"memray stats {memray_tracker_fp} --json -o {memray_stats_fp} -f"
+    subprocess.run(memray_stats_cmd, shell=True, check=True, capture_output=True)
+    try:
+        return json.loads(memray_stats_fp.read_text())
+    except Exception as e:
+        raise ValueError(f"Failed to parse memray stats file at {memray_stats_fp}") from e
+
+
+@contextmanager
+def TrackMemoryIn(memory_stats: dict):
+    with TemporaryDirectory() as tmpdir:
+        memray_fp = Path(tmpdir) / ".memray"
+        memray_stats_fp = Path(tmpdir) / "memray_stats.json"
+
+        try:
+            with Tracker(memray_fp, follow_fork=True):
+                yield memory_stats
+        finally:
+            memory_stats.update(get_memray_stats(memray_fp, memray_stats_fp))
+
+
+class BenchmarkableDataset(Dataset, TimeableMixin, ABC):
+    def __init__(
+        self,
+        data_dir: Path,
+        max_seq_len: int | None = None,
+        task_bounds: list[tuple[int, int, int]] | None = None,
+    ):
+        super().__init__()
+        self.memory_stats = {}
+
+        # TODO(mmd): Need to handle min seq length too.
+        self.max_seq_len = max_seq_len
+        self.task_bounds = task_bounds
+        self.init_from_disk(data_dir)
+        if not hasattr(self, "N"):
+            raise AttributeError("Dataset must have attribute 'N' after reading data.")
+
+    @classmethod
+    @contextmanager
+    def TemporaryDataset(cls, data: SAMPLE_DATASET_T, root_dir: Path):
+        with TemporaryDirectory(prefix=str(root_dir.resolve())) as tmpdir:
+            tmpdir = Path(tmpdir)
+
+            cnstr_kwargs, prep_times, prep_mem_stats = cls._prep(data, tmpdir)
+
+            disk_size = sum((Path(d) / f).stat().st_size for d, _, files in os.walk(tmpdir) for f in files)
+            yield cnstr_kwargs, (prep_times, disk_size, prep_mem_stats)
+
+    @classmethod
+    @abstractmethod
+    def _build(cls, data: SAMPLE_DATASET_T) -> Any:
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def write(cls, data: Any, fp: Path):
+        raise NotImplementedError
+
+    def init_from_disk(self, data_dir: Path):
+        self.memory_stats["read"] = {}
+        with TrackMemoryIn(self.memory_stats["read"]):
+            self.read(data_dir)
+
+    @abstractmethod
+    @TimeableMixin.TimeAs
+    def read(self, fp: Path):
+        raise NotImplementedError
+
+    @classmethod
+    def _prep(cls, data: SAMPLE_DATASET_T, data_dir: Path) -> tuple[dict[str, Any], dict[str, timedelta]]:
+        """This should return keyword args to build a dataset leveraging the storage directory."""
+        memory_stats = {"build": {}, "write": {}}
+        with TrackMemoryIn(memory_stats["build"]):
+            st = datetime.now()
+            data_obj = cls._build(data)
+            build_time = datetime.now() - st
+
+        data_dir.mkdir(parents=True, exist_ok=True)
+        with TrackMemoryIn(memory_stats["write"]):
+            st = datetime.now()
+            cls.write(data_obj, data_dir)
+            write_time = datetime.now() - st
+
+        return {"data_dir": data_dir}, {"build": build_time, "write": write_time}, memory_stats
+
+    def __len__(self) -> int:
+        return self.N
+
+    @abstractmethod
+    @TimeableMixin.TimeAs
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    @abstractmethod
+    @TimeableMixin.TimeAs
+    def collate(self, batch: list[dict]) -> dict:
+        raise NotImplementedError
+
+    def dataloader(self, *args, **kwargs) -> DataLoader:
+        return DataLoader(self, *args, collate_fn=self.collate, **kwargs)
+
+    @staticmethod
+    def tensor_size(a: torch.Tensor) -> int:
+        return sys.getsizeof(a) + torch.numel(a) * a.element_size()
+
+    @TimeableMixin.TimeAs
+    def benchmark(self, batch_size: int, num_epochs: int = 1) -> tuple[dict[str, list[int]], list[timedelta]]:
+        torch.manual_seed(1)
+
+        dataloader = self.dataloader(batch_size=batch_size, shuffle=True)
+
+        sizes = defaultdict(list)
+        epoch_durations = []
+
+        self.memory_stats["benchmark"] = {}
+        with TrackMemoryIn(self.memory_stats["benchmark"]):
+            for epoch in range(num_epochs):
+                epoch_start = datetime.now()
+                for B in dataloader:
+                    for k, v in B.items():
+                        sizes[k].append(BenchmarkableDataset.tensor_size(v))
+                epoch_durations.append(datetime.now() - epoch_start)
+
+        return sizes, epoch_durations
diff --git a/benchmark/nrt_dataset.py b/benchmark/nrt_dataset.py
@@ -0,0 +1,78 @@
+import pickle
+from functools import cached_property
+from pathlib import Path
+
+import numpy as np
+import torch
+from mixins import TimeableMixin
+from torch.utils.data import default_collate
+
+from nested_ragged_tensors.ragged_numpy import JointNestedRaggedTensorDict
+from sample_dataset_builder import SAMPLE_DATASET_T
+
+from .benchmarkable_dataset import BenchmarkableDataset
+
+
+class NRTDataset(BenchmarkableDataset):
+    @classmethod
+    def _build(cls, data: SAMPLE_DATASET_T) -> tuple[dict, JointNestedRaggedTensorDict]:
+        static_data = None
+        static_keys = [k for k in data if k.startswith("static_")]
+        for k in static_keys:
+            S = data.pop(k)
+            if static_data is None:
+                static_data = [{} for _ in range(len(S))]
+            for i in range(len(S)):
+                static_data[i][k] = S[i]
+
+        return (static_data, JointNestedRaggedTensorDict(data))
+
+    @cached_property
+    def index(self):
+        if hasattr(self, "task_bounds") and self.task_bounds:
+            return self.task_bounds
+        else:
+            return [(i, None, None) for i in range(len(self.dynamic_data))]
+
+    @TimeableMixin.TimeAs
+    def read(self, read_dir: Path):
+        self.dynamic_data = JointNestedRaggedTensorDict.load(read_dir / "dynamics.nrt")
+        with open(read_dir / "static_data.pkl", "rb") as f:
+            self.static_data = pickle.load(f)
+        self.N = len(self.index)
+
+    @classmethod
+    def write(cls, data: tuple[dict, JointNestedRaggedTensorDict], data_dir: Path):
+        static_data, dynamic_data = data
+
+        dynamics_fp = data_dir / "dynamics.nrt"
+        dynamic_data.save(dynamics_fp)
+
+        static_data_fp = data_dir / "static_data.pkl"
+        with open(static_data_fp, "wb") as f:
+            pickle.dump(static_data, f)
+
+    @TimeableMixin.TimeAs
+    def __getitem__(self, i):
+        i, start, end = self.index[i]
+        dynamic_data = self.dynamic_data[i]
+        static_data = self.static_data[i]
+
+        if start is not None or end is not None:
+            dynamic_data = dynamic_data[start:end]
+
+        if self.max_seq_len is not None:
+            L = len(dynamic_data)
+            if L > self.max_seq_len:
+                start = np.random.randint(0, L - self.max_seq_len)
+                dynamic_data = dynamic_data[start : start + self.max_seq_len]
+
+        return (static_data, dynamic_data)
+
+    @TimeableMixin.TimeAs
+    def collate(self, batch: list[tuple[dict, JointNestedRaggedTensorDict]]) -> dict:
+        dynamics = [d for _, d in batch]
+        collated_dynamics = JointNestedRaggedTensorDict.vstack(dynamics).to_dense()
+        collated_dynamics = {k: torch.from_numpy(v) for k, v in collated_dynamics.items()}
+        collated_static_data = default_collate([s for s, _ in batch])
+        return {**collated_static_data, **collated_dynamics}