Skip to content

Commit

Permalink
Add scripts for benchmarks using the current API (#306)
Browse files Browse the repository at this point in the history
* Add environment for running and generating benchmarks

Signed-off-by: Mihai Maruseac <[email protected]>

* Add generator for models

Signed-off-by: Mihai Maruseac <[email protected]>

* Write in chunks

Signed-off-by: Mihai Maruseac <[email protected]>

* Add matrix, expand description

Signed-off-by: Mihai Maruseac <[email protected]>

* Add script for serialization benchmark

Signed-off-by: Mihai Maruseac <[email protected]>

* Proper capitalization of help messages

Signed-off-by: Mihai Maruseac <[email protected]>

* Add benchmark runner

Signed-off-by: Mihai Maruseac <[email protected]>

* Use numpy to generate random data.

We go from

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000"
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     10.290 s ±  0.140 s    [User: 10.197 s, System: 0.092 s]
  Range (min … max):   10.149 s … 10.541 s    10 runs
```

to

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000" --show-output
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     381.1 ms ±  13.9 ms    [User: 512.9 ms, System: 633.1 ms]
  Range (min … max):   365.5 ms … 412.1 ms    10 runs
```

Signed-off-by: Mihai Maruseac <[email protected]>

* Fix typos

Signed-off-by: Mihai Maruseac <[email protected]>

* Document all functions

Signed-off-by: Mihai Maruseac <[email protected]>

* Handle review

Signed-off-by: Mihai Maruseac <[email protected]>

* Handle review

Signed-off-by: Mihai Maruseac <[email protected]>

* Use id but with comment

Signed-off-by: Mihai Maruseac <[email protected]>

* Undo de-indent added by editor

Signed-off-by: Mihai Maruseac <[email protected]>

---------

Signed-off-by: Mihai Maruseac <[email protected]>
  • Loading branch information
mihaimaruseac authored Sep 11, 2024
1 parent fcf9f67 commit 74dedf9
Show file tree
Hide file tree
Showing 3 changed files with 491 additions and 1 deletion.
227 changes: 227 additions & 0 deletions benchmarks/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Script for generating benchmark data."""

import argparse
import itertools
import pathlib

import numpy as np


def create_file_of_given_size(path: str, size: int) -> None:
    """Writes a random file at the given path with given size.

    The contents are drawn from numpy's global random state in chunks, so
    that arbitrarily large files can be produced without holding them in
    memory.

    Args:
        path: Path to a file to write to. Parents are created if needed.
        size: Number of bytes to generate and write to file. Must not be
          negative.

    Raises:
        ValueError: If `size` is negative.
    """
    # Python's `%` is non-negative for a positive divisor, so without this
    # check a negative size would silently write `size % chunk_size` bytes.
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    file_path = pathlib.Path(path)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    chunk_size = 8192
    num_chunks, remainder = divmod(size, chunk_size)

    with file_path.open("wb") as f:
        for _ in range(num_chunks):
            s = np.random.randint(0, 256, chunk_size, dtype=np.uint8)
            f.write(s.tobytes())

        # Trailing partial chunk, when size is not a multiple of chunk_size.
        if remainder:
            s = np.random.randint(0, 256, remainder, dtype=np.uint8)
            f.write(s.tobytes())


def generate_file_sizes(
total_size: int, count: int, weights: list[int] | None = None
) -> list[int]:
"""Generate file sizes splitting a total size into multiple files.
If weights is missing (or made of equal elements), the resulting files have
equal sizes. Otherwise, the sizes are proportional to the weights.
The weights are used in a cycle until all files are accounted for.
Args:
total_size: Total size to split into files.
count: Number of files to generate.
weights: Optional weights to use when splitting.
Returns:
The list of file sizes to generate.
"""
if weights is None:
weights = [1]

weights = list(itertools.islice(itertools.cycle(weights), count))
total_weight = sum(weights)
file_sizes = [int(total_size * w / total_weight) for w in weights]
file_sizes[-1] = total_size - sum(file_sizes[:-1])
return file_sizes


def generate_file(args: argparse.Namespace):
    """Generates a random model consisting of one single file.

    Args:
        args: Parsed command line arguments. `args.root` is the path of the
          file to write and `args.size` is its size in bytes.
    """
    create_file_of_given_size(path=args.root, size=args.size)


def generate_dir(args: argparse.Namespace):
    """Generates a random model as N files in a directory.

    The files are named `f0`, `f1`, ... and are placed directly under
    `args.root`.

    Args:
        args: Parsed command line arguments. Uses `args.root` (directory),
          `args.size` (total bytes), `args.n` (number of files) and `args.w`
          (optional weights).
    """
    file_sizes = generate_file_sizes(args.size, args.n, args.w)
    for index, file_size in enumerate(file_sizes):
        create_file_of_given_size(f"{args.root}/f{index}", file_size)


def generate_matrix(args: argparse.Namespace):
    """Generates a random model as M directories with N files each.

    The total size is split evenly between the directories `d0`, `d1`, ...
    under `args.root`. When the size is not a multiple of M, the last
    directory also receives the remainder, so the whole model always adds
    up to exactly `args.size` bytes.

    Args:
        args: Parsed command line arguments. Uses `args.root` (directory),
          `args.size` (total bytes), `args.m` (number of directories),
          `args.n` (files per directory) and `args.w` (optional weights).

    Raises:
        ValueError: If `args.m` is less than 1.
    """
    if args.m < 1:
        raise ValueError(
            f"number of directories must be at least 1, got {args.m}"
        )

    per_dir_size, remainder = divmod(args.size, args.m)
    exact = remainder == 0
    sizes = generate_file_sizes(per_dir_size, args.n, args.w)
    # Directories holding exactly `per_dir_size` bytes; when the split is
    # not exact the last directory is generated separately below.
    regular_dirs = args.m if exact else args.m - 1

    for i in range(regular_dirs):
        for j, sz in enumerate(sizes):
            create_file_of_given_size(f"{args.root}/d{i}/f{j}", sz)

    if not exact:
        # Use an explicit index instead of relying on the loop variable
        # still being bound after the loop above.
        last_dir = regular_dirs
        leftover = per_dir_size + remainder
        last_sizes = generate_file_sizes(leftover, args.n, args.w)
        for j, sz in enumerate(last_sizes):
            create_file_of_given_size(f"{args.root}/d{last_dir}/f{j}", sz)


def generate_nested(args: argparse.Namespace):
    """Generates a random model as N files in a directory with M ancestors.

    The files end up in `<root>/d0/d1/.../d<M-1>/`, named `f0`, `f1`, ...

    Args:
        args: Parsed command line arguments. Uses `args.root` (directory),
          `args.size` (total bytes), `args.m` (nesting depth), `args.n`
          (number of files) and `args.w` (optional weights).
    """
    nesting = "".join(f"/d{level}" for level in range(args.m))
    leaf_dir = f"{args.root}{nesting}"

    file_sizes = generate_file_sizes(args.size, args.n, args.w)
    for index, file_size in enumerate(file_sizes):
        create_file_of_given_size(f"{leaf_dir}/f{index}", file_size)


def add_size_arguments(
    parser: argparse.ArgumentParser, multiple_files: bool = True
) -> None:
    """Adds the size related arguments to a subparser.

    We need to pass in the size of the model to generate. If the model has
    multiple files we support an additional repeated argument (`-w`) to
    specify what relative sizes these files should have (instead of being
    all equal).

    Args:
        parser: The parser to enhance.
        multiple_files: Whether the generator generates multiple files.
    """
    parser.add_argument("size", help="size of the model", type=int)

    if multiple_files:
        parser.add_argument(
            "-w",
            help="optional weights for model file sizes to generate",
            nargs="+",
            type=int,
        )


def add_count_arguments(
    parser: argparse.ArgumentParser, with_dirs: bool = True
) -> None:
    """Adds the count related arguments to a subparser.

    Every multi-file shape needs the number of files (`-n`). Shapes that
    also use directories additionally need their number (`-m`).

    Args:
        parser: The parser to enhance.
        with_dirs: Also add argument to generate the directories.
    """
    parser.add_argument("-n", type=int, required=True, help="number of files")

    if not with_dirs:
        return

    parser.add_argument(
        "-m", type=int, required=True, help="number of directories"
    )


def add_root_argument(parser: argparse.ArgumentParser) -> None:
    """Adds the mandatory `--root` argument naming the root of the model.

    Args:
        parser: The parser to enhance.
    """
    parser.add_argument("--root", required=True, help="model root path")


def build_parser() -> argparse.ArgumentParser:
    """Builds the command line parser for the generator.

    The parser has one sub-command per model shape (`file`, `dir`, `matrix`
    and `nested`). Each sub-command stores the function that generates that
    shape in the `func` attribute of the parsed arguments, and the name of
    the selected sub-command in `shape`.

    A sub-command is mandatory: every shape needs at least `--root` and a
    size, so there is no invocation without a sub-command that could work.
    Requiring it makes argparse print a usage error instead of the script
    failing later on a missing attribute.

    Returns:
        The fully configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="generate benchmark data for model signing"
    )
    subparsers = parser.add_subparsers(
        title="Model shapes", dest="shape", required=True
    )

    parser_file = subparsers.add_parser(
        "file", help="generate all data in a single file"
    )
    add_root_argument(parser_file)
    add_size_arguments(parser_file, multiple_files=False)
    parser_file.set_defaults(func=generate_file)

    parser_dir = subparsers.add_parser(
        "dir", help="generate data split into N files in a single directory"
    )
    add_root_argument(parser_dir)
    add_size_arguments(parser_dir)
    add_count_arguments(parser_dir, with_dirs=False)
    parser_dir.set_defaults(func=generate_dir)

    parser_matrix = subparsers.add_parser(
        "matrix", help="generate data split into N files in M directories"
    )
    add_root_argument(parser_matrix)
    add_size_arguments(parser_matrix)
    add_count_arguments(parser_matrix)
    parser_matrix.set_defaults(func=generate_matrix)

    parser_nested = subparsers.add_parser(
        "nested",
        help="generate data split into N files in a directory nested M levels",
    )
    add_root_argument(parser_nested)
    add_size_arguments(parser_nested)
    add_count_arguments(parser_nested)
    parser_nested.set_defaults(func=generate_nested)

    return parser


if __name__ == "__main__":
    # Seed numpy's global generator so that repeated runs with the same
    # arguments produce the same data.
    np.random.seed(42)
    cli_parser = build_parser()
    cli_args = cli_parser.parse_args()
    # Each sub-command registers its generator function as `func`.
    cli_args.func(cli_args)
Loading

0 comments on commit 74dedf9

Please sign in to comment.