Merge pull request #30 from mmcdermott/21_split_later
Made splitting only happen at densification.
mmcdermott authored Sep 12, 2024
2 parents 1b41e1b + f2e0466 commit 3f0ed97
Showing 2 changed files with 43 additions and 48 deletions.
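In brief, per the commit title and the diffs below: ragged values were previously split eagerly into one NumPy array per row; they are now kept as a single flat array next to the cumulative `bounds` array, and per-row views are materialized only when a dense tensor is built. A minimal sketch of the two representations (plain NumPy with our own variable names, not the library's API):

```python
import numpy as np

rows = [[1, 2, 3], [4, 5]]

# Before: one array per row, materialized at construction time.
split_vals = [np.array(r, dtype=np.uint8) for r in rows]

# After: a single flat array plus cumulative "bounds" marking row ends.
lengths = np.array([len(r) for r in rows])  # array([3, 2])
bounds = np.cumsum(lengths)                 # array([3, 5])
flat_vals = np.array([x for r in rows for x in r], dtype=np.uint8)

# Per-row views are recoverable on demand, e.g. at densification.
recovered = np.split(flat_vals, bounds[:-1])
assert all((a == b).all() for a, b in zip(recovered, split_vals))
```

Deferring the split avoids allocating one small array per row on every construction, load, slice, and concatenation.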
9 changes: 3 additions & 6 deletions .github/workflows/benchmark.yaml
@@ -5,12 +5,9 @@ on:
   pull_request:
     branches: [main, "release/*", "dev"]
 
-permissions:
-  contents: write
-  deployments: write
-
 jobs:
   benchmark:
+    permissions: write-all
     name: Run benchmark
     runs-on: ubuntu-latest
     steps:
@@ -38,6 +35,6 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           auto-push: true
           # Show alert with commit comment on detecting possible performance regression
-          alert-threshold: "200%"
+          alert-threshold: "150%"
           comment-on-alert: true
-          fail-on-alert: true
+          fail-on-alert: false
82 changes: 40 additions & 42 deletions src/nested_ragged_tensors/ragged_numpy.py
@@ -87,8 +87,7 @@ def __init__(
         >>> print(J) # doctest: +NORMALIZE_WHITESPACE
         JointNestedRaggedTensorDict({'dim1/lengths': array([3, 2]),
             'dim1/bounds': array([3, 5]),
-            'dim1/A': [array([1, 2, 3], dtype=uint8),
-                       array([4, 5], dtype=uint8)],
+            'dim1/A': array([1, 2, 3, 4, 5], dtype=uint8),
             'dim0/B': array([1, 2], dtype=uint8)},
             schema={'A': <class 'numpy.uint8'>, 'B': <class 'numpy.uint8'>},
             pre_raggedified=True)
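Reading the new repr: `dim1/A` is now one flat array whose rows are delimited by `dim1/bounds`, so row `i` spans `A[bounds[i-1]:bounds[i]]` with an implicit leading zero. A quick sketch using the values from the doctest above (plain NumPy, not library code):

```python
import numpy as np

dim1_bounds = np.array([3, 5])
dim1_A = np.array([1, 2, 3, 4, 5], dtype=np.uint8)

# Prepend the implicit 0 to get each row's start index.
starts = np.concatenate(([0], dim1_bounds[:-1]))  # array([0, 3])
rows = [dim1_A[s:e] for s, e in zip(starts, dim1_bounds)]
print(rows)  # [array([1, 2, 3], dtype=uint8), array([4, 5], dtype=uint8)]
```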
@@ -253,10 +252,9 @@ def _initialize_tensors(self, tensors: dict[str, list[NESTED_NUM_LIST] | NESTED_
             except TypeError as e:
                 raise ValueError(f"Failed to parse {k} as a nested list of numbers!") from e
 
+            flat_vals = list(itertools.chain.from_iterable(vals))
             if k not in self.schema:
-                self.schema[k] = self._infer_dtype(list(itertools.chain.from_iterable(vals)))
-
-            vals = [np.array(v, dtype=self.schema[k]) for v in vals]
+                self.schema[k] = self._infer_dtype(flat_vals)
 
             dim_str = "dim0"
             for i, L in enumerate(lengths):
@@ -270,7 +268,7 @@ def _initialize_tensors(self, tensors: dict[str, list[NESTED_NUM_LIST] | NESTED_
                 self.tensors[lengths_key] = L
                 self.tensors[f"{dim_str}/bounds"] = np.cumsum(L, axis=0)
 
-            self.tensors[f"{dim_str}/{k}"] = vals
+            self.tensors[f"{dim_str}/{k}"] = np.array(flat_vals, dtype=self.schema[k])
 
     def save(self, fp: Path):
         """Saves the tensor to a file. See `JointNestedRaggedTensorDict.load` for examples.
@@ -319,16 +317,9 @@ def load(cls, fp: Path) -> JointNestedRaggedTensorDict:
         tensors = {}
         schema = {}
         for k, v in flat_vals_tensors.items():
-            if cls._is_meta_key(k):
-                tensors[k] = v
-            else:
-                schema[k] = v.dtype
-                dim_str = k.split("/")[0]
-                if dim_str == "dim0":
-                    tensors[k] = v
-                else:
-                    bounds = flat_vals_tensors[f"{dim_str}/bounds"]
-                    tensors[k] = np.split(v, bounds[:-1])
+            tensors[k] = v
+            if not cls._is_meta_key(k):
+                schema[k] = v.dtype
 
         return cls(tensors, schema=schema, pre_raggedified=True)
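With values already stored flat on disk, `load` becomes a passthrough plus dtype bookkeeping. A self-contained sketch of the simplified loop; our `is_meta_key` stand-in assumes that `lengths`/`bounds` are the meta keys, which the real `_is_meta_key` may define differently:

```python
import numpy as np

def is_meta_key(k: str) -> bool:
    # Assumption: structural keys are metadata, value keys are not.
    return k.split("/")[1] in ("lengths", "bounds")

flat_vals_tensors = {
    "dim1/lengths": np.array([3, 2]),
    "dim1/bounds": np.array([3, 5]),
    "dim1/A": np.array([1, 2, 3, 4, 5], dtype=np.uint8),
}

tensors, schema = {}, {}
for k, v in flat_vals_tensors.items():
    tensors[k] = v                # every tensor passes straight through
    if not is_meta_key(k):
        schema[k] = v.dtype       # only value keys contribute to the schema

print(schema)  # {'dim1/A': dtype('uint8')}
```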

@@ -531,10 +522,7 @@ def __getitem__(self, idx: int | slice | np.ndarray):
                         continue
 
                     new_key = f"dim{dim_int - 1}/{key}"
-                    if dim_int == 1:
-                        out_tensors[new_key] = T[0]
-                    else:
-                        out_tensors[new_key] = T
+                    out_tensors[new_key] = T
 
                 return self.__class__(out_tensors, schema=self.schema, pre_raggedified=True)
             case slice() as S:
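Integer indexing (the hunk above) no longer needs the `T[0]` unwrap: after selecting row `i`'s span, each value tensor is already a flat array and is re-keyed one dim lower unchanged. A standalone sketch of that arithmetic, with illustrative names, a single key, and one nesting level:

```python
import numpy as np

tensors = {"dim1/A": np.array([1, 2, 3, 4, 5], dtype=np.uint8)}
bounds = np.array([3, 5])
i = 1

start = 0 if i == 0 else bounds[i - 1]
T = tensors["dim1/A"][start:bounds[i]]  # row i's values, still a flat array
out_tensors = {"dim0/A": T}             # key shifts down by one dim, value unchanged
print(out_tensors)  # {'dim0/A': array([4, 5], dtype=uint8)}
```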
@@ -552,20 +540,28 @@ def __getitem__(self, idx: int | slice | np.ndarray):
                     L = self.tensors[f"dim{dim}/lengths"]
                     out_tensors[f"dim{dim}/lengths"] = L[st_i:end_i]
 
-                    for key in self.keys_at_dim(dim):
-                        out_tensors[f"dim{dim}/{key}"] = self.tensors[f"dim{dim}/{key}"][st_i:end_i]
-
                     B = self.tensors[f"dim{dim}/bounds"]
 
                     if st_i == 0:
                         offset = 0
                     else:
                         offset = B[st_i - 1]
 
-                    out_tensors[f"dim{dim}/bounds"] = B[st_i:end_i] - offset
+                    B = B[st_i:end_i] - offset
+
+                    out_tensors[f"dim{dim}/bounds"] = B
 
-                    st_i = 0 if st_i == 0 else B[st_i - 1]
-                    end_i = B[end_i - 1] if end_i is not None else B[-1]
+                    vals_start = offset
+                    if len(B) == 0:
+                        vals_end = offset
+                    else:
+                        vals_end = B[-1] + offset
+
+                    for key in self.keys_at_dim(dim):
+                        out_tensors[f"dim{dim}/{key}"] = self.tensors[f"dim{dim}/{key}"][vals_start:vals_end]
+
+                    st_i = offset
+                    end_i = vals_end
 
                 return JointNestedRaggedTensorDict(out_tensors, schema=self.schema, pre_raggedified=True)
             case _:
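The slice arithmetic above in one standalone sketch: `offset` is the flat index where the slice starts, the kept `bounds` are re-based to zero, and the value tensors are sliced once as `[vals_start:vals_end]` instead of being split per row. Variable names follow the diff; the real method loops over every nesting dim:

```python
import numpy as np

lengths = np.array([2, 3, 1, 2])
bounds = np.cumsum(lengths)                    # array([2, 5, 6, 8])
flat_vals = np.arange(8)

st_i, end_i = 1, 3                             # keep rows 1 and 2

offset = 0 if st_i == 0 else bounds[st_i - 1]  # 2: flat index where row st_i starts
B = bounds[st_i:end_i] - offset                # array([3, 4]): bounds re-based to 0

vals_start = offset
vals_end = offset if len(B) == 0 else B[-1] + offset  # 6

print(B)                               # bounds for the sliced dict
print(flat_vals[vals_start:vals_end])  # array([2, 3, 4, 5]): rows 1-2, still flat
```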
@@ -662,6 +658,7 @@ def to_dense(self, padding_side: str = "right") -> dict[str, np.array]:
             ...
         ValueError: padding_side must be 'left' or 'right'; got 'up'
         """
+
         out = {key: self.tensors[f"dim0/{key}"] for key in self.keys_at_dim(0)}
 
         shape = [len(self)]
@@ -705,14 +702,16 @@ def pad_slice(ln: int, max_ln: int) -> slice:
                 for idx, ln in zip(indices, L):
                     out[f"dim{dim}/mask"][idx + (pad_slice(ln, max_ln),)] = True
 
+            bounds = self.tensors[f"dim{dim}/bounds"]
             for key in self.keys_at_dim(dim):
-                slice_vals = self.tensors[f"dim{dim}/{key}"]
-                if not slice_vals:
+                if len(self.tensors[f"dim{dim}/{key}"]) == 0:
                     continue
 
-                out[key] = np.zeros(shape=tuple(shape), dtype=slice_vals[0].dtype)
-                for idx, ln, vs in zip(indices, L, slice_vals):
-                    out[key][idx + (pad_slice(ln, max_ln),)] = vs
+                out[key] = np.zeros(shape=tuple(shape), dtype=self.tensors[f"dim{dim}/{key}"].dtype)
+                st = 0
+                for idx, ln, b in zip(indices, L, bounds):
+                    out[key][idx + (pad_slice(ln, max_ln),)] = self.tensors[f"dim{dim}/{key}"][st:b]
+                    st = b
 
         return out
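Densification is now where splitting effectively happens: consecutive `[st:b)` spans of the flat value array are copied into a zero-padded dense matrix, so `np.split` is never needed. A runnable sketch of right-padding for the running two-row example, simplified to one key and one dim:

```python
import numpy as np

lengths = np.array([3, 2])
bounds = np.cumsum(lengths)  # array([3, 5])
flat_vals = np.array([1, 2, 3, 4, 5], dtype=np.uint8)

max_ln = lengths.max()
out = np.zeros((len(lengths), max_ln), dtype=flat_vals.dtype)

st = 0
for idx, (ln, b) in enumerate(zip(lengths, bounds)):
    out[idx, :ln] = flat_vals[st:b]  # copy one row's span into the padded matrix
    st = b

print(out)
# [[1 2 3]
#  [4 5 0]]
```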

@@ -761,7 +760,7 @@ def unsqueeze(self, dim: int) -> JointNestedRaggedTensorDict:
         out_tensors = {}
 
         for key in self.keys_at_dim(0):
-            out_tensors[f"dim1/{key}"] = [self.tensors[f"dim0/{key}"]]
+            out_tensors[f"dim1/{key}"] = self.tensors[f"dim0/{key}"]
 
         if self.keys_at_dim(0):
             lengths = np.array([len(self.tensors[f"dim0/{key}"])])
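The `unsqueeze` change follows the same pattern: the dim0 value array becomes the dim1 values of a single outer row, so it is stored as-is rather than wrapped in a one-element list. A sketch of the resulting tensors; the exact keys for the new lengths/bounds are our assumption, following the repr convention shown earlier:

```python
import numpy as np

dim0_B = np.array([1, 2], dtype=np.uint8)

out_tensors = {
    "dim1/B": dim0_B,                         # stored as-is, no [...] wrapper
    "dim1/lengths": np.array([len(dim0_B)]),  # one outer row (key name assumed)
    "dim1/bounds": np.array([len(dim0_B)]),   # cumsum of one length (key name assumed)
}
print(out_tensors)
```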
@@ -981,16 +980,15 @@ def concatenate(cls, tensors: list) -> JointNestedRaggedTensorDict:
                     out_tensors[bounds_key] = np.concatenate(
                         (out_tensors[bounds_key], T.tensors[bounds_key] + last_bound)
                     )
-
-                    for key in out_keys_at_dim[dim]:
-                        out_tensors[f"dim{dim}/{key}"] = (
-                            out_tensors[f"dim{dim}/{key}"] + T.tensors[f"dim{dim}/{key}"]
-                        )
-                elif dim == 0:
-                    for key in out_keys_at_dim[dim]:
-                        out_tensors[f"dim{dim}/{key}"] = np.concatenate(
-                            (out_tensors[f"dim{dim}/{key}"], T.tensors[f"dim{dim}/{key}"]), axis=0
-                        )
+                for key in out_keys_at_dim[dim]:
+                    k_str = f"dim{dim}/{key}"
+                    try:
+                        out_tensors[k_str] = np.concatenate((out_tensors[k_str], T.tensors[k_str]), axis=0)
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to concatenate {key} at dim {dim} with args "
+                            f"{out_tensors[k_str]} and {T.tensors[k_str]}"
+                        ) from e
         return cls(out_tensors, pre_raggedified=True, schema=out_schema)
 
     @classmethod
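Concatenation now treats every dim uniformly: flat value arrays simply append, while `bounds` (absolute flat indices) must be shifted by the previous dict's last bound, as the context lines in the hunk above show. A standalone sketch with illustrative names:

```python
import numpy as np

vals_a, bounds_a = np.array([1, 2, 3, 4, 5]), np.array([3, 5])
vals_b, bounds_b = np.array([6, 7, 8]), np.array([1, 3])

last_bound = bounds_a[-1]                                   # 5
vals = np.concatenate((vals_a, vals_b), axis=0)             # flat values just append
bounds = np.concatenate((bounds_a, bounds_b + last_bound))  # array([3, 5, 6, 8])

print(np.split(vals, bounds[:-1]))
# [array([1, 2, 3]), array([4, 5]), array([6]), array([7, 8])]
```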
@@ -1104,7 +1102,7 @@ def load_slice(cls, fp: Path, idx: int | slice | np.ndarray) -> JointNestedRagge
                     for k in keys_by_dim[dim]:
                         v = f.get_slice(k)[vals_start:vals_end]
                         schema[k] = v.dtype
-                        tensors[k] = np.split(v, B[:-1])
+                        tensors[k] = v  # np.split(v, B[:-1])
 
                 st_i = 0 if st_i == 0 else offset
                 end_i = B[-1] + offset
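In `load_slice`, deferring the split means a row range is one contiguous read of the flat value tensor, with `bounds` giving the start and end offsets; the old `np.split` call is left commented out in the diff. A hedged round-trip sketch using `safetensors` (the file name and layout here are illustrative only, not the library's on-disk format):

```python
import numpy as np
from safetensors import safe_open
from safetensors.numpy import save_file

save_file(
    {"dim1/bounds": np.array([3, 5, 6]), "dim1/A": np.array([1, 2, 3, 4, 5, 6])},
    "example.safetensors",
)

st_i, end_i = 1, 3  # load rows 1 and 2
with safe_open("example.safetensors", framework="np") as f:
    bounds = f.get_tensor("dim1/bounds")                # small metadata read
    offset = 0 if st_i == 0 else int(bounds[st_i - 1])
    vals_end = int(bounds[end_i - 1])
    v = f.get_slice("dim1/A")[offset:vals_end]          # one contiguous read, kept flat

print(v)  # [4 5 6]
```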
