From d922975529d2266fbb2f26442d08e539ce24d002 Mon Sep 17 00:00:00 2001 From: Alexander Druz Date: Tue, 26 Sep 2023 15:16:17 +0200 Subject: [PATCH] Test HF dataset in integration tests --- .../core/huggingface_datasource.py | 17 +- scripts/create_hf_dataset.py | 161 ------------------ tests/integration/huggingface/__init__.py | 0 tests/integration/huggingface/conftest.py | 17 ++ tests/integration/huggingface/dataset.py | 146 ++++++++++++++++ tests/integration/huggingface/test_hf.py | 37 ++++ 6 files changed, 214 insertions(+), 164 deletions(-) delete mode 100755 scripts/create_hf_dataset.py create mode 100644 tests/integration/huggingface/__init__.py create mode 100644 tests/integration/huggingface/conftest.py create mode 100644 tests/integration/huggingface/dataset.py create mode 100644 tests/integration/huggingface/test_hf.py diff --git a/renumics/spotlight_plugins/core/huggingface_datasource.py b/renumics/spotlight_plugins/core/huggingface_datasource.py index 0206160c..14e921bf 100644 --- a/renumics/spotlight_plugins/core/huggingface_datasource.py +++ b/renumics/spotlight_plugins/core/huggingface_datasource.py @@ -161,6 +161,15 @@ def get_column_values( if isinstance(feature, datasets.Translation): return np.array([str(value) for value in raw_values]) + if isinstance(feature, datasets.Value): + hf_dtype = feature.dtype + if hf_dtype.startswith("duration"): + return raw_values.to_numpy().astype(int) + if hf_dtype.startswith("time32") or hf_dtype.startswith("time64"): + return raw_values.to_numpy().astype(str) + if hf_dtype.startswith("timestamp[ns"): + return raw_values.to_numpy().astype(int) + return raw_values.to_numpy() def get_column_metadata(self, _: str) -> ColumnMetadata: @@ -187,17 +196,19 @@ def _get_intermediate_dtype(feature: _FeatureType) -> DType: elif hf_dtype.startswith("float"): return float_dtype elif hf_dtype.startswith("time32"): - return datetime_dtype + return str_dtype elif hf_dtype.startswith("time64"): - return datetime_dtype + return str_dtype elif hf_dtype.startswith("timestamp"): + if hf_dtype.startswith("timestamp[ns"): + return int_dtype return datetime_dtype elif hf_dtype.startswith("date32"): return datetime_dtype elif hf_dtype.startswith("date64"): return datetime_dtype elif hf_dtype.startswith("duration"): - return float_dtype + return int_dtype elif hf_dtype.startswith("decimal"): return float_dtype elif hf_dtype == "binary": diff --git a/scripts/create_hf_dataset.py b/scripts/create_hf_dataset.py deleted file mode 100755 index db3006fa..00000000 --- a/scripts/create_hf_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python3 - -""" -This script creates multimodal Hugging Face dataset to test Spotlight on. -""" - -import datasets -import numpy as np -from renumics import spotlight - -from renumics.spotlight import dtypes - - -def random_values( - dtype: dtypes.DType, num_rows: int, optional: bool = False -) -> np.ndarray: - if dtypes.is_bool_dtype(dtype): - values = np.random.randint(0, 2, num_rows, bool) - elif dtypes.is_int_dtype(dtype): - values = np.random.randint(0, 2, num_rows, bool) - elif dtypes.is_float_dtype(dtype): - values = np.random.normal(0, 100, num_rows) - # elif dtypes.is_str_dtype(dtype): - # str_lengths = np.random.randint(0, 100, num_rows) - # null_indices = np.random.randint(0, num_rows, num_rows // 10) - # str_lengths[null_indices] = 0 - # all_letters = np.array( - # list(string.ascii_letters + string.digits + string.punctuation) - # ) - else: - raise NotImplementedError - - if not optional: - return values - - null_indices = np.random.randint(0, num_rows, num_rows // 10) - if np.issubdtype(values.dtype, np.floating): - values[null_indices] = np.nan - else: - values = values.astype(object) - values[null_indices] = None - return values - - -def create_hf_dataset(num_rows: int) -> None: - ds = datasets.Dataset.from_dict( - { - "bool": [True, False, False], - "int": [-1, 1, 100000], - "uint": [1, 1, 30000], - "float": [1.0, float("nan"), 1000], - "string": ["foo", "barbaz", ""], - "label": ["foo", "bar", "foo"], - # "int": random_values(dtypes.int_dtype, num_rows), - # "float": random_values(dtypes.float_dtype, num_rows), - "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], - # HF sequence as Spotlight sequence - "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, float("nan"), -4, 10]], - "sequence_2d": [ - [[1, 2, 3, 4], [-1, 3, 1, 6]], - [[1, -3, 10], [1, 6, 3]], - [[-10, 0, 10], [-1, -2, -3]], - ], - "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], - # HF sequence as Spotlight array - "sequence_2d_array": [ - [[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]], - [[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]], - [[-10, 0, 10], [-1, -2, -3], [1, 2, 4]], - ], - "sequence_3d_array": [ - [[[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]]], - [[[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]]], - [[[-10, 0, 10], [-1, -2, -3], [1, 2, 4]]], - ], - # HF 2D array as Spotlight sequence - "array_2d_sequence": [ - [[1, 2, 3], [-1, 3, 1]], - [[1, -3, 10], [1, 6, 3]], - [[-10, 0, 10], [-1, -2, -3]], - ], - "array_2d_t_sequence": [ - [[5, 3], [2, 5], [10, 8]], - [[float("nan"), 1], [1, 1], [2, 2]], - [[-1, 1], [1, 10], [10, 1]], - ], - "array_2d_vlen_sequence": [ - [[5, 3], [2, 5], [10, 8]], - [], - [[-1, 1], [1, 10]], - ], - # HF 4D array as Spotlight array - "array_4d": [ - [[[[1.0, 1.0, -10.0]]], [[[-1.0, 1.0, -1.0]]], [[[2.0, 1.0, 1.0]]]], - [ - [[[2.0, -3.0, 0.0]]], - [[[3.0, 6.0, -2.0]]], - [[[4.0, float("nan"), 2.0]]], - [[[4.0, float("nan"), 2.0]]], - ], - [[[[3.0, 10.0, 10.0]]], [[[6.0, 3.0, -3.0]]], [[[4.0, 4.0, 4.0]]]], - ], - # HF list as Spotlight embedding - "list_sequence": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], - }, - features=datasets.Features( - { - "bool": datasets.Value("bool"), - "int": datasets.Value("int32"), - "uint": datasets.Value("uint16"), - "float": datasets.Value("float64"), - "string": datasets.Value("string"), - "label": datasets.ClassLabel( - num_classes=4, names=["foo", "bar", "baz", "barbaz"] - ), - "embedding": datasets.Sequence( - feature=datasets.Value("float64"), length=4 - ), - "sequence_1d": datasets.Sequence(feature=datasets.Value("float64")), - "sequence_2d": datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=2, - ), - "sequence_2d_t": datasets.Sequence( - feature=datasets.Sequence( - feature=datasets.Value("float64"), length=2 - ), - ), - "sequence_2d_array": datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=3, - ), - "sequence_3d_array": datasets.Sequence( - feature=datasets.Sequence( - feature=datasets.Sequence(feature=datasets.Value("float64")), - length=3, - ), - length=1, - ), - "array_2d_sequence": datasets.Array2D(shape=(2, 3), dtype="float64"), - "array_2d_t_sequence": datasets.Array2D(shape=(3, 2), dtype="float64"), - "array_2d_vlen_sequence": datasets.Array2D( - shape=(None, 2), dtype="float64" - ), - "array_4d": datasets.Array4D(shape=(None, 1, 1, 3), dtype="float64"), - "list_sequence": [datasets.Value("float64")], - } - ), - # info=datasets.DatasetInfo(), - # split=datasets.NamedSplit, - ) - ds.save_to_disk("./build/datasets/hf") - print(ds.features) - - -if __name__ == "__main__": - np.random.seed(42) - create_hf_dataset(100) - - ds = datasets.load_from_disk("./build/datasets/hf") - spotlight.show(ds) diff --git a/tests/integration/huggingface/__init__.py b/tests/integration/huggingface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/huggingface/conftest.py b/tests/integration/huggingface/conftest.py new file mode 100644 index 00000000..5f6f089b --- /dev/null +++ b/tests/integration/huggingface/conftest.py @@ -0,0 +1,17 @@ +""" +Pytest Fixtures for Hugging Face tests +""" + +import datasets +import pytest + +from .dataset import create_hf_dataset + + +@pytest.fixture +def dataset() -> datasets.Dataset: + """ + H5 Dataset for tests + """ + + return create_hf_dataset() diff --git a/tests/integration/huggingface/dataset.py b/tests/integration/huggingface/dataset.py new file mode 100644 index 00000000..3b81abb6 --- /dev/null +++ b/tests/integration/huggingface/dataset.py @@ -0,0 +1,146 @@ +""" +Data for Hugging Face tests +""" + +import datetime + +import datasets + + +DATA = { + "bool": [True, False, False], + "int": [-1, 1, 100000], + "uint": [1, 1, 30000], + "float": [1.0, float("nan"), 1000], + "string": ["foo", "barbaz", ""], + "label": ["foo", "bar", "foo"], + "binary": [b"foo", b"bar", b""], + "duration": [-1, 2, 10], + "decimal": [1.0, 3.0, 1000], + "date": [datetime.date.min, datetime.date(2001, 2, 15), datetime.date.max], + "time": [ + datetime.time.min, + datetime.time(14, 24, 15, 2672), + datetime.time.max, + ], + "timestamp": [ + datetime.datetime(1970, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2001, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2170, 2, 15, 14, 24, 15, 2672), + ], + "timestamp_ns": [ + datetime.datetime(1970, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2001, 2, 15, 14, 24, 15, 2672), + datetime.datetime(2170, 2, 15, 14, 24, 15, 2672), + ], + "embedding": [[1, 2, 3, 4], [1, 6, 3, 7], [-1, -2, -3, -4]], + "audio": [ + "data/audio/mono/gs-16b-1c-44100hz.mp3", + "data/audio/1.wav", + "data/audio/stereo/gs-16b-2c-44100hz.ogg", + ], + "image": [ + "data/images/nature-256p.ico", + "data/images/sea-360p.gif", + "data/images/nature-360p.jpg", + ], + # HF sequence as Spotlight sequence + "sequence_1d": [[1, 2, 3, 4], [1, 6, 3], [-1, -2, float("nan"), -4, 10]], + "sequence_2d": [ + [[1, 2, 3, 4], [-1, 3, 1, 6]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "sequence_2d_t": [[[5, 3], [2, 5], [10, 8]], [], [[-1, 1], [1, 10]]], + # HF sequence as Spotlight array + "sequence_2d_array": [ + [[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]], + [[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]], + [[-10, 0, 10], [-1, -2, -3], [1, 2, 4]], + ], + "sequence_3d_array": [ + [[[1, 2, 3, 4], [-1, 3, 1, 6], [1, 2, 4, 4]]], + [[[1, -3, 10], [1, 6, 3], [1, float("nan"), 4]]], + [[[-10, 0, 10], [-1, -2, -3], [1, 2, 4]]], + ], + # HF 2D array as Spotlight sequence + "array_2d_sequence": [ + [[1, 2, 3], [-1, 3, 1]], + [[1, -3, 10], [1, 6, 3]], + [[-10, 0, 10], [-1, -2, -3]], + ], + "array_2d_t_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [[float("nan"), 1], [1, 1], [2, 2]], + [[-1, 1], [1, 10], [10, 1]], + ], + "array_2d_vlen_sequence": [ + [[5, 3], [2, 5], [10, 8]], + [], + [[-1, 1], [1, 10]], + ], + # HF 4D array as Spotlight array + "array_4d": [ + [[[[1.0, 1.0, -10.0]]], [[[-1.0, 1.0, -1.0]]], [[[2.0, 1.0, 1.0]]]], + [ + [[[2.0, -3.0, 0.0]]], + [[[3.0, 6.0, -2.0]]], + [[[4.0, float("nan"), 2.0]]], + [[[4.0, float("nan"), 2.0]]], + ], + [[[[3.0, 10.0, 10.0]]], [[[6.0, 3.0, -3.0]]], [[[4.0, 4.0, 4.0]]]], + ], + # HF list as Spotlight embedding + "list_sequence": [[1, 2, 3], [1, 6, 3, 7, 8], [-1, -2, -3, -4]], +} + +FEATURES = { + "bool": datasets.Value("bool"), + "int": datasets.Value("int32"), + "uint": datasets.Value("uint16"), + "float": datasets.Value("float64"), + "string": datasets.Value("string"), + "label": datasets.ClassLabel(num_classes=4, names=["foo", "bar", "baz", "barbaz"]), + "binary": datasets.Value("binary"), + "duration": datasets.Value("duration[s]"), + "decimal": datasets.Value("decimal128(10, 2)"), + "date": datasets.Value("date32"), + "time": datasets.Value("time64[us]"), + "timestamp": datasets.Value("timestamp[us]"), + "timestamp_ns": datasets.Value("timestamp[ns]"), + "audio": datasets.Audio(), + "image": datasets.Image(), + "embedding": datasets.Sequence(feature=datasets.Value("float64"), length=4), + "sequence_1d": datasets.Sequence(feature=datasets.Value("float64")), + "sequence_2d": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=2, + ), + "sequence_2d_t": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64"), length=2), + ), + "sequence_2d_array": datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + "sequence_3d_array": datasets.Sequence( + feature=datasets.Sequence( + feature=datasets.Sequence(feature=datasets.Value("float64")), + length=3, + ), + length=1, + ), + "array_2d_sequence": datasets.Array2D(shape=(2, 3), dtype="float64"), + "array_2d_t_sequence": datasets.Array2D(shape=(3, 2), dtype="float64"), + "array_2d_vlen_sequence": datasets.Array2D(shape=(None, 2), dtype="float64"), + "array_4d": datasets.Array4D(shape=(None, 1, 1, 3), dtype="float64"), + "list_sequence": [datasets.Value("float64")], +} + + +def create_hf_dataset() -> datasets.Dataset: + ds = datasets.Dataset.from_dict( + DATA, + features=datasets.Features(FEATURES), + ) + return ds diff --git a/tests/integration/huggingface/test_hf.py b/tests/integration/huggingface/test_hf.py new file mode 100644 index 00000000..9380380f --- /dev/null +++ b/tests/integration/huggingface/test_hf.py @@ -0,0 +1,37 @@ +""" +Integration Test on API level for h5 data sources +""" +import pytest +import httpx + +import datasets + +from renumics import spotlight + +from .dataset import DATA + + +def test_get_table_returns_http_ok(dataset: datasets.Dataset) -> None: + """ + Ensure /api/table/ returns a valid response + """ + viewer = spotlight.show(dataset, no_browser=True, wait=False) + response = httpx.Client(base_url=viewer.url).get("/api/table/") + viewer.close() + assert response.status_code == 200 + + +@pytest.mark.parametrize("col", DATA.keys()) +def test_get_cell_returns_http_ok(dataset: str, col: str) -> None: + """ + Serve h5 dataset and get cell data for dtype + """ + viewer = spotlight.show(dataset, no_browser=True, wait=False) + gen_id = ( + httpx.Client(base_url=viewer.url).get("/api/table/").json()["generation_id"] + ) + response = httpx.Client(base_url=viewer.url).get( + f"/api/table/{col}/0?generation_id={gen_id}" + ) + viewer.close() + assert response.status_code == 200