new readers and schemas for reduced data storage in db #437

Closed · wants to merge 4 commits
21 changes: 1 addition & 20 deletions aeon/io/reader.py
@@ -11,8 +11,7 @@
from dotmap import DotMap

from aeon import util
-from aeon.io.api import aeon as aeon_time
-from aeon.io.api import chunk, chunk_key
+from aeon.io.api import chunk_key

_SECONDS_PER_TICK = 32e-6
_payloadtypes = {
@@ -187,24 +186,6 @@ class Encoder(Harp):
    def __init__(self, pattern):
        super().__init__(pattern, columns=["angle", "intensity"])

-    def read(self, file, downsample=True):
-        """Reads encoder data from the specified Harp binary file.
-
-        By default the encoder data is downsampled to 50Hz. Setting downsample to
-        False or None can be used to force the raw data to be returned.
-        """
-        data = super().read(file)
-        if downsample is True:
-            # resample requires a DatetimeIndex so we convert early
-            data.index = aeon_time(data.index)
-
-            first_index = data.first_valid_index()
-            if first_index is not None:
-                # since data is absolute angular position we decimate by taking first of each bin
-                chunk_origin = chunk(first_index)
-                data = data.resample("20ms", origin=chunk_origin).first()
-        return data
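
The removed implementation anchored its 20 ms bins to the chunk start via `origin=`. A minimal sketch, on synthetic timestamps (values are illustrative only), of what the `origin` argument changes in `pandas.resample`:

```python
import pandas as pd

# synthetic samples at 5, 12, 19, 26, 33 ms past midnight (illustrative only)
idx = pd.date_range("2024-01-01 00:00:00.005", periods=5, freq="7ms")
s = pd.Series(range(5), index=idx)

# default bin edges fall at 0 ms, 20 ms, 40 ms, ...
print(s.resample("20ms").first())

# an explicit origin shifts the edges to 10 ms, 30 ms, ... so the samples
# regroup differently -- the removed code used the chunk start as its origin
print(s.resample("20ms", origin=pd.Timestamp("2024-01-01 00:00:00.010")).first())
```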


class Position(Harp):
"""Extract 2D position tracking data for a specific camera.
Expand Down
204 changes: 204 additions & 0 deletions aeon/schema/ingestion_schemas.py
@@ -0,0 +1,204 @@
"""Aeon experiment schemas for DataJoint database ingestion."""

from os import PathLike

import pandas as pd
from dotmap import DotMap

import aeon.schema.core as stream
from aeon.io import reader
from aeon.io.api import aeon as aeon_time
from aeon.schema import foraging, octagon, social_01, social_02, social_03
from aeon.schema.foraging import DepletionFunction, Feeder
from aeon.schema.streams import Device, Stream, StreamGroup


# Define new readers
class _Encoder(reader.Encoder):
    """A version of the encoder reader that can downsample the data."""

    def read(self, file: PathLike[str], sr_hz: int = 50) -> pd.DataFrame:
        """Reads encoder data from the specified Harp binary file."""
        data = super().read(file)
        data.index = aeon_time(data.index)
        first_index = data.first_valid_index()
        if first_index is not None:
            data = data.resample(f"{1/sr_hz}s").first()  # take first sample in each resampled bin
        return data
CodeRabbit comment on lines +20 to +27:

The read method in the _Encoder class is downsampling the data by taking the first sample in each resampled bin. This approach might lead to loss of important information if the first sample is not representative of the entire bin. Consider using a different resampling strategy, such as averaging the samples in each bin, to preserve more information.

-        data = data.resample(f"{1/sr_hz}s").first()  # take first sample in each resampled bin
+        data = data.resample(f"{1/sr_hz}s").mean()  # take average of samples in each resampled bin
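
The PR keeps `.first()` since the encoder reports absolute angular position, where decimation preserves genuinely observed samples. A minimal sketch on synthetic data (timestamps and values are illustrative only) of how the two strategies differ:

```python
import numpy as np
import pandas as pd

# synthetic 1 kHz "angle" signal spanning one second (illustrative only)
idx = pd.date_range("2024-01-01", periods=1000, freq="1ms")
angle = pd.Series(np.linspace(0.0, 360.0, 1000), index=idx, name="angle")

# decimate to 50 Hz by keeping the first sample of each 20 ms bin (PR behaviour)
first = angle.resample("20ms").first()

# alternative suggested in the review: average all samples in each bin
mean = angle.resample("20ms").mean()

# both yield 50 samples per second, but .first() returns values that actually
# occurred, while .mean() smooths them (and near a 359->0 wrap of a circular
# angle, an average can fabricate values that were never observed)
assert len(first) == len(mean)
```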



class _Video(reader.Video):
    """A version of the video reader that drops columns that can be recreated from data and metadata."""

    def read(self, file: PathLike[str], drop_cols=None) -> pd.DataFrame:
        """Reads video metadata from the specified file."""
        drop_cols = ["hw_counter", "_frame", "_path", "_epoch"] if drop_cols is None else drop_cols
        data = pd.read_csv(file, header=0, names=self._rawcolumns)
        data.drop(columns=drop_cols, errors="ignore", inplace=True)
        data.set_index("time", inplace=True)
        return data

Codecov / codecov/patch check warning: added lines L35–L39 in aeon/schema/ingestion_schemas.py were not covered by tests.
CodeRabbit comment on lines +33 to +39:

In the read method of the _Video class, you are dropping columns from the DataFrame without checking if they exist. This could lead to errors if one or more of the specified columns are not present in the DataFrame. Consider adding a check to ensure that only existing columns are dropped.

-        data.drop(columns=drop_cols, errors="ignore", inplace=True)
+        drop_cols = [col for col in drop_cols if col in data.columns]
+        data.drop(columns=drop_cols, inplace=True)
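
For reference, pandas' `DataFrame.drop` with `errors="ignore"` already skips labels that are absent, so the guard suggested above is behaviourally equivalent; a quick sketch on a toy frame:

```python
import pandas as pd

df = pd.DataFrame({"time": [0, 1], "hw_counter": [10, 11]})

# errors="ignore" silently skips missing labels ("_frame" here),
# so no membership pre-check is needed before dropping
df = df.drop(columns=["hw_counter", "_frame"], errors="ignore")
print(df.columns.tolist())  # ['time']
```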



# Define new streams and stream groups
class Video(Stream):
    """Video frame metadata."""

    def __init__(self, pattern):
        super().__init__(_Video(f"{pattern}_*"))


class Encoder(Stream):
    """Wheel magnetic encoder data."""

    def __init__(self, pattern):
        super().__init__(_Encoder(f"{pattern}_90_*"))
CodeRabbit comment on lines +46 to +54:

The __init__ methods in the Video and Encoder classes are hardcoding the pattern for file matching. This reduces the flexibility of these classes and makes them less reusable. Consider passing the pattern as an argument to the __init__ method instead.

-    def __init__(self, pattern):
-        super().__init__(_Video(f"{pattern}_*"))
+    def __init__(self, pattern, file_pattern):
+        super().__init__(_Video(f"{pattern}_{file_pattern}"))

-    def __init__(self, pattern):
-        super().__init__(_Encoder(f"{pattern}_90_*"))
+    def __init__(self, pattern, file_pattern):
+        super().__init__(_Encoder(f"{pattern}_{file_pattern}"))



class Patch(StreamGroup):
    """Data streams for a patch."""

    def __init__(self, pattern):
        super().__init__(pattern, DepletionFunction, Encoder, Feeder)


# Define schemas
octagon01 = DotMap(
    [
        Device("Metadata", stream.Metadata),
        Device("CameraTop", Video, stream.Position),
        Device("CameraColorTop", Video),
        Device("ExperimentalMetadata", stream.SubjectState),
        Device("Photodiode", octagon.Photodiode),
        Device("OSC", octagon.OSC),
        Device("TaskLogic", octagon.TaskLogic),
        Device("Wall1", octagon.Wall),
        Device("Wall2", octagon.Wall),
        Device("Wall3", octagon.Wall),
        Device("Wall4", octagon.Wall),
        Device("Wall5", octagon.Wall),
        Device("Wall6", octagon.Wall),
        Device("Wall7", octagon.Wall),
        Device("Wall8", octagon.Wall),
    ]
)

exp01 = DotMap(
    [
        Device("SessionData", foraging.SessionData),
        Device("FrameTop", Video, stream.Position),
        Device("FrameEast", Video),
        Device("FrameGate", Video),
        Device("FrameNorth", Video),
        Device("FramePatch1", Video),
        Device("FramePatch2", Video),
        Device("FrameSouth", Video),
        Device("FrameWest", Video),
        Device("Patch1", foraging.DepletionFunction, stream.Encoder, foraging.Feeder),
        Device("Patch2", foraging.DepletionFunction, stream.Encoder, foraging.Feeder),
    ]
)

exp02 = DotMap(
    [
        Device("Metadata", stream.Metadata),
        Device("ExperimentalMetadata", stream.Environment, stream.MessageLog),
        Device("CameraTop", Video, stream.Position, foraging.Region),
        Device("CameraEast", Video),
        Device("CameraNest", Video),
        Device("CameraNorth", Video),
        Device("CameraPatch1", Video),
        Device("CameraPatch2", Video),
        Device("CameraSouth", Video),
        Device("CameraWest", Video),
        Device("Nest", foraging.Weight),
        Device("Patch1", Patch),
        Device("Patch2", Patch),
    ]
)

social01 = DotMap(
    [
        Device("Metadata", stream.Metadata),
        Device("Environment", social_02.Environment, social_02.SubjectData),
        Device("CameraTop", Video, social_01.Pose),
        Device("CameraNorth", Video),
        Device("CameraSouth", Video),
        Device("CameraEast", Video),
        Device("CameraWest", Video),
        Device("CameraPatch1", Video),
        Device("CameraPatch2", Video),
        Device("CameraPatch3", Video),
        Device("CameraNest", Video),
        Device("Nest", social_02.WeightRaw, social_02.WeightFiltered),
        Device("Patch1", Patch),
        Device("Patch2", Patch),
        Device("Patch3", Patch),
        Device("RfidGate", social_01.RfidEvents),
        Device("RfidNest1", social_01.RfidEvents),
        Device("RfidNest2", social_01.RfidEvents),
        Device("RfidPatch1", social_01.RfidEvents),
        Device("RfidPatch2", social_01.RfidEvents),
        Device("RfidPatch3", social_01.RfidEvents),
    ]
)


social02 = DotMap(
    [
        Device("Metadata", stream.Metadata),
        Device("Environment", social_02.Environment, social_02.SubjectData),
        Device("CameraTop", Video, social_02.Pose),
        Device("CameraNorth", Video),
        Device("CameraSouth", Video),
        Device("CameraEast", Video),
        Device("CameraWest", Video),
        Device("CameraPatch1", Video),
        Device("CameraPatch2", Video),
        Device("CameraPatch3", Video),
        Device("CameraNest", Video),
        Device("Nest", social_02.WeightRaw, social_02.WeightFiltered),
        Device("Patch1", Patch),
        Device("Patch2", Patch),
        Device("Patch3", Patch),
        Device("GateRfid", social_02.RfidEvents),
        Device("NestRfid1", social_02.RfidEvents),
        Device("NestRfid2", social_02.RfidEvents),
        Device("Patch1Rfid", social_02.RfidEvents),
        Device("Patch2Rfid", social_02.RfidEvents),
        Device("Patch3Rfid", social_02.RfidEvents),
    ]
)


social03 = DotMap(
    [
        Device("Metadata", stream.Metadata),
        Device("Environment", social_02.Environment, social_02.SubjectData),
        Device("CameraTop", Video, social_03.Pose),
        Device("CameraNorth", Video),
        Device("CameraSouth", Video),
        Device("CameraEast", Video),
        Device("CameraWest", Video),
        Device("CameraNest", Video),
        Device("CameraPatch1", Video),
        Device("CameraPatch2", Video),
        Device("CameraPatch3", Video),
        Device("Nest", social_02.WeightRaw, social_02.WeightFiltered),
        Device("Patch1", Patch),
        Device("Patch2", Patch),
        Device("Patch3", Patch),
        Device("PatchDummy1", Patch),
        Device("NestRfid1", social_02.RfidEvents),
        Device("NestRfid2", social_02.RfidEvents),
        Device("GateRfid", social_02.RfidEvents),
        Device("GateEastRfid", social_02.RfidEvents),
        Device("GateWestRfid", social_02.RfidEvents),
        Device("Patch1Rfid", social_02.RfidEvents),
        Device("Patch2Rfid", social_02.RfidEvents),
        Device("Patch3Rfid", social_02.RfidEvents),
        Device("PatchDummy1Rfid", social_02.RfidEvents),
    ]
)


__all__ = ["octagon01", "exp01", "exp02", "social01", "social02", "social03"]

CodeRabbit comment:

The __all__ variable is used to specify the public interface of a module. However, it's not clear from this code whether all the items listed in __all__ are intended to be part of the public interface. Please ensure that only those items that should be accessible when importing the module using from module import * are included in __all__.

-__all__ = ["octagon01", "exp01", "exp02", "social01", "social02", "social03"]
+__all__ = ["intended_public_interface_item1", "intended_public_interface_item2", ...]

15 changes: 8 additions & 7 deletions tests/io/test_api.py
@@ -5,24 +5,25 @@
from pytest import mark

import aeon
+from aeon.schema.ingestion_schemas import social03
from aeon.schema.schemas import exp02
CodeRabbit comment on lines +8 to 9:

The import of social03 from aeon.schema.ingestion_schemas is not used in the code, which can lead to unnecessary overhead and confusion. Please remove unused imports to improve code readability and performance.

- from aeon.schema.ingestion_schemas import social03

-nonmonotonic_path = Path(__file__).parent.parent / "data" / "nonmonotonic"
monotonic_path = Path(__file__).parent.parent / "data" / "monotonic"
+nonmonotonic_path = Path(__file__).parent.parent / "data" / "nonmonotonic"


@mark.api
def test_load_start_only():
    data = aeon.load(
-        nonmonotonic_path, exp02.Patch2.Encoder, start=pd.Timestamp("2022-06-06T13:00:49"), downsample=None
+        nonmonotonic_path, exp02.Patch2.Encoder, start=pd.Timestamp("2022-06-06T13:00:49")
    )
CodeRabbit comment on lines +18 to 19:

In the function test_load_start_only(), there's no error handling for the case when the data loading fails and returns an empty dataset. This could lead to a misleading test result. Consider adding an explicit check for this scenario.

     data = aeon.load(
         nonmonotonic_path, exp02.Patch2.Encoder, start=pd.Timestamp("2022-06-06T13:00:49")
     )
+    assert data is not None, "Data loading failed"
     assert len(data) > 0

    assert len(data) > 0


@mark.api
def test_load_end_only():
    data = aeon.load(
-        nonmonotonic_path, exp02.Patch2.Encoder, end=pd.Timestamp("2022-06-06T13:00:49"), downsample=None
+        nonmonotonic_path, exp02.Patch2.Encoder, end=pd.Timestamp("2022-06-06T13:00:49")
    )
CodeRabbit comment on lines +26 to 27:

Similar to the previous comment, in the function test_load_end_only(), there's no error handling for the case when the data loading fails and returns an empty dataset. This could lead to a misleading test result. Consider adding an explicit check for this scenario.

     data = aeon.load(
         nonmonotonic_path, exp02.Patch2.Encoder, end=pd.Timestamp("2022-06-06T13:00:49")
     )
+    assert data is not None, "Data loading failed"
     assert len(data) > 0
    assert len(data) > 0

@@ -35,22 +36,22 @@ def test_load_filter_nonchunked():

@mark.api
def test_load_monotonic():
-    data = aeon.load(monotonic_path, exp02.Patch2.Encoder, downsample=None)
+    data = aeon.load(monotonic_path, exp02.Patch2.Encoder)
    assert len(data) > 0
    assert data.index.is_monotonic_increasing


@mark.api
def test_load_nonmonotonic():
-    data = aeon.load(nonmonotonic_path, exp02.Patch2.Encoder, downsample=None)
+    data = aeon.load(nonmonotonic_path, exp02.Patch2.Encoder)
    assert not data.index.is_monotonic_increasing


@mark.api
def test_load_encoder_with_downsampling():
    DOWNSAMPLE_PERIOD = 0.02
-    data = aeon.load(monotonic_path, exp02.Patch2.Encoder, downsample=True)
-    raw_data = aeon.load(monotonic_path, exp02.Patch2.Encoder, downsample=None)
+    data = aeon.load(monotonic_path, social03.Patch2.Encoder)
+    raw_data = aeon.load(monotonic_path, exp02.Patch2.Encoder)
CodeRabbit comment on lines 52 to +54:

In the function test_load_encoder_with_downsampling(), the variable DOWNSAMPLE_PERIOD is defined but not used in the subsequent code. This can lead to confusion and should be removed if not necessary.

- DOWNSAMPLE_PERIOD = 0.02

Also, similar to previous comments, there's no error handling for the case when the data loading fails and returns an empty dataset. This could lead to a misleading test result. Consider adding an explicit check for this scenario.

     data = aeon.load(monotonic_path, social03.Patch2.Encoder)
+    assert data is not None, "Data loading failed"
     raw_data = aeon.load(monotonic_path, exp02.Patch2.Encoder)
+    assert raw_data is not None, "Raw data loading failed"

    # Check that the length of the downsampled data is less than the raw data
    assert len(data) < len(raw_data)