
Commit

Merge pull request #38 from int-brain-lab/io_dev
io refactored, tests expanded, selection of different datasets possible
grg2rsr authored Dec 5, 2024
2 parents 71c1e6b + 869e07b commit b1608b5
Showing 11 changed files with 259 additions and 196 deletions.
288 changes: 136 additions & 152 deletions src/iblphotometry/io.py
@@ -11,25 +11,103 @@
)


def from_array(
times: np.ndarray, data: np.ndarray, channel_names: list[str] | None = None
def from_raw_neurophotometrics_file_to_raw_df(
path: str | Path,
validate=True,
) -> pd.DataFrame:
return pd.DataFrame(data, index=times, columns=channel_names)
path = Path(path) if isinstance(path, str) else path
match path.suffix:
case '.csv':
raw_df = pd.read_csv(path)
case '.pqt':
raw_df = pd.read_parquet(path)

if validate:
raw_df = _validate_neurophotometrics_df(raw_df)

return raw_df
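
A minimal usage sketch for the loader above (the file path is hypothetical, and the package is assumed to be importable as iblphotometry, matching src/iblphotometry/io.py):

from iblphotometry.io import from_raw_neurophotometrics_file_to_raw_df

raw_df = from_raw_neurophotometrics_file_to_raw_df('raw_photometry.csv', validate=True)
# raw_df holds the unprocessed Neurophotometrics table: a LedState column, a timestamp
# column (SystemTimestamp or Timestamp) and one column per photometry ROI.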


def from_raw_neurophotometrics_df_to_ibl_df(
raw_df: pd.DataFrame, rois=None, drop_first=True
) -> pd.DataFrame:
if rois is None:
rois = infer_data_columns(raw_df)

ibl_df = raw_df.filter(items=rois, axis=1).sort_index(axis=1)
timestamp_name = (
'SystemTimestamp' if 'SystemTimestamp' in raw_df.columns else 'Timestamp'
)
ibl_df['times'] = raw_df[timestamp_name]
ibl_df['wavelength'] = np.nan
ibl_df['name'] = ''
ibl_df['color'] = ''

# TODO the names column in channel_meta_map should actually be user defined (experiment description file?)
channel_meta_map = pd.DataFrame(LIGHT_SOURCE_MAP)
led_states = pd.DataFrame(LED_STATES).set_index('Condition')
states = raw_df['LedState']

for state in states.unique():
ir, ic = np.where(led_states == state)
# if not present, multiple LEDs are active
if ic.size == 0:
# find row
ir = np.argmax(led_states['No LED ON'] > state) - 1
# find active combo
possible_led_combos = [(1, 2), (1, 3), (2, 3), (1, 2, 3)]
for combo in possible_led_combos: # drop enumerate
if state == sum([led_states.iloc[ir, c] for c in combo]):
name = '+'.join([channel_meta_map['name'][c] for c in combo])
color = '+'.join([channel_meta_map['color'][c] for c in combo])
wavelength = np.nan
ibl_df.loc[states == state, ['name', 'color', 'wavelength']] = (
name,
color,
wavelength,
)
else:
for cn in ['name', 'color', 'wavelength']:
ibl_df.loc[states == state, cn] = channel_meta_map.iloc[ic[0]][cn]

# drop first frame
if drop_first:
ibl_df = ibl_df.iloc[1:].reset_index()

return ibl_df
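
A usage sketch for the conversion above, chained onto the raw loader (the path is hypothetical; the exact channel names and wavelengths come from the imported LIGHT_SOURCE_MAP and LED_STATES):

from iblphotometry.io import (
    from_raw_neurophotometrics_file_to_raw_df,
    from_raw_neurophotometrics_df_to_ibl_df,
)

raw_df = from_raw_neurophotometrics_file_to_raw_df('raw_photometry.pqt')
ibl_df = from_raw_neurophotometrics_df_to_ibl_df(raw_df, drop_first=True)
# ibl_df keeps one row per acquired frame: the ROI columns plus 'times', 'name',
# 'color' and 'wavelength' describing the active LED; frames where several LEDs
# were on get a '+'-joined name and a NaN wavelength.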


def from_raw_neurophotometrics_file_to_ibl_df(
path: str | Path,
drop_first=True,
validate=True,
) -> pd.DataFrame:
raw_df = from_raw_neurophotometrics_file_to_raw_df(path, validate=validate)
ibl_df = from_raw_neurophotometrics_df_to_ibl_df(raw_df, drop_first=drop_first)

return ibl_df
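
The wrapper above collapses the two previous steps into a single call; a sketch with a hypothetical path:

from iblphotometry.io import from_raw_neurophotometrics_file_to_ibl_df

ibl_df = from_raw_neurophotometrics_file_to_ibl_df('raw_photometry.csv', drop_first=True, validate=True)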


def from_ibl_pqt_to_ibl_df(path: str | Path, validate=False):
if validate is True:
# TODO
raise NotImplementedError
return pd.read_parquet(path)


def from_ibl_dataframe(
raw_df: pd.DataFrame,
ibl_df: pd.DataFrame,
data_columns: list[str] | None = None,
time_column: str | None = None,
channel_column: str = 'name',
channel_names: list[str] | None = None,
rename: dict | None = None,
) -> dict:
"""reads in a pandas.DataFrame and converts it into nap.TsdDataframes. Performs the time demultiplexing operation.
"""main function to convert to analysis ready format
Args:
raw_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt
ibl_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt
data_columns (list[str], optional): The names of the columns in the dataframe that contain the signals of different fibers. By default, they are named RegionXX. If None is provided, All columns that start with `Region` are treated as data columns. Defaults to None.
time_column (str, optional): The name of the column that contains the timestamps. If None is provided, it is assumed that `time` is in the name. Defaults to None.
channel_column (str, optional): The name of the column that contains the acquisition channel names. Defaults to 'name'.
@@ -43,75 +121,46 @@ def from_ibl_dataframe(
# data_columns is a list of str that specifies the names of the column that hold the actual data, like 'RegionXX'
# channel_column is the column that specifies the temporally multiplexed acquisition channels

# infer if not explicitly provided: defaults to everything that starts with 'Region'
if data_columns is None:
# this hacky parser currently deals with the inconsistency between Carolina's and Alejandro's extraction
# https://github.com/int-brain-lab/ibl-photometry/issues/35
data_columns = [
col
for col in raw_df.columns
if col.startswith('Region') or col.startswith('G')
]
data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns

# infer name of time column if not provided
if time_column is None:
time_columns = [col for col in raw_df.columns if 'time' in col.lower()]
time_columns = [col for col in ibl_df.columns if 'time' in col.lower()]
assert len(time_columns) == 1
time_column = time_columns[0]

# infer channel names if they are not explicitly provided
if channel_names is None:
channel_names = raw_df[channel_column].unique()
channel_names = ibl_df[channel_column].unique()

# drop empty acquisition channels
to_drop = ['None', '']
channel_names = [ch for ch in channel_names if ch not in to_drop]

raw_dfs = {}
dfs = {}
for channel in channel_names:
# get the data for the band
df = raw_df.groupby(channel_column).get_group(channel)
df = ibl_df.groupby(channel_column).get_group(channel)
# if rename dict is passed, rename Region0X to the corresponding brain region
if rename is not None:
df = df.rename(columns=rename)
data_columns = rename.values()
raw_dfs[channel] = df.set_index(time_column)[data_columns]

return raw_dfs
dfs[channel] = df.set_index(time_column)[data_columns]


def from_ibl_dataframes(raw_df: pd.DataFrame, locations_df: pd.DataFrame):
data_columns = (list(locations_df.index),)
rename = locations_df['brain_region'].to_dict()

read_config = dict(
data_columns=data_columns,
time_column='times',
channel_column='name',
rename=rename,
)

return from_ibl_dataframe(raw_df, **read_config)
return dfs
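
A usage sketch of the demultiplexing above (the data column names and the rename mapping are hypothetical examples; the channel keys come from the 'name' column of the input):

from iblphotometry.io import from_ibl_dataframe, from_ibl_pqt_to_ibl_df

ibl_df = from_ibl_pqt_to_ibl_df('photometry.signal.pqt')
dfs = from_ibl_dataframe(
    ibl_df,
    data_columns=['Region01', 'Region02'],  # defaults to columns starting with 'Region' or 'G'
    time_column='times',                    # inferred from the column names if left as None
    channel_column='name',
    rename={'Region01': 'DMS', 'Region02': 'VTA'},  # hypothetical fiber -> brain region mapping
)
# dfs maps each acquisition channel name to a DataFrame indexed by time,
# with one column per (renamed) fiber.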


def from_ibl_pqt(
signal_pqt_path: str | Path,
locations_pqt_path: Optional[str | Path] = None,
):
"""reads in a photometry.signal.pqt files as they are registered in alyx.
# read from a single pqt
# if both are provided, do both

Args:
signal_pqt_path (str | Path): _description_
locations_pqt_path (str | Path, optional): _description_. Defaults to None.
Returns:
_type_: _description_
"""

raw_df = pd.read_parquet(signal_pqt_path)
ibl_df = pd.read_parquet(signal_pqt_path)
if locations_pqt_path is not None:
locations_df = pd.read_parquet(locations_pqt_path)
return from_ibl_dataframes(raw_df, locations_df)
return from_ibl_dataframes(ibl_df, locations_df)
else:
warnings.warn(
'loading a photometry.signal.pqt file without its corresponding photometryROI.locations.pqt'
@@ -126,135 +175,61 @@ def from_ibl_pqt(
rename=rename,
)

return from_ibl_dataframe(raw_df, **read_config)


def from_raw_neurophotometrics_ibl_df(
raw_df: pd.DataFrame, rois=None, drop_first=True
) -> pd.DataFrame:
"""reads in parses the output of the neurophotometrics FP3002
Args:
raw_df (pd.DataFrame): _description_
rois (_type_, optional): _description_. Defaults to None.
Returns:
pd.DataFrame: a dataframe in the same format as stored in alyx as pqt.
"""
if rois is None:
rois = [col for col in raw_df.columns if col.startswith('G')]

df = raw_df.filter(items=rois, axis=1).sort_index(axis=1)
timestamp_name = (
'SystemTimestamp' if 'SystemTimestamp' in raw_df.columns else 'Timestamp'
)
df['times'] = raw_df[timestamp_name]
df['wavelength'] = np.nan
df['name'] = ''
df['color'] = ''

# TODO the names column in channel_meta_map should actually be user defined (experiment description file?)
channel_meta_map = pd.DataFrame(LIGHT_SOURCE_MAP)
led_states = pd.DataFrame(LED_STATES).set_index('Condition')
states = raw_df['LedState']
return from_ibl_dataframe(ibl_df, **read_config)

for state in states.unique():
ir, ic = np.where(led_states == state)
# if not present, multiple LEDs are active
if ic.size == 0:
# find row
ir = np.argmax(led_states['No LED ON'] > state) - 1
# find active combo
possible_led_combos = [(1, 2), (1, 3), (2, 3), (1, 2, 3)]
for combo in possible_led_combos: # drop enumerate
if state == sum([led_states.iloc[ir, c] for c in combo]):
name = '+'.join([channel_meta_map['name'][c] for c in combo])
color = '+'.join([channel_meta_map['color'][c] for c in combo])
wavelength = np.nan
df.loc[states == state, ['name', 'color', 'wavelength']] = (
name,
color,
wavelength,
)
else:
for cn in ['name', 'color', 'wavelength']:
df.loc[states == state, cn] = channel_meta_map.iloc[ic[0]][cn]

# drop first frame
if drop_first:
df = df.iloc[1:].reset_index()

return df


def from_raw_neurophotometrics_file_to_ibl_df(
path: str | Path,
drop_first=True,
validate=True,
) -> pd.DataFrame:
"""reads a raw neurophotometrics file (in .csv or .pqt format) as they are written by the neurophotometrics software
Args:
path (str | Path): path to either the .csv file as written by the neurophotometrics bonsai workflow, or a path to a .pqt file as stored in alyx
drop_first (bool, optional): The first frame is all LEDs on. If true, this frame is dropped. Defaults to True.
validate (bool, optional): if true, enforces pydantic validation of the datatypes. Defaults to True
Raises:
NotImplementedError: _description_
def from_ibl_dataframes(ibl_df: pd.DataFrame, locations_df: pd.DataFrame):
# if locations are present
data_columns = (list(locations_df.index),)
rename = locations_df['brain_region'].to_dict()

Returns:
nap.TsdFrame: _description_ # FIXME
"""
warnings.warn(
'loading photometry from raw neurophotometrics output. The data will _not_ be synced and is being split into channels by LedState (converted to LED wavelength in nm)'
read_config = dict(
data_columns=data_columns,
time_column='times',
channel_column='name',
rename=rename,
)
if isinstance(path, str):
path = Path(path)
if path.suffix == '.csv':
# really raw as it comes out of the device
# todo figure out the header
raw_df = pd.read_csv(path)
elif path.suffix == '.pqt':
# as it is stored
raw_df = pd.read_parquet(path)
else:
raise NotImplementedError

if validate:
raw_df = _validate_ibl_dataframe(raw_df)

df = from_raw_neurophotometrics_ibl_df(raw_df)

# drop first frame
if drop_first:
df = df.iloc[1:].reset_index()

return df
return from_ibl_dataframe(ibl_df, **read_config)
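
A sketch of the pqt-based entry point above (paths are hypothetical; the locations file supplies the ROI-to-brain-region mapping used for renaming):

from iblphotometry.io import from_ibl_pqt

# with the companion locations file, the RegionXX columns are renamed to brain regions
dfs = from_ibl_pqt('photometry.signal.pqt', 'photometryROI.locations.pqt')

# without it, the warning about the missing photometryROI.locations.pqt is raised (see above)
dfs = from_ibl_pqt('photometry.signal.pqt')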


def from_raw_neurophotometrics_file(
path: str | Path,
drop_first=True,
validate=True,
) -> dict:
df = from_raw_neurophotometrics_file_to_ibl_df(
# this one bypasses everything
ibl_df = from_raw_neurophotometrics_file_to_ibl_df(
path, drop_first=drop_first, validate=validate
)
data_columns = [col for col in df.columns if col.startswith('G')]
# data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns
read_config = dict(
data_columns=data_columns,
# data_columns=data_columns,
time_column='times',
channel_column='name',
)
return from_ibl_dataframe(df, **read_config)
return from_ibl_dataframe(ibl_df, **read_config)
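
And a sketch for the direct entry point above (hypothetical path), which goes straight from the raw Bonsai output to the per-channel dict without an intermediate pqt:

from iblphotometry.io import from_raw_neurophotometrics_file

dfs = from_raw_neurophotometrics_file('raw_photometry.csv', drop_first=True, validate=True)
# same dict-of-DataFrames output as from_ibl_dataframe, keyed by channel name,
# with times taken directly from the Neurophotometrics timestamps.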


"""
## ## ### ## #### ######## ### ######## #### ####### ## ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## ### ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## #### ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## ## ######### ## ## ## ## ######### ## ## ## ## ## ####
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ###
### ## ## ######## #### ######## ## ## ## #### ####### ## ##
"""


def _validate_ibl_dataframe(
def validate_ibl_dataframe(df: pd.DataFrame) -> pd.DataFrame: ...


def _validate_neurophotometrics_df(
df: pd.DataFrame,
data_columns=None,
) -> pd.DataFrame:
if data_columns is None:
data_columns = [col for col in df.columns if col.startswith('G')]
data_columns = infer_data_columns(df) if data_columns is None else data_columns

schema_raw_data = pandera.DataFrameSchema(
columns=dict(
@@ -280,3 +255,12 @@ def _validate_neurophotometrics_digital_inputs(df: pd.DataFrame) -> pd.DataFrame
)
)
return schema_digital_inputs.validate(df)
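
A condensed sketch of the pandera pattern used by the validators above (the column names and dtypes shown here are assumptions; the authoritative schema is the one defined in the functions themselves):

import pandera

def sketch_validate(df, data_columns):
    # hypothetical, abbreviated schema: typed metadata columns plus one Float64 column per ROI
    schema = pandera.DataFrameSchema(
        columns=dict(
            LedState=pandera.Column(pandera.Int16, coerce=True),
            SystemTimestamp=pandera.Column(pandera.Float64),
            **{col: pandera.Column(pandera.Float64) for col in data_columns},
        )
    )
    return schema.validate(df)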


def infer_data_columns(df: pd.DataFrame) -> list[str]:
# this hacky parser currently deals with the inconsistency between Carolina's and Alejandro's extraction
# https://github.com/int-brain-lab/ibl-photometry/issues/35
data_columns = [
col for col in df.columns if col.startswith('Region') or col.startswith('G')
]
return data_columns
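
For illustration, a tiny check of the column inference above (column names are made up):

import pandas as pd
from iblphotometry.io import infer_data_columns

df = pd.DataFrame(columns=['FrameCounter', 'SystemTimestamp', 'LedState', 'Region01', 'G2'])
infer_data_columns(df)  # -> ['Region01', 'G2']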