From c07ccd4471e337c548ef49ca1437d59187f7515d Mon Sep 17 00:00:00 2001 From: Shailaja Akella Date: Wed, 11 Dec 2024 18:08:42 -0800 Subject: [PATCH] renamed io_utils --- src/dynamic_routing_analysis/io_utils.py | 846 +++++++++++++++++++++++ 1 file changed, 846 insertions(+) create mode 100644 src/dynamic_routing_analysis/io_utils.py diff --git a/src/dynamic_routing_analysis/io_utils.py b/src/dynamic_routing_analysis/io_utils.py new file mode 100644 index 0000000..7ecc61a --- /dev/null +++ b/src/dynamic_routing_analysis/io_utils.py @@ -0,0 +1,846 @@ +import logging +import time + +import npc_lims +import npc_sessions +import numpy as np +import pandas as pd +import xarray as xr +from tqdm import tqdm + +logger = logging.getLogger(__name__) # debug < info < warning < error + +pd.set_option('display.max_columns', None) + + +def get_data_from_npc_sessions(session_id): + """Fetch data from DynamicRoutingSession if files are not found.""" + try: + session = npc_sessions.DynamicRoutingSession(session_id) + trials = session.trials[:] + dprimes = np.array(session.performance.cross_modal_dprime[:]) + epoch = session.epochs[:] + behavior_info = {'trials': trials, + 'dprime': dprimes, + 'is_good_behavior': np.count_nonzero(dprimes >= 1) >= 4, + 'epoch_info': epoch} + units_table = session.units[:] + return session, units_table, behavior_info, None + except Exception as e: + raise FileNotFoundError(f"Failed to load data from DynamicRoutingSession: {e}") + + +def get_session_data(session_id, version='0.0.259'): + + ''' + + :param session_id: ecephys session_id + :param version: cache version + :return: session object (if found), units_table, trials table, + epoch information and session performance + ''' + + # to get current cache version + # npc_lims.get_current_cache_version() + try: + # Attempt to load data from cached files + trials = pd.read_parquet( + npc_lims.get_cache_path('trials', session_id, version=version) + ) + dprimes = np.array(pd.read_parquet( + npc_lims.get_cache_path('performance', session_id, version=version) + ).cross_modal_dprime.values) + epoch = pd.read_parquet( + npc_lims.get_cache_path('epochs', session_id, version=version) + ) + behavior_info = {'trials': trials, + 'dprime': dprimes, + 'is_good_behavior': np.count_nonzero(dprimes >= 1) >= 4, + 'epoch_info': epoch} + + units_table = pd.read_parquet( + npc_lims.get_cache_path('units', session_id, version=version) + ) + return None, units_table, behavior_info, None + + except FileNotFoundError: + # Attempt to load data from DynamicRoutingSession as a fallback + logger.warning(f"File not found for session_id {session_id}. Attempting fallback.") + return get_data_from_npc_sessions(session_id) + + except Exception as e: + raise FileNotFoundError(f"Unexpected error occurred: {e}") + + +def setup_units_table(run_params, units_table): + ''' + Returns the units_table with the column indicating QC + Filters the table for area specific runs + ''' + + units_table['good_unit'] = (units_table['isi_violations_ratio'] < run_params['unit_inclusion_criteria'][ + 'isi_violations']) & \ + (units_table['presence_ratio'] > run_params['unit_inclusion_criteria'][ + 'presence_ratio']) & \ + (units_table['amplitude_cutoff'] < run_params['unit_inclusion_criteria'][ + 'amplitude_cutoff']) & \ + (units_table['firing_rate'] > run_params['unit_inclusion_criteria']['firing_rate']) + + if run_params['run_on_qc_units']: + units_table = units_table[units_table.good_unit] + + areas_to_include = run_params.get('areas_to_include', []) + if areas_to_include: + units_table = units_table[units_table.structure.isin(areas_to_include)] + + areas_to_exclude = run_params.get('areas_to_exclude', []) + if areas_to_exclude: + units_table = units_table[~units_table.structure.isin(areas_to_exclude)] + + return units_table + + +def setup_trials_table(run_params, trials_table): + ''' + Returns trials table excluding aborted trials if running encoding on quiescent period + ''' + + # TO-DO: find out how? Find out what else to include in the trials table. + + return trials_table + + +def define_kernels(run_params): + ''' + Returns kernel info for input variables + ''' + + # Define master kernel list + master_kernels_list = { + 'intercept': {'function_call': 'intercept', 'type': 'discrete', 'length': 0, 'offset': 0, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, 'text': 'constant value'}, + 'vis1_vis': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'target stim in rewarded context'}, + 'sound1_vis': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'target stim in non-rewarded context'}, + 'vis2_vis': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'non-target stim in vis context'}, + 'sound2_vis': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'non-target stim in vis context'}, + 'vis1_aud': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'target stim in non-rewarded context'}, + 'sound1_aud': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'target stim in rewarded context'}, + 'vis2_aud': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'non-target stim in aud context'}, + 'sound2_aud': {'function_call': 'stimulus', 'type': 'discrete', 'length': 1, 'offset': 1, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'non-target stim in aud context'}, + 'nose': {'function_call': 'LP_features', 'type': 'continuous', 'length': 1, 'offset': -0.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'Z-scored Euclidean displacement of nose movements'}, + 'ears': {'function_call': 'LP_features', 'type': 'continuous', 'length': 1, 'offset': -0.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'Z-scored Euclidean displacement of ear movements'}, + 'jaw': {'function_call': 'LP_features', 'type': 'continuous', 'length': 1, 'offset': -0.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'Z-scored Euclidean displacement of jaw movements'}, + 'whisker_pad': {'function_call': 'LP_features', 'type': 'continuous', 'length': 1, 'offset': -0.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'Z-scored Euclidean displacement of whisker pad movements'}, + 'licks': {'function_call': 'licks', 'type': 'discrete', 'length': 1, 'offset': -0.5, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'lick responses'}, + 'running': {'function_call': 'running', 'type': 'continuous', 'length': 1, 'offset': -0.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, 'text': 'Z-scored running speed'}, + 'pupil': {'function_call': 'pupil', 'type': 'continuous', 'length': 1, 'offset': -0.5, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'Z-scored pupil diameter'}, + 'hit': {'function_call': 'choice', 'type': 'discrete', 'length': 3, 'offset': -1.5, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'lick to GO trial'}, + 'miss': {'function_call': 'choice', 'type': 'discrete', 'length': 3, 'offset': -1.5, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'no lick to GO trial'}, + 'correct_reject': {'function_call': 'choice', 'type': 'discrete', 'length': 3, 'offset': -1.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'no lick to NO-GO trial'}, + 'false_alarm': {'function_call': 'choice', 'type': 'discrete', 'length': 3, 'offset': -1.5, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, 'text': 'lick to NO-GO trial'}, + 'context': {'function_call': 'context', 'type': 'discrete', 'length': 0, 'offset': 0, 'orthogonalize': None, + 'num_weights': None, 'dropout': True, 'text': 'block-wise context'}, + 'session_time': {'function_call': 'session_time', 'type': 'continuous', 'length': 0, 'offset': 0, + 'orthogonalize': None, 'num_weights': None, 'dropout': True, + 'text': 'z-scored time in session'} + } + + # Define categories for input variables + categories = { + 'stimulus': ['vis1_vis', 'sound1_vis', 'vis2_vis', 'sound2_vis', 'vis1_aud', 'sound1_aud', 'vis2_aud', + 'sound2_aud'], + 'movements': ['ears', 'nose', 'jaw', 'whisker_pad', 'running', 'pupil', 'licks'], + 'movements_no_licks': ['ears', 'nose', 'jaw', 'whisker_pad', 'running', 'pupil'], + 'choice': ['hit', 'miss', 'correct_reject', 'false_alarm'], + 'LP_features': ['ears', 'nose', 'jaw', 'whisker_pad'] + } + + # Initialize selected keys list + selected_keys = [] + + # Determine selected input variables based on run_params + time_of_interest = run_params.get('time_of_interest', '') + input_variables = run_params.get('input_variables', []) + + # Choose input variables based on 'time_of_interest' + if not input_variables: + if 'trial' in time_of_interest or time_of_interest == 'full': + selected_keys = categories['stimulus'] + categories['movements'] + categories['choice'] + ['context', + 'session_time'] + elif 'quiescent' in time_of_interest: + selected_keys = categories['movements_no_licks'] + ['context', 'session_time'] + elif 'spontaneous' in time_of_interest: + selected_keys = categories['movements_no_licks'] + ['session_time'] + else: + # Extend selected_keys with input variables + for input_variable in input_variables: + selected_keys.extend(categories.get(input_variable, [input_variable])) + + # Add intercept if required + if run_params.get('intercept', False) and 'intercept' not in selected_keys: + selected_keys.append('intercept') + + # Log error if no input variables are selected + if not selected_keys: + raise ValueError("No input variables selected!") # raise value error . + + # remove drop variables if any + drop_keys = run_params.get('drop_variables', []) + if drop_keys and run_params['model_label'] != 'fullmodel': + for drop_key in drop_keys: + sub_keys = categories.get(drop_key, [drop_key]) + for sub_key in sub_keys: + print(get_timestamp() + f': dropping {sub_key}') + selected_keys.remove(sub_key) + + # Build kernels dictionary based on selected keys + kernels = {key: master_kernels_list[key] for key in selected_keys} + + # Update kernel lengths based on run_params + input_window_lengths = run_params.get('input_window_lengths', {}) + if input_window_lengths: + for key, length in input_window_lengths.items(): + if key in kernels: + kernels[key]['length'] = length + else: + raise KeyError(f"Key {key} not found in kernels.") + + # Update orthogonalization keys + input_ortho_keys = run_params.get('orthogonalize_against_context', []) + if input_ortho_keys: + ortho_keys = [] + for input_variable in input_ortho_keys: + ortho_keys.extend(categories.get(input_variable, [input_variable])) + for key in ortho_keys: + if key in kernels: + kernels[key]['orthogonalize'] = True + + return kernels + + +def get_spont_times(run_params, behavior_info): + ''' + Returns timestamps for spontaneous period based on time of interest. + ''' + def pick_values(start, stop, N, L): + iti = 5 # inter-trial interval + arr = np.arange(start, stop - L, iti + L) # Ensure end range does not exceed the stop + if len(arr) < N: # Handle edge case where N exceeds possible choices + logger.warning("Not enough intervals to pick from. Reducing number of snippets.") + N = len(arr) + picked_vals = np.zeros((N, 2)) + picked_vals[:, 0] = np.sort(np.random.choice(arr, size=N, replace=False)) + picked_vals[:, 1] = picked_vals[:, 0] + L + return picked_vals + + epoch = behavior_info['epoch_info'] + if 'Spontaneous' not in epoch.script_name.values: + logger.warning("No spontaneous activity recorded for this session.") + return np.empty((0, 2)) # Return an empty array if no spontaneous data exists + + start_times = epoch[epoch.script_name == 'Spontaneous'].start_time.values + stop_times = epoch[epoch.script_name == 'Spontaneous'].stop_time.values + num_snippets = 0 + L = 0 + + if 'full' in run_params['time_of_interest'] or run_params['time_of_interest'] == 'spontaneous': + return np.column_stack((start_times, stop_times)) + + elif 'trial' in run_params['time_of_interest']: + T = run_params['spontaneous_duration'] + L = run_params['trial_stop_time'] - run_params['trial_start_time'] + num_snippets = int(T // L) + + elif 'quiescent' in run_params['time_of_interest']: + T = run_params['spontaneous_duration'] + L = run_params['quiescent_stop_time'] - run_params['quiescent_start_time'] + num_snippets = int(T // L) + + intervals = [] + for i in range(len(start_times)): + snippets_per_epoch = num_snippets // len(start_times) + intervals.append( + pick_values(start_times[i], stop_times[i], snippets_per_epoch, L) + ) + + return np.vstack(intervals) + + +def establish_timebins(run_params, fit, behavior_info): + ''' + Returns the actual timestamps for each time bin that will be used in the regression model + ''' + + bin_starts = [] + epoch_trace = [] + if 'spontaneous' in run_params['time_of_interest'] or run_params['time_of_interest'] == 'full': + spont_times = get_spont_times(run_params, behavior_info) + for n in range(spont_times.shape[0]): + bin_edges = np.arange(spont_times[n, 0], spont_times[n, 1], run_params['spike_bin_width']) + bin_starts.append(bin_edges[:-1]) + epoch_trace.append([f'spontaneous{n}'] * len(bin_edges[:-1])) + + if 'full' in run_params['time_of_interest']: + if 'trial' in run_params['time_of_interest'] or run_params['time_of_interest'] == 'full': + start = behavior_info['trials'].start_time.values + stop = np.append(start[1:], behavior_info['trials'].stop_time.values[-1]) + for n in range(len(behavior_info['trials'])): + bin_edges = np.arange(start[n], stop[n], run_params['spike_bin_width']) + bin_starts.append(bin_edges[:-1]) + epoch_trace.append([f'trial{n}'] * len(bin_edges[:-1])) + + elif 'trial' in run_params['time_of_interest']: + start = behavior_info['trials'].stim_start_time.values + run_params['trial_start_time'] + stop = behavior_info['trials'].stim_start_time.values + run_params['trial_stop_time'] + for n in range(len(behavior_info['trials'])): + bin_edges = np.arange(start[n], stop[n], run_params['spike_bin_width']) + bin_starts.append(bin_edges[:-1]) + epoch_trace.append([f'trial{n}'] * len(bin_edges[:-1])) + + if 'quiescent' in run_params['time_of_interest']: + start = behavior_info['trials'].stim_start_time.values + run_params['quiescent_start_time'] + stop = behavior_info['trials'].stim_start_time.values + run_params['quiescent_stop_time'] + for n in range(len(behavior_info['trials'])): + bin_edges = np.arange(start[n], stop[n], run_params['spike_bin_width']) + bin_starts.append(bin_edges[:-1]) + epoch_trace.append([f'trial{n}'] * len(bin_edges[:-1])) + + epoch_trace = np.concatenate(epoch_trace) + bin_starts = np.concatenate(bin_starts) + + sorted_indices = np.argsort(bin_starts) + bin_starts = bin_starts[sorted_indices] + epoch_trace = epoch_trace[sorted_indices] + + bin_ends = bin_starts + run_params['spike_bin_width'] + timebins = np.vstack([bin_starts, bin_ends]).T + + fit['spike_bin_width'] = run_params['spike_bin_width'] + fit['timebins'] = timebins + fit['bin_centers'] = bin_starts + run_params['spike_bin_width'] / 2 + fit['epoch_trace'] = epoch_trace + + # Extend time bins to include trace around existing time bins for time-embedding, to create a fuller trace. + scale_factor = int(1 / run_params['spike_bin_width']) + result = next((x for x in range(2 * scale_factor, 6 * scale_factor) if x % 1 == 0), None) + r = result / scale_factor if result is not None else None + + bin_starts_all = [] + epoch_trace_all = [] + for epoch in np.unique(epoch_trace): + # Extend start by `r` + bins = np.arange(bin_starts[epoch_trace == epoch][0] - r, + bin_ends[epoch_trace == epoch][-1] + r, + run_params['spike_bin_width']) + bin_starts_all.append(bins) + epoch_trace_all.append([epoch]*len(bins)) + bin_starts_all = np.concatenate(bin_starts_all, axis=0) + epoch_trace_all = np.concatenate(epoch_trace_all) + + sorted_indices = np.argsort(bin_starts_all) + bin_starts_all = bin_starts_all[sorted_indices] + epoch_trace_all = epoch_trace_all[sorted_indices] + + bin_ends_all = bin_starts_all + run_params['spike_bin_width'] + timebins_all = np.vstack([bin_starts_all, bin_ends_all]).T + + fit['timebins_all'] = timebins_all + fit['bin_centers_all'] = bin_starts_all + run_params['spike_bin_width'] / 2 + fit['epoch_trace_all'] = epoch_trace_all + precision = 5 + rounded_times = np.round(timebins[:, 0], precision) + fit['mask'] = np.array([index for index, value in enumerate(timebins_all[:, 0]) if np.round(value, precision) in rounded_times]) + + assert len(fit['mask']) == timebins.shape[0], 'Incorrect masking, recheck timebins.' + # potentially a precision problem + + return fit + + +def get_spike_counts(spike_times, timebins): + ''' + spike_times, a list of spike times, sorted + timebins, numpy array of bins X start/stop + timebins[i,0] is the start of bin i + timbins[i,1] is the end of bin i + ''' + + counts = np.zeros([np.shape(timebins)[0]]) + spike_pointer = 0 + bin_pointer = 0 + while (spike_pointer < len(spike_times)) & (bin_pointer < np.shape(timebins)[0]): + if spike_times[spike_pointer] < timebins[bin_pointer, 0]: + # This spike happens before the time bin, advance spike + spike_pointer += 1 + elif spike_times[spike_pointer] >= timebins[bin_pointer, 1]: + # This spike happens after the time bin, advance time bin + bin_pointer += 1 + else: + counts[bin_pointer] += 1 + spike_pointer += 1 + + return counts + + +def process_spikes(units_table, run_params, fit): + ''' + Returns a dictionary including spike counts and unit-specific information. + ''' + + # identifies good units + units_table = setup_units_table(run_params, units_table) + + spikes = np.zeros((fit['timebins'].shape[0], len(units_table))) + + for uu, (_, unit) in tqdm(enumerate(units_table.iterrows()), total=len(units_table), desc='getting spike counts'): + spikes[:, uu] = get_spike_counts(np.array(unit['spike_times']), fit['timebins']) + + spike_count_arr = { + 'spike_counts': spikes, + 'bin_centers': fit['bin_centers'], + 'unit_id': units_table.unit_id.values, + 'structure': units_table.structure.values, + 'location': units_table.location.values, + 'quality': units_table.good_unit.values + } + fit['spike_count_arr'] = spike_count_arr + + # Check to make sure there are no NaNs in the fit_trace + assert np.isnan(fit['spike_count_arr']['spike_counts']).sum() == 0, "Have NaNs in spike_count_arr" + + return fit + + +def extract_unit_data(run_params, units_table, behavior_info): + ''' + Creates the fit dictionary + establishes time bins + processes spike times into spike counts for each time bin + ''' + + fit = dict() + fit = establish_timebins(run_params, fit, behavior_info) + fit = process_spikes(units_table, run_params, fit) + + return fit + + +def add_kernels(design, run_params, session, fit, behavior_info): + ''' + Iterates through the kernels in run_params['kernels'] and adds + each to the design matrix + Each kernel must have fields: + offset: + length: + + design the design matrix for this model + run_params the run_json for this model + session the SDK session object for this experiment + fit the fit object for this model + ''' + + run_params['kernels'] = define_kernels(run_params) + fit['failed_kernels'] = set() + fit['kernel_error_dict'] = dict() + + if session is None: + session = npc_sessions.DynamicRoutingSession(run_params["session_id"]) + + for kernel_name in run_params['kernels']: + if 'num_weights' not in run_params['kernels'][kernel_name]: + run_params['kernels'][kernel_name]['num_weights'] = None + design, fit = add_kernel_by_label(kernel_name, design, run_params, session, fit, behavior_info) + + return design, fit + + +def add_kernel_by_label(kernel_name, design, run_params, session, fit, behavior_info): + ''' + Adds the kernel specified by to the design matrix + kernel_name the label for this kernel, will raise an error if not implemented + design the design matrix for this model + run_params the run_json for this model + session the session object for this experiment + fit the fit object for this model + ''' + + print(get_timestamp() + ' Adding kernel: ' + kernel_name) + + try: + kernel_function = globals().get(run_params['kernels'][kernel_name]['function_call']) + if not callable(kernel_function): + raise ValueError(f"Invalid kernel name: {kernel_name}") + input_x = kernel_function(kernel_name, session, fit, behavior_info) + + if run_params['kernels'][kernel_name]['type'] == 'continuous': + input_x = standardize_inputs(input_x) + + if run_params['kernels'][kernel_name]['orthogonalize']: + context_kernel = context('context', session, fit, behavior_info) \ + if 'context' not in design.events.keys() else design.events['context'] + input_x = orthogonalize_this_kernel(input_x, context_kernel) + input_x = standardize_inputs(input_x) + + except Exception as e: + print(get_timestamp() + f"Exception: {e}") + print('Attempting to continue without this kernel.') + + fit['failed_kernels'].add(kernel_name) + fit['kernel_error_dict'][kernel_name] = { + 'error_type': 'kernel', + 'kernel_name': kernel_name, + 'exception': e.args[0], + } + return design, fit + else: + design.add_kernel( + input_x, + run_params['kernels'][kernel_name]['length'], + kernel_name, + offset=run_params['kernels'][kernel_name]['offset'], + num_weights=run_params['kernels'][kernel_name]['num_weights'] + ) + return design, fit + + +def intercept(kernel_name, session, fit, behavior_info): + return np.ones(len(fit['bin_centers_all'])) + + +def context(kernel_name, session, fit, behavior_info): + this_kernel = np.zeros(len(fit['bin_centers_all'])) + epoch_trace = fit['epoch_trace_all'] + + for n, epoch in enumerate(epoch_trace): + if 'trial' in epoch: + trial_no = int(''.join(filter(str.isdigit, epoch))) + this_kernel[n] = 1 if behavior_info['trials'].loc[trial_no, 'is_vis_context'] else -1 + + return this_kernel + + +def pupil(kernel_name, session, fit, behavior_info): + def process_pupil_data(df, behavior_info): + for pos, row in behavior_info['epoch_info'].iterrows(): + # Select rows within the current epoch + epoch_mask = (df.timestamps >= row.start_time) & (df.timestamps < row.stop_time) + epoch_df = df.loc[epoch_mask] + + # Compute the threshold for the current epoch + threshold = np.nanmean(epoch_df['pupil_area']) + 3 * np.nanstd(epoch_df['pupil_area']) + + # Apply threshold and set outliers to NaN within the epoch + df.loc[epoch_mask & (df['pupil_area'] > threshold), 'pupil_area'] = np.nan + df['pupil_area'] = df['pupil_area'].interpolate(method='linear') + return df + + df = process_pupil_data(session._eye_tracking.to_dataframe(), behavior_info) + return bin_timeseries(df.pupil_area.values, df.timestamps.values, fit['timebins_all']) + + +def running(kernel_name, session, fit, behavior_info): + return bin_timeseries(session._running_speed.data, session._running_speed.timestamps, fit['timebins_all']) + + +def licks(kernel_name, session, fit, behavior_info): + lick_times = session._all_licks[0].timestamps + + # Extract the bin edges + bin_starts, bin_stops = fit['timebins_all'][:, 0], fit['timebins_all'][:, 1] + + # Check if any lick times are within each bin + in_bin = (lick_times[:, None] >= bin_starts) & (lick_times[:, None] < bin_stops) + this_kernel = np.any(in_bin, axis=0).astype(int) + + return this_kernel + + +def LP_features(kernel_name, session, fit, behavior_info): + def eu_dist_for_LP(x_c, y_c): + return np.sqrt(x_c ** 2 + y_c ** 2) + + def part_info_LP(part_name, df): + confidence = df[part_name + '_likelihood'].values.astype('float') + temp_norm = df[part_name + '_temporal_norm'].values.astype('float') + x_c = df[part_name + '_x'].values.astype('float') + y_c = df[part_name + '_y'].values.astype('float') + xy = eu_dist_for_LP(x_c, y_c) + + xy[(confidence < 0.98) | (temp_norm > np.nanmean(temp_norm) + 3 * np.nanstd(temp_norm))] = np.nan + xy = pd.Series(xy).interpolate(limit_direction='both').to_numpy() + return xy, confidence + + map_names = {'ears': 'ear_base_l', 'jaw': 'jaw', 'nose': 'nose_tip', 'whisker_pad': 'whisker_pad_l_side'} + try: + df = session._lp[0][:] + except IndexError: + raise IndexError(f'{session.id} is not a session with video.') + timestamps = df['timestamps'].values.astype('float') + lp_part_name = map_names[kernel_name] + part_xy, confidence = part_info_LP(lp_part_name, df) + return bin_timeseries(part_xy, timestamps, fit['timebins_all']) + + +def choice(kernel_name, session, fit, behavior_info): + bin_starts, bin_stops = fit['timebins_all'][:, 0], fit['timebins_all'][:, 1] + if behavior_info['trials']['is_' + kernel_name].any(): + choice_times = behavior_info['trials'][behavior_info['trials']['is_' + kernel_name]].stim_start_time.values + in_bin = (choice_times[:, None] >= bin_starts) & (choice_times[:, None] < bin_stops) + this_kernel = np.any(in_bin, axis=0).astype(int) + else: + raise ValueError(f"No trials with is_{kernel_name}") + return this_kernel + + +def stimulus(kernel_name, session, fit, behavior_info): + stim_name, context_name = kernel_name.split('_') + bin_starts, bin_stops = fit['timebins_all'][:, 0], fit['timebins_all'][:, 1] + filtered_trials = behavior_info['trials'][ + (behavior_info['trials'].stim_name == stim_name) & (behavior_info['trials'].context_name == context_name)] + if not filtered_trials.empty: + stim_times = filtered_trials.stim_start_time.values + in_bin = (stim_times[:, None] >= bin_starts) & (stim_times[:, None] < bin_stops) + this_kernel = np.any(in_bin, axis=0).astype(int) + else: + raise ValueError(f"No trials presented with {stim_name} stimulus in {context_name} context") + return this_kernel + + +def session_time(kernel_name, session, fit, behavior_info): + return fit['bin_centers_all'] + + +# def toeplitz_this_kernel(this_kernel, kernel_length_samples, offset_samples, epoch_trace): +# top_kernel = np.zeros((len(epoch_trace), kernel_length_samples)) + np.nan +# unique_epochs = np.array([epoch_trace[i] for i in sorted(np.unique(epoch_trace, return_index=True)[1])]) +# for epoch in unique_epochs: +# ind = np.where(epoch_trace == epoch)[0] +# top_kernel[ind] = toeplitz(this_kernel[ind], kernel_length_samples, offset_samples).T +# return top_kernel.T + + +def toeplitz_this_kernel(input_x, kernel_length_samples, offset_samples): + ''' + Build a toeplitz matrix aligned to events. + + Args: + events (np.array of 1/0): Array with 1 if the event happened at that time, and 0 otherwise. + kernel_length_samples (int): How many kernel parameters + Returns + np.array, size(len(events), kernel_length_samples) of 1/0 + ''' + + total_len = len(input_x) + events = np.concatenate([input_x, np.zeros(kernel_length_samples)]) + arrays_list = [events] + for i in range(kernel_length_samples - 1): + arrays_list.append(np.roll(events, i + 1)) + this_kernel = np.vstack(arrays_list) + + # pad with zeros, roll offset_samples, and truncate to length + if offset_samples < 0: + this_kernel = np.concatenate([np.zeros((this_kernel.shape[0], np.abs(offset_samples))), this_kernel], axis=1) + this_kernel = np.roll(this_kernel, offset_samples)[:, np.abs(offset_samples):] + elif offset_samples > 0: + this_kernel = np.concatenate([this_kernel, np.zeros((this_kernel.shape[0], offset_samples))], axis=1) + this_kernel = np.roll(this_kernel, offset_samples)[:, :-offset_samples] + return this_kernel[:, :total_len] + + +class DesignMatrix: + def __init__(self, fit): + ''' + A toeplitz-matrix builder for running regression with multiple temporal kernels. + + Args + fit_dict, a dictionary with: + event_timestamps: The actual timestamps for each time bin that will be used in the regression model. + ''' + self.X = None + self.kernel_dict = {} + self.running_stop = 0 + self.events = {'timestamps': fit['bin_centers']} + self.spike_bin_width = fit['spike_bin_width'] + self.epochs = fit['epoch_trace'] + self.mask = fit['mask'] + + def make_labels(self, label, num_weights, offset, length): + base = [label] * num_weights + numbers = [str(x) for x in np.array(range(0, length)) + offset] + return [x[0] + '_' + x[1] for x in zip(base, numbers)] + + def add_kernel(self, input_x, kernel_length, label, offset=0, num_weights=None): + ''' + Add a temporal kernel. + + Args: + input_x (np.array): input data points + kernel_length (int): length of the kernel (in SECONDS). + label (string): Name of the kernel. + offset (int) :offset relative to the events. Negative offsets cause the kernel + to overhang before the event (in SECONDS) + ''' + + # Enforce unique labels + if label in self.kernel_dict.keys(): + raise ValueError('Labels must be unique') + + self.events[label] = input_x[self.mask] + + # CONVERT kernel_length to kernel_length_samples + if num_weights is None: + if kernel_length == 0: + kernel_length_samples = 1 + else: + kernel_length_samples = int(np.ceil((1 / self.spike_bin_width) * kernel_length)) + else: + # Some kernels are hard-coded by number of weights + kernel_length_samples = num_weights + + # CONVERT offset to offset_samples + offset_samples = int(np.floor((1 / self.spike_bin_width) * offset)) + + if np.abs(offset_samples) > 0: + this_kernel = toeplitz_this_kernel(input_x, kernel_length_samples, offset_samples) + else: + this_kernel = input_x.reshape(1, -1) + + # keep only the relevant trace for prediction + this_kernel = this_kernel[:, self.mask] + + self.kernel_dict[label] = { + 'kernel': this_kernel, + 'kernel_length_samples': kernel_length_samples, + 'offset_samples': offset_samples, + 'kernel_length_seconds': kernel_length, + 'offset_seconds': offset, + 'ind_start': self.running_stop, + 'ind_stop': self.running_stop + kernel_length_samples + } + self.running_stop += kernel_length_samples + + def get_X(self, kernels=None): + ''' + Get the design matrix. + Args: + kernels (optional list of kernel string names): which kernels to include (for model selection) + Returns: + X (xr.array): The design matrix + ''' + if kernels is None: + kernels = self.kernel_dict.keys() + + kernels_to_use = [] + param_labels = [] + for kernel_name in kernels: + kernels_to_use.append(self.kernel_dict[kernel_name]['kernel']) + param_labels.append(self.make_labels(kernel_name, + np.shape(self.kernel_dict[kernel_name]['kernel'])[0], + self.kernel_dict[kernel_name]['offset_samples'], + self.kernel_dict[kernel_name]['kernel_length_samples'])) + + X = np.vstack(kernels_to_use) + x_labels = np.hstack(param_labels) + + assert np.shape(X)[0] == np.shape(x_labels)[0], 'Weight Matrix must have the same length as the weight labels' + + X_array = xr.DataArray( + X, + dims=('weights', 'timestamps'), + coords={'weights': x_labels, + 'timestamps': self.events['timestamps']} + ) + self.X = X_array.T + return X_array.T + + +# def bin_timeseries2(x, x_timestamps, timebins): +# Tm = timebins.shape[0] +# start_indices = np.searchsorted(x_timestamps, timebins[:, 0], side='left') +# end_indices = np.searchsorted(x_timestamps, timebins[:, 1], side='right') +# +# binned = np.full(Tm, np.nan) +# +# for t_i in range(Tm): +# indices = slice(start_indices[t_i], end_indices[t_i]) +# if indices.start < indices.stop: # Check if the slice is non-empty +# binned[t_i] = np.nanmean(x[indices]) +# elif t_i > 0: # Propagate previous value +# binned[t_i] = binned[t_i - 1] +# return binned + + +def bin_timeseries(x, x_timestamps, timebins_all): + Tm = timebins_all.shape[0] + start_indices = np.searchsorted(x_timestamps, timebins_all[:, 0], side='left') + end_indices = np.searchsorted(x_timestamps, timebins_all[:, 1], side='right') + + binned = np.full(Tm, np.nan) + mask = [] + for t_i in range(Tm): + indices = slice(start_indices[t_i], end_indices[t_i]) + if indices.start < indices.stop: # Check if the slice is non-empty + binned[t_i] = np.nanmean(x[indices]) + elif t_i > 0: # Propagate previous value + binned[t_i] = binned[t_i - 1] + return binned + + +def get_timestamp(): + t = time.localtime() + return time.strftime('%Y-%m-%d: %H:%M:%S') + ' ' + + +def orthogonalize_this_kernel(this_kernel, y): + mat_to_ortho = np.concatenate((y.reshape(-1, 1), this_kernel.reshape(-1, 1)), axis=1) + print(get_timestamp() + ' : ' + 'othogonalizing against context') + Q, R = np.linalg.qr(mat_to_ortho) + return Q[:, 1] + + +def standardize_inputs(timeseries, mean_center=True, unit_variance=True, max_value=None): + ''' + Performs three different input standarizations to the timeseries + if mean_center, the timeseries is adjusted to have 0-mean. This can be performed with unit_variance. + if unit_variance, the timeseries is adjusted to have unit variance. This can be performed with mean_center. + if max_value is given, then the timeseries is normalized by max_value. This cannot be performed with mean_center and unit_variance. + ''' + if (max_value is not None) & (mean_center or unit_variance): + raise Exception( + 'Cannot perform max_value standardization and mean_center or unit_variance standardizations together.') + + if mean_center: + print(get_timestamp() + ' : ' + 'mean centering') + timeseries = timeseries - np.mean(timeseries) # mean center + if unit_variance: + print(get_timestamp() + ' : ' + 'standardized to unit variance') + timeseries = timeseries / np.std(timeseries) + if max_value is not None: + print(get_timestamp() + ' : ' + 'normalized by max value: ' + str(max_value)) + timeseries = timeseries / max_value + + return timeseries