Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: New Sampler Splitters #37

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions benchmark_utils/splitter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import numpy as np

from sklearn.utils import check_random_state
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GroupShuffleSplit
Expand Down Expand Up @@ -92,3 +95,67 @@ def split(self, dataset, df_meta=None):
dataset, idx=1, indices=i_test
)
yield X_train, X_test, y_train, y_test


class SamplerMetaSplitter(BaseCrossValidator):
    """Cross-validator yielding a random subset of another splitter's folds.

    The wrapped ``base_splitter`` is asked for all of its splits and a
    subset of them, drawn without replacement, is yielded. The size of the
    subset is given either as a ``fraction`` of the available splits or as
    an absolute ``n_splits``; exactly one of the two must be provided.

    Parameters
    ----------
    base_splitter : object
        Splitter exposing ``get_n_splits(dataset, df_meta)`` and
        ``split(dataset, df_meta)``.
    fraction : float, optional
        Share of the base splits to keep, in ``(0, 1]``. The count is
        rounded down, but at least one split is kept.
    n_splits : int, optional
        Number of base splits to keep (positive integer). Capped at the
        number of splits the base splitter provides.
    random_state : int, RandomState instance or None, default=None
        Seed or generator used to pick the splits. It is stored untouched
        and only turned into a generator inside ``split``, so an integer
        seed selects the same folds on every call to ``split``.

    Raises
    ------
    ValueError
        If both or neither of ``fraction`` and ``n_splits`` are given, if
        ``fraction`` is outside ``(0, 1]``, or if ``n_splits`` is not a
        positive integer.
    """

    def __init__(
        self, base_splitter, fraction=None, n_splits=None, random_state=None
    ):
        # Validate input parameters
        if fraction is not None and n_splits is not None:
            raise ValueError(
                "Specify either 'fraction' or 'n_splits', not both."
            )
        if fraction is None and n_splits is None:
            raise ValueError(
                "Either 'fraction' or 'n_splits' must be provided."
            )
        if fraction is not None and not (0 < fraction <= 1):
            raise ValueError(
                "'fraction' must be between 0 (exclusive)"
                " and 1 (inclusive)."
            )
        if n_splits is not None:
            # NumPy integer scalars are accepted alongside built-in ints.
            is_integer = isinstance(n_splits, (int, np.integer))
            if not is_integer or n_splits <= 0:
                raise ValueError("'n_splits' must be a positive integer.")

        self.base_splitter = base_splitter
        self.fraction = fraction
        self.n_splits = n_splits
        # Keep the raw seed: building the generator here would make every
        # later call to ``split`` continue the same random stream and
        # therefore pick different folds each time, even with an int seed.
        self.random_state = random_state

    def _n_sampled(self, total_splits):
        """Return how many of ``total_splits`` base splits are kept."""
        if self.fraction is not None:
            wanted = max(1, int(np.floor(total_splits * self.fraction)))
        else:
            wanted = int(self.n_splits)
        return min(wanted, total_splits)

    def get_n_splits(self, dataset=None, df_meta=None):
        """Return the number of splits that ``split`` will yield.

        Parameters
        ----------
        dataset, df_meta
            Forwarded unchanged to ``base_splitter.get_n_splits``.

        Returns
        -------
        int
            Number of sampled splits, never more than the base splitter
            reports.
        """
        total_splits = self.base_splitter.get_n_splits(dataset, df_meta)
        return self._n_sampled(total_splits)

    def split(self, dataset, df_meta=None):
        """Yield a random subset of the base splitter's splits.

        Parameters
        ----------
        dataset, df_meta
            Forwarded unchanged to ``base_splitter.split``.

        Yields
        ------
        object
            The items produced by ``base_splitter.split``, in the order in
            which they were drawn.
        """
        # Generate all possible splits. NOTE(review): this keeps every
        # base split in memory at once; confirm that is acceptable for the
        # wrapped splitters.
        all_splits = list(self.base_splitter.split(dataset, df_meta))
        total_splits = len(all_splits)
        if total_splits == 0:
            # Nothing to sample from; also avoids asking the generator to
            # choose from an empty population.
            return

        n_sample = self._n_sampled(total_splits)

        # Sample unique split indices without replacement
        rng = check_random_state(self.random_state)
        sampled_indices = rng.choice(
            total_splits, size=n_sample, replace=False
        )

        # Yield only the sampled splits
        for idx in sampled_indices:
            yield all_splits[idx]
29 changes: 26 additions & 3 deletions objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from benchmark_utils.splitter import IntraSessionSplitter
from benchmark_utils.splitter import InterSessionSplitter
from benchmark_utils.splitter import InterSubjectSplitter
from benchmark_utils.splitter import SamplerMetaSplitter


class Objective(BaseObjective):
Expand All @@ -32,9 +33,14 @@ class Objective(BaseObjective):
"evaluation_process": [
"intra_session",
"inter_sessions",
"sample_intra_session",
"sample_inter_sessions",
bruAristimunha marked this conversation as resolved.
Show resolved Hide resolved
"sample_inter_subjects",
"inter_subjects",
],
"n_folds": [5],
"seed": [2024],
"fraction": [0.1],
}

is_convex = False
Expand All @@ -61,10 +67,27 @@ def set_data(self, dataset, sfreq, paradigm_name, dataset_name):
self.cv = InterSessionSplitter()
elif self.evaluation_process == "inter_subjects":
self.cv = InterSubjectSplitter(n_folds=self.n_folds)
else:
raise ValueError(
f"unknown evaluation process '{self.evaluation_process}'"
elif self.evaluation_process == "sample_intra_session":
self.cv = SamplerMetaSplitter(
base_splitter=IntraSessionSplitter(n_folds=self.n_folds),
random_state=self.seed,
fraction=self.fraction,
)
elif self.evaluation_process == "sample_inter_session":
self.cv = SamplerMetaSplitter(
base_splitter=InterSessionSplitter(),
random_state=self.seed,
fraction=self.fraction,
)
elif self.evaluation_process == "sample_inter_subject":
self.cv = SamplerMetaSplitter(
base_splitter=InterSubjectSplitter(n_folds=self.n_folds),
random_state=self.seed,
fraction=self.fraction,
)
else:
raise ValueError(f"unknown evaluation process '"
f"{self.evaluation_process}'")

self.cv_metadata = dict(df_meta=dataset.get_metadata())
self.extra_info = dict(
Expand Down
Loading