Clean up functions for creating paths CSV files.
aecelaya committed Sep 13, 2024
1 parent d9071f6 commit 2753293
Showing 1 changed file with 99 additions and 65 deletions.
164 changes: 99 additions & 65 deletions mist/runtime/utils.py
@@ -118,9 +118,6 @@ def get_float32_example_memory_size(

def set_warning_levels() -> None:
"""Set warning levels to ignore warnings."""
warnings.simplefilter(
action="ignore", category=np.VisibleDeprecationWarning
)
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
@@ -193,54 +190,67 @@ def get_files_df(path_to_dataset_json: str, train_or_test: str) -> pd.DataFrame:
"""Get dataframe with file paths for each patient in the dataset.
Args:
path_to_dataset_json: Path to dataset json file with the dataset
path_to_dataset_json: Path to dataset json file with dataset
information.
train_or_test: "train" or "test". If "train", the dataframe will have
columns for the mask and images. If "test", the dataframe will have
columns for the images.
columns for the mask and images. If "test", the dataframe
will have columns for the images.
Returns:
df: Dataframe with file paths for each patient in the dataset.
DataFrame with file paths for each patient in the dataset.
"""
# Read JSON file with dataset parameters.
dataset_information = read_json_file(path_to_dataset_json)
dataset_info = read_json_file(path_to_dataset_json)

# Get the names of the columns in the dataframe.
filename_dictionary = {}
# Determine columns based on the mode (train or test).
columns = ["id"]
if train_or_test == "train":
filename_dictionary["mask"] = dataset_information["mask"]

for key in dataset_information["images"].keys():
filename_dictionary[key] = dataset_information["images"][key]
columns.append("mask")
columns.extend(dataset_info["images"].keys())

dataframe_columns = ["id"] + list(filename_dictionary.keys())
paths_dataframe = pd.DataFrame(columns=dataframe_columns)
row_data_as_dictionary = dict.fromkeys(dataframe_columns)
# Base directory for the dataset.
base_dir = os.path.abspath(dataset_info[f"{train_or_test}-data"])

# Get the base directory for the dataset.
base_directory = os.path.abspath(
dataset_information[f"{train_or_test}-data"]
)
# Initialize an empty DataFrame with the determined columns.
df = pd.DataFrame(columns=columns)

# Get the list of patient IDs.
patient_ids = listdir_with_no_hidden_files(base_directory)
# Get list of patient IDs.
patient_ids = listdir_with_no_hidden_files(base_dir)

# Iterate over the patients and collect the file paths for each one.
for patient_id in patient_ids:
row_data_as_dictionary["id"] = patient_id
path_to_patient_data = os.path.join(base_directory, patient_id)
patient_files = get_files_list(path_to_patient_data)

for file in patient_files:
for image_type in filename_dictionary:
for image_identifying_string in filename_dictionary[image_type]:
if image_identifying_string in file:
row_data_as_dictionary[image_type] = file

paths_dataframe = pd.concat(
[paths_dataframe, pd.DataFrame(row_data_as_dictionary, index=[0])],
# Start the row with the patient ID; image and mask paths are added below.
row_data = {"id": patient_id}

# Path to patient data.
patient_dir = os.path.join(base_dir, patient_id)
patient_files = get_files_list(patient_dir)

# Map file paths to their respective columns.
for image_type, identifying_strings in dataset_info["images"].items():
matching_file = next(
(file for file in patient_files
if any(s in file for s in identifying_strings)), None
)
if matching_file:
row_data[image_type] = matching_file

# Add the mask file if in training mode.
if train_or_test == "train":
mask_file = next(
(file for file in patient_files
if any(s in file for s in dataset_info["mask"])), None
)
if mask_file:
row_data["mask"] = mask_file

# Append the row to the DataFrame.
df = pd.concat(
[df, pd.DataFrame([row_data], columns=columns)],
ignore_index=True
)
return paths_dataframe

return df
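
The rewrite above swaps the old nested loops over filename_dictionary for a single next()/any() lookup per image type. A self-contained sketch of that matching pattern, with invented file names and a minimal dataset_info-style dict (the real MIST dataset JSON may carry more fields):

# Invented example data for illustration only.
dataset_info = {"images": {"t1": ["t1"], "t2": ["t2"]}}
patient_files = [
    "/data/train/patient_001/patient_001_t1.nii.gz",
    "/data/train/patient_001/patient_001_t2.nii.gz",
    "/data/train/patient_001/mask.nii.gz",
]

row_data = {"id": "patient_001"}
for image_type, identifying_strings in dataset_info["images"].items():
    # next() returns the first file whose path contains any of the
    # identifying strings, or None when nothing matches.
    matching_file = next(
        (file for file in patient_files
         if any(s in file for s in identifying_strings)),
        None,
    )
    if matching_file:
        row_data[image_type] = matching_file

print(row_data)
# {'id': 'patient_001', 't1': '.../patient_001_t1.nii.gz',
#  't2': '.../patient_001_t2.nii.gz'}
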


@@ -256,46 +266,70 @@ def add_folds_to_df(df, n_splits=5):
patient ID is the fold that the patient belongs to the test set for
that given fold.
"""
# Get folds for k-fold cross validation.
kfold = KFold(
n_splits=n_splits,
shuffle=True,
random_state=42
)
# Initialize the KFold object.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

splits = kfold.split(list(range(len(df))))
# Initialize an empty 'fold' column.
df.insert(loc=1, column="fold", value=[None] * len(df))

# Extract folds so that users can specify folds to train on.
test_splits = []
for split in splits:
test_splits.append(split[1])
# Assign fold numbers.
for fold_number, (_, test_indices) in enumerate(kfold.split(df)):
df.loc[test_indices, "fold"] = fold_number

folds = {}
for i in range(n_splits):
for j in range(len(df)):
if j in test_splits[i]:
folds[j] = i
# Sort the dataframe by the 'fold' column.
df = df.sort_values("fold").reset_index(drop=True)

folds = pd.Series(data=folds, index=list(folds.keys()), name="fold")
df.insert(loc=1, column="fold", value=folds)
df = df.sort_values("fold", ignore_index=True)
return df
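
The new fold assignment above replaces the manual split bookkeeping with enumerate() over kfold.split(). A runnable sketch with a toy dataframe (the patient IDs here are invented):

import pandas as pd
from sklearn.model_selection import KFold

# Toy stand-in for the paths dataframe.
df = pd.DataFrame({"id": [f"patient_{i}" for i in range(10)]})
df.insert(loc=1, column="fold", value=[None] * len(df))

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Each row is tagged with the index of the split where it appears in
# the test set.
for fold_number, (_, test_indices) in enumerate(kfold.split(df)):
    df.loc[test_indices, "fold"] = fold_number

df = df.sort_values("fold").reset_index(drop=True)
print(df)
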


def convert_dict_to_df(patients):
def convert_dict_to_df(patients: Dict[str, Dict[str, str]]) -> pd.DataFrame:
"""Converts a nested dictionary of patient data into a pandas DataFrame.
Args:
patients: A dictionary where each key is a patient ID, and each value
is another dictionary containing image keys and corresponding values.
Returns:
df: A DataFrame with columns for patient IDs and image data.
Raises:
ValueError: If the input dictionary is empty or if any patient data keys
do not match the keys from the first patient.
"""
# Check if the patients dictionary is empty.
if not patients:
raise ValueError(
"The 'patients' dictionary is empty. Cannot convert to DataFrame."
)

# Initialize columns with 'id'.
columns = ["id"]

ids = list(patients.keys())
image_keys = list(patients[ids[0]].keys())
columns += image_keys
# Get the list of patient IDs.
patient_ids = list(patients.keys())

# Get image keys from the first patient's data.
first_patient_images = list(patients[patient_ids[0]].keys())
columns.extend(first_patient_images)

# Create an empty DataFrame with the desired columns.
df = pd.DataFrame(columns=columns)

for i in range(len(patients)):
row_dict = {"id": ids[i]}
for image in image_keys:
row_dict[image] = patients[ids[i]][image]
df = pd.concat([df, pd.DataFrame(row_dict, index=[0])], ignore_index=True)
# Populate the DataFrame with patient data.
for patient_id in patient_ids:
    # Check that the keys in the current patient's data match the image
    # keys from the first patient.
    current_keys = set(patients[patient_id].keys())
    if current_keys != set(first_patient_images):
        raise ValueError(
            f"Data keys for patient '{patient_id}' do not match expected"
            " keys."
        )

    # Build the row from the patient ID plus the patient's image entries.
    patient_data = {"id": patient_id, **patients[patient_id]}
    df = pd.concat([df, pd.DataFrame([patient_data])], ignore_index=True)

return df
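
A possible follow-up beyond this commit: both loops above grow the DataFrame with pd.concat, which copies the whole frame on every iteration and is quadratic in the number of patients. A linear-time alternative for convert_dict_to_df would collect the rows first and build the frame once (a sketch, assuming the same key validation has already run):

# 'patients' and 'columns' are the same objects as in convert_dict_to_df.
rows = [
    {"id": patient_id, **patient_images}
    for patient_id, patient_images in patients.items()
]
df = pd.DataFrame(rows, columns=columns)
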


