From b41c5eec65dff50471ed15a89a6542a5f0dfe595 Mon Sep 17 00:00:00 2001 From: Steven Goldenberg Date: Mon, 18 Nov 2024 09:41:38 -0500 Subject: [PATCH] Ran black formatter Updates 30 files for formatting purposes. --- jlab_datascience_toolkit/analysis/__init__.py | 8 +- .../analysis/multiclass_analysis_v0.py | 53 ++++-- .../core/jdst_analysis.py | 9 +- .../core/jdst_data_parser.py | 12 +- .../core/jdst_data_prep.py | 10 +- jlab_datascience_toolkit/core/jdst_model.py | 11 +- jlab_datascience_toolkit/core/jdst_module.py | 26 +-- jlab_datascience_toolkit/core/jdst_trainer.py | 11 +- .../data_parser/__init__.py | 14 +- .../data_parser/numpy_parser.py | 125 +++++++------ .../data_parser/parser_to_dataframe.py | 77 ++++---- .../data_prep/__init__.py | 18 +- .../data_prep/numpy_minmax_scaler.py | 171 ++++++++++-------- .../data_prep/split_dataframe_v0.py | 53 +++--- jlab_datascience_toolkit/models/__init__.py | 8 +- .../models/keras_mlp_v0.py | 61 ++++--- jlab_datascience_toolkit/trainers/__init__.py | 8 +- .../trainers/keras_trainer_v0.py | 78 ++++---- jlab_datascience_toolkit/utils/io.py | 34 ++-- .../utils/parser_utilities.py | 53 +++--- .../utils/registration.py | 35 ++-- .../workflows/example_workflow_v0.py | 66 ++++--- setup.py | 12 +- utests/utest_csv_parser.py | 112 ++++++------ utests/utest_io_utils.py | 37 ++-- utests/utest_keras_mlp_v0.py | 24 ++- utests/utest_numpy_minmax_scaler.py | 63 ++++--- utests/utest_numpy_parser.py | 68 ++++--- utests/utest_pandas_standard_scaler.py | 87 ++++++--- utests/utest_split_dataframe_v0.py | 18 +- 30 files changed, 787 insertions(+), 575 deletions(-) diff --git a/jlab_datascience_toolkit/analysis/__init__.py b/jlab_datascience_toolkit/analysis/__init__.py index 7bf60b3..458ff62 100644 --- a/jlab_datascience_toolkit/analysis/__init__.py +++ b/jlab_datascience_toolkit/analysis/__init__.py @@ -1,6 +1,10 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules +from jlab_datascience_toolkit.utils.registration import ( + register, + make, + list_registered_modules, +) register( id="MultiClassClassificationAnalysis_v0", - entry_point="jlab_datascience_toolkit.analysis.multiclass_analysis_v0:Analysis" + entry_point="jlab_datascience_toolkit.analysis.multiclass_analysis_v0:Analysis", ) diff --git a/jlab_datascience_toolkit/analysis/multiclass_analysis_v0.py b/jlab_datascience_toolkit/analysis/multiclass_analysis_v0.py index dedc3d9..1028a65 100644 --- a/jlab_datascience_toolkit/analysis/multiclass_analysis_v0.py +++ b/jlab_datascience_toolkit/analysis/multiclass_analysis_v0.py @@ -8,24 +8,49 @@ class Analysis: def __init__(self, configs: dict): self.configs = configs - def run(self, y_true, y_pred, labels: np.ndarray = None, target_names: np.ndarray = None, sample_weight: np.ndarray = None, logdir: str = None) -> list: + def run( + self, + y_true, + y_pred, + labels: np.ndarray = None, + target_names: np.ndarray = None, + sample_weight: np.ndarray = None, + logdir: str = None, + ) -> list: ans = [] for submodule in self.configs["submodules"]: submodule_type = submodule["type"] submodule_configs = submodule.get("configs", {}) if submodule_type == "confusion_matrix": - cm = confusion_matrix(y_true, y_pred, labels=labels, sample_weight=sample_weight, **submodule_configs) + cm = confusion_matrix( + y_true, + y_pred, + labels=labels, + sample_weight=sample_weight, + **submodule_configs, + ) ans.append(cm) - if logdir: np.save(os.path.join(logdir, 'confusion_matrix.npy'), cm) + if logdir: + np.save(os.path.join(logdir, "confusion_matrix.npy"), cm) elif submodule_type == "accuracy_score": - acc = accuracy_score(y_true, y_pred, sample_weight=sample_weight, **submodule_configs) + acc = accuracy_score( + y_true, y_pred, sample_weight=sample_weight, **submodule_configs + ) ans.append(acc) - if logdir: np.save(os.path.join(logdir, 'accuracy_score.npy'), acc) - elif submodule_type == 'classification_report': - cr = classification_report(y_true, y_pred, labels=labels, target_names=target_names, sample_weight=sample_weight, **submodule_configs) + if logdir: + np.save(os.path.join(logdir, "accuracy_score.npy"), acc) + elif submodule_type == "classification_report": + cr = classification_report( + y_true, + y_pred, + labels=labels, + target_names=target_names, + sample_weight=sample_weight, + **submodule_configs, + ) ans.append(cr) if logdir and isinstance(cr, dict): - for metric in ['precision', 'recall', 'f1-score']: + for metric in ["precision", "recall", "f1-score"]: metric_list = [] for k, v in cr.items(): if isinstance(v, dict) and (metric in v.keys()): @@ -34,16 +59,18 @@ def run(self, y_true, y_pred, labels: np.ndarray = None, target_names: np.ndarra fig, ax = plt.subplots() ax.bar( [tup[0] for tup in metric_list], - [tup[1] for tup in metric_list] + [tup[1] for tup in metric_list], ) ax.set_title(metric) fig.tight_layout() fig.savefig( - os.path.join(logdir, f'{metric}.jpg'), + os.path.join(logdir, f"{metric}.jpg"), transparent=True, - dpi=300 + dpi=300, ) plt.close(fig=fig) else: - raise NameError('Unsupported submodule type in Multi-Class Analysis Module !') - return ans \ No newline at end of file + raise NameError( + "Unsupported submodule type in Multi-Class Analysis Module !" + ) + return ans diff --git a/jlab_datascience_toolkit/core/jdst_analysis.py b/jlab_datascience_toolkit/core/jdst_analysis.py index 6c611d0..0ab7a9a 100644 --- a/jlab_datascience_toolkit/core/jdst_analysis.py +++ b/jlab_datascience_toolkit/core/jdst_analysis.py @@ -1,12 +1,13 @@ from jlab_datascience_toolkit.core.jdst_module import JDSTModule from abc import ABC, abstractmethod -class JDSTAnalysis(JDSTModule,ABC): - ''' + +class JDSTAnalysis(JDSTModule, ABC): + """ Base class for the post-training analysis. This class inherits from the module base class. - ''' + """ # Run the analysis: @abstractmethod def run(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/core/jdst_data_parser.py b/jlab_datascience_toolkit/core/jdst_data_parser.py index 2e7ef1c..b6b07a5 100644 --- a/jlab_datascience_toolkit/core/jdst_data_parser.py +++ b/jlab_datascience_toolkit/core/jdst_data_parser.py @@ -1,17 +1,17 @@ from jlab_datascience_toolkit.core.jdst_module import JDSTModule from abc import ABC, abstractmethod -class JDSTDataParser(JDSTModule,ABC): - ''' +class JDSTDataParser(JDSTModule, ABC): + """ Base class for data parsing. This class inherits from the module base class. - ''' - + """ + # Load and save the data: @abstractmethod def load_data(self): raise NotImplementedError - + @abstractmethod def save_data(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/core/jdst_data_prep.py b/jlab_datascience_toolkit/core/jdst_data_prep.py index c23976b..adfd8ed 100644 --- a/jlab_datascience_toolkit/core/jdst_data_prep.py +++ b/jlab_datascience_toolkit/core/jdst_data_prep.py @@ -1,11 +1,11 @@ from jlab_datascience_toolkit.core.jdst_module import JDSTModule from abc import ABC, abstractmethod -class JDSTDataPrep(JDSTModule,ABC): - ''' +class JDSTDataPrep(JDSTModule, ABC): + """ Base class for data preparation. This class inherits from the module base class. - ''' + """ # Save the data, if required # This might be helpful, if the underlying data preperation is a computational intensive operation @@ -13,7 +13,7 @@ class JDSTDataPrep(JDSTModule,ABC): @abstractmethod def save_data(self): raise NotImplementedError - + # Run the data preparation: @abstractmethod def run(self): @@ -22,4 +22,4 @@ def run(self): # Reverse the data preparation (if possible): @abstractmethod def reverse(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/core/jdst_model.py b/jlab_datascience_toolkit/core/jdst_model.py index ca88d7f..ffdb0f3 100644 --- a/jlab_datascience_toolkit/core/jdst_model.py +++ b/jlab_datascience_toolkit/core/jdst_model.py @@ -1,12 +1,13 @@ from jlab_datascience_toolkit.core.jdst_module import JDSTModule from abc import ABC, abstractmethod -class JDSTModel(JDSTModule,ABC): - ''' + +class JDSTModel(JDSTModule, ABC): + """ Base class for the model. This class inherits from the module base class. - ''' - + """ + # Get a prediction: @abstractmethod def predict(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/core/jdst_module.py b/jlab_datascience_toolkit/core/jdst_module.py index 97eee29..31ca1c7 100644 --- a/jlab_datascience_toolkit/core/jdst_module.py +++ b/jlab_datascience_toolkit/core/jdst_module.py @@ -1,36 +1,36 @@ from abc import ABC, abstractmethod -class JDSTModule(ABC): - ''' - Base class for any module that is written for the JLab Data Science Toolkit. The functions defined here have to be implemented in +class JDSTModule(ABC): + """ + Base class for any module that is written for the JLab Data Science Toolkit. The functions defined here have to be implemented in any new module that is written - ''' - + """ + # Initialize: - def __init__(self,**kwargs): - self.module_name = "" # --> Define the name of the module + def __init__(self, **kwargs): + self.module_name = "" # --> Define the name of the module - # Get module info: Just briefly describe what this module is doing, + # Get module info: Just briefly describe what this module is doing, # what are the inputs and what is returned? @abstractmethod def get_info(self): raise NotImplementedError - + # Load and save configuration files which run the module: @abstractmethod def load_config(self): raise NotImplementedError - + @abstractmethod def save_config(self): raise NotImplementedError - + # Load and save for checkpointing (i.e. capture state of module) @abstractmethod def load(self): raise NotImplementedError - + @abstractmethod def save(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/core/jdst_trainer.py b/jlab_datascience_toolkit/core/jdst_trainer.py index 73db2c9..b5b95a8 100644 --- a/jlab_datascience_toolkit/core/jdst_trainer.py +++ b/jlab_datascience_toolkit/core/jdst_trainer.py @@ -1,12 +1,13 @@ from jlab_datascience_toolkit.core.jdst_module import JDSTModule from abc import ABC, abstractmethod -class JDSTTrainer(JDSTModule,ABC): - ''' + +class JDSTTrainer(JDSTModule, ABC): + """ Base class for the Trainer. This class inherits from the module base class. - ''' - + """ + # Get a prediction: @abstractmethod def fit(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/data_parser/__init__.py b/jlab_datascience_toolkit/data_parser/__init__.py index 4bb6bae..260e80b 100644 --- a/jlab_datascience_toolkit/data_parser/__init__.py +++ b/jlab_datascience_toolkit/data_parser/__init__.py @@ -1,14 +1,18 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules +from jlab_datascience_toolkit.utils.registration import ( + register, + make, + list_registered_modules, +) register( id="NumpyParser_v0", - entry_point="jlab_datascience_toolkit.data_parser.numpy_parser:NumpyParser" + entry_point="jlab_datascience_toolkit.data_parser.numpy_parser:NumpyParser", ) from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser register( - id='CSVParser_v0', + id="CSVParser_v0", entry_point="jlab_datascience_toolkit.data_parser.parser_to_dataframe:Parser2DataFrame", - kwargs={'registry_config': {'file_format': 'csv'}} -) \ No newline at end of file + kwargs={"registry_config": {"file_format": "csv"}}, +) diff --git a/jlab_datascience_toolkit/data_parser/numpy_parser.py b/jlab_datascience_toolkit/data_parser/numpy_parser.py index 9a0f718..ecaa948 100644 --- a/jlab_datascience_toolkit/data_parser/numpy_parser.py +++ b/jlab_datascience_toolkit/data_parser/numpy_parser.py @@ -4,6 +4,7 @@ import logging import inspect + class NumpyParser(JDSTDataParser): """Numpy data parser that reads in strings of file paths and returns a single .npy file @@ -12,7 +13,7 @@ class NumpyParser(JDSTDataParser): ii) Combine single .npy files into one Input(s): - i) Full path to .yaml configuration file + i) Full path to .yaml configuration file ii) Optional: User configuration, i.e. a python dict with additonal / alternative settings Output(s): @@ -20,104 +21,122 @@ class NumpyParser(JDSTDataParser): """ # Initialize: - #********************************************* - def __init__(self,path_to_cfg,user_config={}): + # ********************************************* + def __init__(self, path_to_cfg, user_config={}): # Set the name specific to this module: self.module_name = "numpy_parser" # Load the configuration: - self.config = self.load_config(path_to_cfg,user_config) - + self.config = self.load_config(path_to_cfg, user_config) + # Save this config, if a path is provided: - if 'store_cfg_loc' in self.config: - self.save_config(self.config['store_cfg_loc']) + if "store_cfg_loc" in self.config: + self.save_config(self.config["store_cfg_loc"]) # Run sanity check(s): # i) Make sure that the provide data path(s) are list objects: - if isinstance(self.config['data_loc'],list) == False: - logging.error(">>> " + self.module_name +": The data path(s) must be a list object, e.g. data_loc: [path1,path2,...] <<<") - #********************************************* + if isinstance(self.config["data_loc"], list) == False: + logging.error( + ">>> " + + self.module_name + + ": The data path(s) must be a list object, e.g. data_loc: [path1,path2,...] <<<" + ) + + # ********************************************* # Provide information about this module: - #********************************************* + # ********************************************* def get_info(self): print(inspect.getdoc(self)) - #********************************************* + + # ********************************************* # Handle configurations: - #********************************************* + # ********************************************* # Load the config: - def load_config(self,path_to_cfg,user_config): - with open(path_to_cfg, 'r') as file: + def load_config(self, path_to_cfg, user_config): + with open(path_to_cfg, "r") as file: cfg = yaml.safe_load(file) - + # Overwrite config with user settings, if provided try: if bool(user_config): - #++++++++++++++++++++++++ - for key in user_config: - cfg[key] = user_config[key] - #++++++++++++++++++++++++ + # ++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + # ++++++++++++++++++++++++ except: - logging.exception(">>> " + self.module_name +": Invalid user config. Please make sure that a dictionary is provided <<<") + logging.exception( + ">>> " + + self.module_name + + ": Invalid user config. Please make sure that a dictionary is provided <<<" + ) return cfg - - #----------------------------- + + # ----------------------------- # Store the config: - def save_config(self,path_to_config): - with open(path_to_config, 'w') as file: - yaml.dump(self.config, file) - #********************************************* - + def save_config(self, path_to_config): + with open(path_to_config, "w") as file: + yaml.dump(self.config, file) + + # ********************************************* + # Load .npy file(s): - #********************************************* + # ********************************************* # Load a single file: - def load_single_file(self,path_to_file): + def load_single_file(self, path_to_file): try: - return np.load(path_to_file).astype(self.config['dtype']) + return np.load(path_to_file).astype(self.config["dtype"]) except: logging.exception(">>> " + self.module_name + ": File does not exist! <<<") - #----------------------------- + # ----------------------------- # Load multiple files which represent the final data: def load_data(self): try: collected_data = [] - #+++++++++++++++++++++ - for path in self.config['data_loc']: + # +++++++++++++++++++++ + for path in self.config["data_loc"]: collected_data.append(self.load_single_file(path)) - #+++++++++++++++++++++ + # +++++++++++++++++++++ - return np.concatenate(collected_data,axis=self.config['event_axis']) + return np.concatenate(collected_data, axis=self.config["event_axis"]) except: - logging.exception(">>> " + self.module_name + ": Please check the provided data path which must be a list. <<<") - #********************************************* - + logging.exception( + ">>> " + + self.module_name + + ": Please check the provided data path which must be a list. <<<" + ) + + # ********************************************* + # Save the data: - #********************************************* - def save_data(self,data): + # ********************************************* + def save_data(self, data): try: - os.makedirs(self.config['data_store_loc'],exist_ok=True) - np.save(self.config['data_store_loc'],data) + os.makedirs(self.config["data_store_loc"], exist_ok=True) + np.save(self.config["data_store_loc"], data) except: - logging.exception(">>> " + self.module_name + ": Please provide a valid name for storing the data in .npy format. <<<") - #********************************************* - - # Module checkpointing: Not implemented yet and maybe not + logging.exception( + ">>> " + + self.module_name + + ": Please provide a valid name for storing the data in .npy format. <<<" + ) + + # ********************************************* + + # Module checkpointing: Not implemented yet and maybe not # necessary, ao we leave these functions blank for now - #********************************************* + # ********************************************* def load(self): return 0 - #----------------------------- + # ----------------------------- def save(self): return 0 - #********************************************* - - - + # ********************************************* diff --git a/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py index 71b24d4..0e83e3a 100644 --- a/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py +++ b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py @@ -8,25 +8,23 @@ import os from typing import Union -parser_log = logging.getLogger('Parser Logger') +parser_log = logging.getLogger("Parser Logger") # Supported file formats pandas_read_functions = dict( - csv=pd.read_csv, - feather=pd.read_feather, - json=pd.read_json, - pickle=pd.read_pickle + csv=pd.read_csv, feather=pd.read_feather, json=pd.read_json, pickle=pd.read_pickle ) + class Parser2DataFrame(JDSTDataParser): """Reads a list of files and concatenates them in a Pandas DataFrame. - Intialization arguments: + Intialization arguments: `config: dict` - Optional configuration keys: + Optional configuration keys: `filepaths: str | list[str]` - Paths to files the module should parse. Defaults to `[]` which + Paths to files the module should parse. Defaults to `[]` which produces a warning when load_data() is called. `file_format: str = 'csv', Format of files to parse. Currently supports csv, feather, json @@ -73,49 +71,51 @@ def __init__(self, config: dict = None, registry_config: dict = None): # Set default config self.config = dict( - filepaths=[], - file_format='csv', - read_kwargs = {}, - concat_kwargs = {}, + filepaths=[], + file_format="csv", + read_kwargs={}, + concat_kwargs={}, ) # First update defaults with registry_configuration if registry_config is not None: - parser_log.debug(f'Updating defaults with: {registry_config}') + parser_log.debug(f"Updating defaults with: {registry_config}") self.config.update(registry_config) # Now update configuration with new (user) configuration if config is not None: - parser_log.debug(f'Updating registered config with: {config}') + parser_log.debug(f"Updating registered config with: {config}") self.config.update(config) # To handle strings and lists of strings, we convert the former here - if isinstance(self.config['filepaths'], str): - self.config['filepaths'] = [self.config['filepaths']] + if isinstance(self.config["filepaths"], str): + self.config["filepaths"] = [self.config["filepaths"]] self.setup() @property def name(self): - return 'Parser2DataFrame_v0' + return "Parser2DataFrame_v0" def setup(self): # Set the correct reading function here self.read_function = pandas_read_functions.get( - self.config['file_format'].lower(), None) + self.config["file_format"].lower(), None + ) if self.read_function is None: parser_log.error( - f'File format {self.config["file_format"]}' - 'is not currently supported.') + f'File format {self.config["file_format"]}' + "is not currently supported." + ) raise ValueError def get_info(self): - """ Prints the docstring for the Parser2DataFrame module""" + """Prints the docstring for the Parser2DataFrame module""" print(inspect.getdoc(self)) def load(self, path: str): - """ Load the entire module state from `path` + """Load the entire module state from `path` Args: path (str): Path to folder containing module files. @@ -134,7 +134,7 @@ def save(self, path: str): self.save_config(save_dir) def load_data(self) -> pd.DataFrame: - """ Loads all files listed in `config['filepaths']` + """Loads all files listed in `config['filepaths']` read_kwargs are passed to the appropriate pd.read_{file_format} function concat_kwargs are passed to pd.concat() after all files are read @@ -142,25 +142,22 @@ def load_data(self) -> pd.DataFrame: pd.DataFrame: A single DataFrame containing concatenated data """ data_list = [] - for file in self.config['filepaths']: - parser_log.debug(f'Loading {file} ...') - data = self.read_function( - file, - **self.config['read_kwargs']) + for file in self.config["filepaths"]: + parser_log.debug(f"Loading {file} ...") + data = self.read_function(file, **self.config["read_kwargs"]) data_list.append(data) # Check for empty data and return nothing if empty if not data_list: parser_log.warning( - 'load_data() returning None. This is probably not what you ' - 'wanted. Ensure that your configuration includes the key ' - '"filepaths"') - return - - output = pd.concat( - data_list, - **self.config['concat_kwargs']) - + "load_data() returning None. This is probably not what you " + "wanted. Ensure that your configuration includes the key " + '"filepaths"' + ) + return + + output = pd.concat(data_list, **self.config["concat_kwargs"]) + return output def load_config(self, path: Union[Path, str]): @@ -168,14 +165,14 @@ def load_config(self, path: Union[Path, str]): self.setup() def save_config(self, path: Union[Path, str], overwrite=False): - """ Saves this modules configuration to the file specified by path + """Saves this modules configuration to the file specified by path If path is a directory, we save the configuration as config.yaml Args: - path (Path | str): Location for saved configuration. Either a filename or directory is + path (Path | str): Location for saved configuration. Either a filename or directory is acceptable. """ save_yaml_config(self.config, path, overwrite) - + def save_data(self): return super().save_data() diff --git a/jlab_datascience_toolkit/data_prep/__init__.py b/jlab_datascience_toolkit/data_prep/__init__.py index 40dab1b..0bfd0bb 100644 --- a/jlab_datascience_toolkit/data_prep/__init__.py +++ b/jlab_datascience_toolkit/data_prep/__init__.py @@ -1,18 +1,22 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules +from jlab_datascience_toolkit.utils.registration import ( + register, + make, + list_registered_modules, +) register( id="NumpyMinMaxScaler_v0", - entry_point="jlab_datascience_toolkit.data_prep.numpy_minmax_scaler:NumpyMinMaxScaler" + entry_point="jlab_datascience_toolkit.data_prep.numpy_minmax_scaler:NumpyMinMaxScaler", ) from jlab_datascience_toolkit.data_prep.numpy_minmax_scaler import NumpyMinMaxScaler register( - id = "PandasStandardScaler_v0", - entry_point="jlab_datascience_toolkit.data_prep.pandas_standard_scaler:PandasStandardScaler" + id="PandasStandardScaler_v0", + entry_point="jlab_datascience_toolkit.data_prep.pandas_standard_scaler:PandasStandardScaler", ) register( - id = "SplitDataFrame_v0", - entry_point="jlab_datascience_toolkit.data_prep.split_dataframe_v0:SplitDataFrame" -) \ No newline at end of file + id="SplitDataFrame_v0", + entry_point="jlab_datascience_toolkit.data_prep.split_dataframe_v0:SplitDataFrame", +) diff --git a/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py b/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py index f2995d9..a52815b 100644 --- a/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py +++ b/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py @@ -5,36 +5,44 @@ import yaml import os + class NumpyMinMaxScaler(JDSTDataPrep): # Initialize: - #********************************************* - def __init__(self,path_to_cfg,user_config={}): + # ********************************************* + def __init__(self, path_to_cfg, user_config={}): # Set the name specific to this module: self.module_name = "numpy_minmax_scaler" - + # Load the configuration: - self.config = self.load_config(path_to_cfg,user_config) - + self.config = self.load_config(path_to_cfg, user_config) + # Save this config, if a path is provided: - if 'store_cfg_loc' in self.config: - self.save_config(self.config['store_cfg_loc']) + if "store_cfg_loc" in self.config: + self.save_config(self.config["store_cfg_loc"]) # Set up the scaler: try: - self.scaler = MinMaxScaler(self.config['feature_range']) + self.scaler = MinMaxScaler(self.config["feature_range"]) except: - logging.exception(">>> " + self.module_name + f": Invalid feature range: {self.config['feature_range']}. Must provide a tuple. <<<") - #********************************************* + logging.exception( + ">>> " + + self.module_name + + f": Invalid feature range: {self.config['feature_range']}. Must provide a tuple. <<<" + ) + + # ********************************************* # Provide information about this module: - #********************************************* + # ********************************************* def get_info(self): print(" ") print("*** Info: NumpyMinMaxScaler ***") print("Input(s):") - print("i) Full path to .yaml configuration file ") - print("ii) Optional: User configuration, i.e. a python dict with additonal / alternative settings") + print("i) Full path to .yaml configuration file ") + print( + "ii) Optional: User configuration, i.e. a python dict with additonal / alternative settings" + ) print("iii) Numpy data") print("What this module does:") print("i) Scale input data with respect to a specified range") @@ -43,107 +51,120 @@ def get_info(self): print("i) Scaled .npy data") print("ii) Optional: unscaled .npy data") print("Note(s):") - print("i) The scaler will (by default) be fitted to the data and the transform it. To disable the fitting, do: run(data,disable_fit=True)") + print( + "i) The scaler will (by default) be fitted to the data and the transform it. To disable the fitting, do: run(data,disable_fit=True)" + ) print("*** Info: NumpyMinMaxScaler ***") print(" ") - #********************************************* + + # ********************************************* # Handle configurations: - #********************************************* + # ********************************************* # Load the config: - def load_config(self,path_to_cfg,user_config): - with open(path_to_cfg, 'r') as file: + def load_config(self, path_to_cfg, user_config): + with open(path_to_cfg, "r") as file: cfg = yaml.safe_load(file) - + # Overwrite config with user settings, if provided try: if bool(user_config): - #++++++++++++++++++++++++ - for key in user_config: - cfg[key] = user_config[key] - #++++++++++++++++++++++++ + # ++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + # ++++++++++++++++++++++++ except: - logging.exception(">>> " + self.module_name +": Invalid user config. Please make sure that a dictionary is provided <<<") + logging.exception( + ">>> " + + self.module_name + + ": Invalid user config. Please make sure that a dictionary is provided <<<" + ) return cfg - - #----------------------------- + + # ----------------------------- # Store the config: - def save_config(self,path_to_config): - with open(path_to_config, 'w') as file: - yaml.dump(self.config, file) - #********************************************* - + def save_config(self, path_to_config): + with open(path_to_config, "w") as file: + yaml.dump(self.config, file) + + # ********************************************* + # Run a type chec: - #********************************************* - def type_check(self,data): - if isinstance(data,np.ndarray) == False: + # ********************************************* + def type_check(self, data): + if isinstance(data, np.ndarray) == False: logging.error(">>> " + self.module_name + ": Data is not a numpy array <<<") return False - + return True - #********************************************* - + # ********************************************* - # Run and reverse the scaling: - #********************************************* + # Run and reverse the scaling: + # ********************************************* # Scale: - def run(self,data,disable_fit=False): + def run(self, data, disable_fit=False): # Check if the data-type is a numpy array: if self.type_check(data): - # Do not re-calibrate the scaler, if a fit has already been done: - if disable_fit == True: - return self.scaler.transform(data) + # Do not re-calibrate the scaler, if a fit has already been done: + if disable_fit == True: + return self.scaler.transform(data) - return self.scaler.fit_transform(data) - - #----------------------------- + return self.scaler.fit_transform(data) + + # ----------------------------- # Undo the scaling: - def reverse(self,data): + def reverse(self, data): # Run a type check: if self.type_check(data): - return self.scaler.inverse_transform(data) - #********************************************* + return self.scaler.inverse_transform(data) + + # ********************************************* # Save the data: - #********************************************* - def save_data(self,data): + # ********************************************* + def save_data(self, data): try: - os.makedirs(self.config['data_store_loc'],exist_ok=True) - np.save(self.config['data_store_loc'],data) + os.makedirs(self.config["data_store_loc"], exist_ok=True) + np.save(self.config["data_store_loc"], data) except: - logging.exception(">>> " + self.module_name + ": Please provide a valid name for storing the transformed .npy data <<<") - #********************************************* + logging.exception( + ">>> " + + self.module_name + + ": Please provide a valid name for storing the transformed .npy data <<<" + ) + + # ********************************************* # Module checkpointing: Save and load parameters that are important to this scaler: - #********************************************* + # ********************************************* def load(self): - store_name = self.config['store_loc'] - scaler_min = np.load(store_name+"/numpy_minmax_scaler_min.npy") - scaler_scale = np.load(store_name+"/numpy_minmax_scaler_scale.npy") - scaler_data_min = np.load(store_name+"/numpy_minmax_scaler_data_min.npy") - scaler_data_max = np.load(store_name+"/numpy_minmax_scaler_data_max.npy") + store_name = self.config["store_loc"] + scaler_min = np.load(store_name + "/numpy_minmax_scaler_min.npy") + scaler_scale = np.load(store_name + "/numpy_minmax_scaler_scale.npy") + scaler_data_min = np.load(store_name + "/numpy_minmax_scaler_data_min.npy") + scaler_data_max = np.load(store_name + "/numpy_minmax_scaler_data_max.npy") return { - 'min': scaler_min, - 'scale': scaler_scale, - 'data_min':scaler_data_min, - 'data_max':scaler_data_max + "min": scaler_min, + "scale": scaler_scale, + "data_min": scaler_data_min, + "data_max": scaler_data_max, } - #----------------------------- - + + # ----------------------------- + def save(self): - store_name = self.config['store_loc'] - os.makedirs(store_name,exist_ok=True) + store_name = self.config["store_loc"] + os.makedirs(store_name, exist_ok=True) - np.save(store_name+"/numpy_minmax_scaler_min.npy",self.scaler.min_) - np.save(store_name+"/numpy_minmax_scaler_scale.npy",self.scaler.scale_) - np.save(store_name+"/numpy_minmax_scaler_data_min.npy",self.scaler.data_min_) - np.save(store_name+"/numpy_minmax_scaler_data_max.npy",self.scaler.data_max_) - #********************************************* + np.save(store_name + "/numpy_minmax_scaler_min.npy", self.scaler.min_) + np.save(store_name + "/numpy_minmax_scaler_scale.npy", self.scaler.scale_) + np.save(store_name + "/numpy_minmax_scaler_data_min.npy", self.scaler.data_min_) + np.save(store_name + "/numpy_minmax_scaler_data_max.npy", self.scaler.data_max_) - \ No newline at end of file + # ********************************************* diff --git a/jlab_datascience_toolkit/data_prep/split_dataframe_v0.py b/jlab_datascience_toolkit/data_prep/split_dataframe_v0.py index 65ad53f..f3e9832 100644 --- a/jlab_datascience_toolkit/data_prep/split_dataframe_v0.py +++ b/jlab_datascience_toolkit/data_prep/split_dataframe_v0.py @@ -6,23 +6,28 @@ class SplitDataFrame(JDSTDataPrep): - ''' + """ Splits a given pandas DataFrame by columns (feature_columns & target_columns) and converts them to numpy arrays. Each array is then splitted by rows according to the given rows_fractions (which must add up to one). - ''' + """ + def __init__(self, configs: dict): self.configs = configs - self.feature_columns = configs.get("feature_columns", None) # If None, all columns are considered - self.target_columns = configs.get("target_columns", None) # If None, there will be no target array + self.feature_columns = configs.get( + "feature_columns", None + ) # If None, all columns are considered + self.target_columns = configs.get( + "target_columns", None + ) # If None, there will be no target array self.rows_fractions = configs.get("rows_fractions", [1.0]) self.random_state = configs.get("random_state", None) - assert sum(self.rows_fractions) == 1, 'Fractions must add up to 1 !!!' - + assert sum(self.rows_fractions) == 1, "Fractions must add up to 1 !!!" + @staticmethod def split_by_columns( df: pd.DataFrame, feature_columns: list[str] | str, - target_columns: list[str] | str + target_columns: list[str] | str, ) -> list[np.ndarray]: if feature_columns is None: x = df.to_numpy() @@ -32,9 +37,11 @@ def split_by_columns( y = df.loc[:, target_columns].to_numpy() return [x, y] return [x] - + @staticmethod - def split_array(arr: np.ndarray, idxs: np.ndarray, rows_fractions: list[float]) -> list[np.ndarray]: + def split_array( + arr: np.ndarray, idxs: np.ndarray, rows_fractions: list[float] + ) -> list[np.ndarray]: subarrays = [] start = 0 for i, fraction in enumerate(rows_fractions): @@ -42,38 +49,42 @@ def split_array(arr: np.ndarray, idxs: np.ndarray, rows_fractions: list[float]) end = len(idxs) else: end = start + int(fraction * len(idxs)) - assert end > start, f'Could not split array of shape {arr.shape} with fractions {rows_fractions} !!!' - sub_idxs = idxs[start : end] + assert ( + end > start + ), f"Could not split array of shape {arr.shape} with fractions {rows_fractions} !!!" + sub_idxs = idxs[start:end] subarrays.append(arr[sub_idxs]) start = end return subarrays - + def run(self, df: pd.DataFrame) -> list[np.ndarray]: if self.random_state is not None: np.random.seed(self.random_state) - arrays = self.split_by_columns(df, feature_columns=self.feature_columns, target_columns=self.target_columns) + arrays = self.split_by_columns( + df, feature_columns=self.feature_columns, target_columns=self.target_columns + ) idxs = np.random.permutation(len(df.index)) splitted_arrays = [] for arr in arrays: subarrays = self.split_array(arr, idxs, rows_fractions=self.rows_fractions) splitted_arrays.extend(subarrays) return splitted_arrays - + def get_info(self): """Prints this module's docstring.""" print(inspect.getdoc(self)) - + def save_config(self, path: str): - assert path.endswith('.yaml') + assert path.endswith(".yaml") with open(path, "w") as file: yaml.safe_dump(self.configs, file) - + @staticmethod def load_config(path: str): - assert path.endswith('.yaml') - with open(path, 'r') as file: + assert path.endswith(".yaml") + with open(path, "r") as file: return yaml.safe_load(file) - + def save(self): raise NotImplementedError @@ -84,4 +95,4 @@ def reverse(self): raise NotImplementedError def save_data(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/models/__init__.py b/jlab_datascience_toolkit/models/__init__.py index 856c1b4..30ffe9f 100644 --- a/jlab_datascience_toolkit/models/__init__.py +++ b/jlab_datascience_toolkit/models/__init__.py @@ -1,6 +1,10 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules +from jlab_datascience_toolkit.utils.registration import ( + register, + make, + list_registered_modules, +) register( id="KerasMLP_v0", - entry_point="jlab_datascience_toolkit.models.keras_mlp_v0:KerasMLP" + entry_point="jlab_datascience_toolkit.models.keras_mlp_v0:KerasMLP", ) diff --git a/jlab_datascience_toolkit/models/keras_mlp_v0.py b/jlab_datascience_toolkit/models/keras_mlp_v0.py index 00df426..4470592 100644 --- a/jlab_datascience_toolkit/models/keras_mlp_v0.py +++ b/jlab_datascience_toolkit/models/keras_mlp_v0.py @@ -6,68 +6,71 @@ class KerasMLP(JDSTModel): - ''' + """ Defines an MLP model. self is not a keras.Model itself. Instead, it has a "model" attribute which is a keras.Model. - ''' + """ + def __init__(self, configs: dict): - ''' + """ configs has the following keywords: 1) 'input_dim' 2) 'layers_dicts': List of subdictionaries, each subdictionary contains configs of one of: 2.1) 'layer_type': 'Dense', 'layer_configs': keras Dense layer configs 2.2) 'layer_type': 'Dropout', 'layer_configs': keras Dropout layer configs 2.3) 'layer_type': 'BatchNormalization', 'layer_configs': keras BN layer cnfigs - ''' + """ self.configs = configs - inputs = keras.layers.Input(shape=(configs['input_dim'],)) + inputs = keras.layers.Input(shape=(configs["input_dim"],)) outputs = inputs - for layer_dict in configs['layers_dicts']: - layer_type = layer_dict['layer_type'] - layer_configs = layer_dict.get('layer_configs', {}) - if layer_type == 'Dense': + for layer_dict in configs["layers_dicts"]: + layer_type = layer_dict["layer_type"] + layer_configs = layer_dict.get("layer_configs", {}) + if layer_type == "Dense": outputs = keras.layers.Dense(**layer_configs)(outputs) - elif layer_type == 'Dropout': + elif layer_type == "Dropout": outputs = keras.layers.Dropout(**layer_configs)(outputs) - elif layer_type == 'BatchNormalization': + elif layer_type == "BatchNormalization": outputs = keras.layers.BatchNormalization(**layer_configs)(outputs) else: - raise NameError('Unrecognized layer_type !!!') + raise NameError("Unrecognized layer_type !!!") self.model = keras.models.Model(inputs=inputs, outputs=outputs) def predict(self, x): y = self.model.predict(x) return y - + def get_info(self): """Prints this module's docstring.""" print(inspect.getdoc(self)) - + def load(self, folder_path: str): assert os.path.exists(folder_path) - self.load_model(os.path.join(folder_path, 'model.keras')) - loaded_configs = self.load_config(os.path.join(folder_path, 'configs.yaml')) - assert self.configs == loaded_configs, 'Mismatch between configs with which model was instantiated and loaded configs !!!' - + self.load_model(os.path.join(folder_path, "model.keras")) + loaded_configs = self.load_config(os.path.join(folder_path, "configs.yaml")) + assert ( + self.configs == loaded_configs + ), "Mismatch between configs with which model was instantiated and loaded configs !!!" + def load_model(self, path: str): - assert path.endswith('.keras') + assert path.endswith(".keras") self.model = keras.models.load_model(path) - + @staticmethod def load_config(path: str): - assert path.endswith('.yaml') - with open(path, 'r') as file: + assert path.endswith(".yaml") + with open(path, "r") as file: return yaml.safe_load(file) - + def save(self, folder_path: str): os.makedirs(folder_path, exist_ok=True) - self.save_model(os.path.join(folder_path, 'model.keras')) - self.save_config(os.path.join(folder_path, 'configs.yaml')) + self.save_model(os.path.join(folder_path, "model.keras")) + self.save_config(os.path.join(folder_path, "configs.yaml")) def save_model(self, path: str): - assert path.endswith('.keras') + assert path.endswith(".keras") self.model.save(path) - + def save_config(self, path: str): - assert path.endswith('.yaml') + assert path.endswith(".yaml") with open(path, "w") as file: - yaml.dump(self.configs, file) \ No newline at end of file + yaml.dump(self.configs, file) diff --git a/jlab_datascience_toolkit/trainers/__init__.py b/jlab_datascience_toolkit/trainers/__init__.py index 6eb01c1..b32f9e1 100644 --- a/jlab_datascience_toolkit/trainers/__init__.py +++ b/jlab_datascience_toolkit/trainers/__init__.py @@ -1,6 +1,10 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules +from jlab_datascience_toolkit.utils.registration import ( + register, + make, + list_registered_modules, +) register( id="KerasTrainer_v0", - entry_point="jlab_datascience_toolkit.trainers.keras_trainer_v0:Trainer" + entry_point="jlab_datascience_toolkit.trainers.keras_trainer_v0:Trainer", ) diff --git a/jlab_datascience_toolkit/trainers/keras_trainer_v0.py b/jlab_datascience_toolkit/trainers/keras_trainer_v0.py index 1ddeb5d..8fac3c8 100644 --- a/jlab_datascience_toolkit/trainers/keras_trainer_v0.py +++ b/jlab_datascience_toolkit/trainers/keras_trainer_v0.py @@ -9,9 +9,9 @@ class Trainer(JDSTTrainer): - ''' - Trains a given JDSTModel object "model" that has a keras.Model attribute (i.e., model.model is a keras.Model) on given data and training configurations. - + """ + Trains a given JDSTModel object "model" that has a keras.Model attribute (i.e., model.model is a keras.Model) on given data and training configurations. + Arguments of keras.Model.fit are divided into: 1. Configurations passed in "configs" of Trainer.__init__ method. These are: a) batch_size=None @@ -31,7 +31,7 @@ class Trainer(JDSTTrainer): b) y c) validation_data d) sample_weight - + In addition to the list in (1.), two additional items are part of the training configurations: 1) "loss_configs" 2) "optimizer_configs" @@ -48,76 +48,82 @@ class Trainer(JDSTTrainer): Saves object's configurations to a given path load_config() Satatic method loading configurations from a given path - ''' + """ def __init__(self, configs: dict): self.configs = configs - self.settings = configs.copy() # Must be separate from configs as it can include actual keras callback objects - self.settings.pop('registered_name') + self.settings = ( + configs.copy() + ) # Must be separate from configs as it can include actual keras callback objects + self.settings.pop("registered_name") # 1) Loss - loss_configs = self.settings.pop('loss_configs') - loss_type = loss_configs.pop('loss_type') - if loss_type == 'CategoricalCrossentropy': + loss_configs = self.settings.pop("loss_configs") + loss_type = loss_configs.pop("loss_type") + if loss_type == "CategoricalCrossentropy": self.loss = keras.losses.CategoricalCrossentropy(**loss_configs) - elif loss_type == 'SparseCategoricalCrossentropy': + elif loss_type == "SparseCategoricalCrossentropy": self.loss = keras.losses.SparseCategoricalCrossentropy(**loss_configs) else: - raise NameError(f'Unrecognized loss_type ({loss_type}) !!!') - + raise NameError(f"Unrecognized loss_type ({loss_type}) !!!") + # 2) Optimizer - optimizer_configs = self.settings.pop('optimizer_configs') - optimizer_type = optimizer_configs.pop('optimizer_type') - if optimizer_type == 'Adam': + optimizer_configs = self.settings.pop("optimizer_configs") + optimizer_type = optimizer_configs.pop("optimizer_type") + if optimizer_type == "Adam": self.optimizer = keras.optimizers.Adam(**optimizer_configs) - elif optimizer_type == 'RMSprop': + elif optimizer_type == "RMSprop": self.optimizer = keras.optimizers.RMSprop(**optimizer_configs) else: - raise NameError(f'Unrecognized optimizer_type ({optimizer_type}) !!!') + raise NameError(f"Unrecognized optimizer_type ({optimizer_type}) !!!") # 3) OPTIONAL Callbacks callbacks = [] - for callback_configs in self.settings.get('callbacks', []): + for callback_configs in self.settings.get("callbacks", []): callback_configs = callback_configs.copy() - callback_type = callback_configs.pop('callback_type') - if callback_type == 'EarlyStopping': + callback_type = callback_configs.pop("callback_type") + if callback_type == "EarlyStopping": callbacks.append(keras.callbacks.EarlyStopping(**callback_configs)) - elif callback_type == 'ReduceLROnPlateau': + elif callback_type == "ReduceLROnPlateau": callbacks.append(keras.callbacks.ReduceLROnPlateau(**callback_configs)) else: - raise NameError('Unrecognized callback_type !!!') - self.settings['callbacks'] = None if len(callbacks) == 0 else callbacks + raise NameError("Unrecognized callback_type !!!") + self.settings["callbacks"] = None if len(callbacks) == 0 else callbacks # 4) Check on "class_weight" - if self.settings.get('class_weight', None) is not None: - trainer_log.warning('Make sure indices of classes in "class_weight" match indices of "y" !') - + if self.settings.get("class_weight", None) is not None: + trainer_log.warning( + 'Make sure indices of classes in "class_weight" match indices of "y" !' + ) def fit(self, model, x=None, y=None, validation_data=None, sample_weight=None): model.model.compile(optimizer=self.optimizer, loss=self.loss) history = model.model.fit( - x=x, y=y, validation_data=validation_data, sample_weight=sample_weight, **self.settings + x=x, + y=y, + validation_data=validation_data, + sample_weight=sample_weight, + **self.settings, ) return history - def get_info(self): """Prints this module's docstring.""" print(inspect.getdoc(self)) - + def save_config(self, path: str): - assert path.endswith('.yaml') + assert path.endswith(".yaml") with open(path, "w") as file: yaml.safe_dump(self.configs, file) - + @staticmethod def load_config(path: str): - assert path.endswith('.yaml') - with open(path, 'r') as file: + assert path.endswith(".yaml") + with open(path, "r") as file: return yaml.safe_load(file) - + def save(self): raise NotImplementedError def load(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/jlab_datascience_toolkit/utils/io.py b/jlab_datascience_toolkit/utils/io.py index 14569da..8b3fcc2 100644 --- a/jlab_datascience_toolkit/utils/io.py +++ b/jlab_datascience_toolkit/utils/io.py @@ -5,14 +5,15 @@ import sys from typing import Union -io_log = logging.getLogger('io_log') +io_log = logging.getLogger("io_log") + def save_yaml_config(config: dict, path: Union[str, Path], overwrite: bool = False): - """ Saves configuration dictionary to a yaml file + """Saves configuration dictionary to a yaml file Args: config (dict): Dictionary to save - path (str | Path): Location to save configuration. + path (str | Path): Location to save configuration. If `path` does not exist, it will be created. If `path` is a directory, the configuration will be saved to config.yaml If `path` is a filename, the configuration will be saved to that filename @@ -25,27 +26,30 @@ def save_yaml_config(config: dict, path: Union[str, Path], overwrite: bool = Fal path = Path(path) if path.is_dir(): - io_log.info('path.is_dir() == True') - path = path.joinpath('config.yaml') + io_log.info("path.is_dir() == True") + path = path.joinpath("config.yaml") path.parent.mkdir(exist_ok=True) if path.exists() and not overwrite: - io_log.error(f'File {path} exists without overwrite flag set') - raise FileExistsError('File already exists. Set overwrite=True if you would like to overwrite it.') - - with open(path, 'w') as f: - io_log.info(f'Writing config to {path}') + io_log.error(f"File {path} exists without overwrite flag set") + raise FileExistsError( + "File already exists. Set overwrite=True if you would like to overwrite it." + ) + + with open(path, "w") as f: + io_log.info(f"Writing config to {path}") yaml.safe_dump(config, f) + def load_yaml_config(path: Union[str, Path]): path = Path(path) if path.is_dir(): - path = path.joinpath('config.yaml') - + path = path.joinpath("config.yaml") + if not path.exists(): - io_log.error(f'Configuration file {path} not found.') - raise FileNotFoundError(f'Configuration file {path} not found.') + io_log.error(f"Configuration file {path} not found.") + raise FileNotFoundError(f"Configuration file {path} not found.") - with open(path, 'r') as f: + with open(path, "r") as f: return yaml.safe_load(f) diff --git a/jlab_datascience_toolkit/utils/parser_utilities.py b/jlab_datascience_toolkit/utils/parser_utilities.py index b294ba9..f97581b 100644 --- a/jlab_datascience_toolkit/utils/parser_utilities.py +++ b/jlab_datascience_toolkit/utils/parser_utilities.py @@ -3,39 +3,42 @@ import yaml import pandas as pd + def save_config_to_yaml(config, path): save_path = pathlib.Path(path) os.makedirs(save_path) - with open(save_path.joinpath('config.yaml'), 'w') as f: + with open(save_path.joinpath("config.yaml"), "w") as f: yaml.safe_dump(self.config, f) + def load_yaml_config(path): base_path = Path(path) - with open(base_path.joinpath('config.yaml'), 'r') as f: + with open(base_path.joinpath("config.yaml"), "r") as f: config = yaml.safe_load(f) return config + def read_data_to_pandas(filepaths: list, file_format: str, **kwargs) -> pd.DataFrame: - """ Loads all files listed in filepaths and reads them. - All kwargs other than filepaths and file_format will be passed to the read_function - for its associated file_format - - Returns: - pd.DataFrame: A single DataFrame containing list of dataframes - """ - - # Supported file formats - read_functions = dict( - csv=pd.read_csv, - feather=pd.read_feather, - json=pd.read_json, - pickle=pd.read_pickle - ) - - data_list = [] - read_function = read_functions[file_format] - for file in filepaths: - data = read_function(file, **kwargs) - data_list.append(data) - - return data_list \ No newline at end of file + """Loads all files listed in filepaths and reads them. + All kwargs other than filepaths and file_format will be passed to the read_function + for its associated file_format + + Returns: + pd.DataFrame: A single DataFrame containing list of dataframes + """ + + # Supported file formats + read_functions = dict( + csv=pd.read_csv, + feather=pd.read_feather, + json=pd.read_json, + pickle=pd.read_pickle, + ) + + data_list = [] + read_function = read_functions[file_format] + for file in filepaths: + data = read_function(file, **kwargs) + data_list.append(data) + + return data_list diff --git a/jlab_datascience_toolkit/utils/registration.py b/jlab_datascience_toolkit/utils/registration.py index 27b7441..29a0000 100644 --- a/jlab_datascience_toolkit/utils/registration.py +++ b/jlab_datascience_toolkit/utils/registration.py @@ -6,7 +6,7 @@ def load(name): mod_name, attr_name = name.split(":") - print(f'Attempting to load {mod_name} with {attr_name}') + print(f"Attempting to load {mod_name} with {attr_name}") mod = importlib.import_module(mod_name) fn = getattr(mod, attr_name) return fn @@ -21,11 +21,15 @@ def __init__(self, id, entry_point=None, kwargs=None): def make(self, **kwargs): """Instantiates an instance of data module with appropriate kwargs""" if self.entry_point is None: - module_log.error('Attempting to make deprecated module {}. \ + module_log.error( + "Attempting to make deprecated module {}. \ (HINT: is there a newer registered version \ - of this module?)'.format(self.id)) + of this module?)".format( + self.id + ) + ) raise RuntimeError - + _kwargs = self._kwargs.copy() _kwargs.update(kwargs) if callable(self.entry_point): @@ -43,9 +47,9 @@ def __init__(self): def make(self, path, **kwargs): if len(kwargs) > 0: - module_log.info('Making new module: %s (%s)', path, kwargs) + module_log.info("Making new module: %s (%s)", path, kwargs) else: - module_log.info('Making new module: %s', path) + module_log.info("Making new module: %s", path) module_spec = self.spec(path) module = module_spec.make(**kwargs) @@ -55,14 +59,18 @@ def all(self): return self.module_specs.values() def spec(self, path): - if ':' in path: - mod_name, _sep, id = path.partition(':') + if ":" in path: + mod_name, _sep, id = path.partition(":") try: importlib.import_module(mod_name) except ImportError: - module_log.error('A module ({}) was specified for the module but was not found, \ + module_log.error( + "A module ({}) was specified for the module but was not found, \ make sure the package is installed with `pip install` before \ - calling `module.make()`'.format(mod_name)) + calling `module.make()`".format( + mod_name + ) + ) raise else: @@ -71,12 +79,12 @@ def spec(self, path): try: return self.module_specs[id] except KeyError: - module_log.error('No registered module with id: {}'.format(id)) - raise + module_log.error("No registered module with id: {}".format(id)) + raise def register(self, id, **kwargs): if id in self.module_specs: - module_log.error('Cannot re-register id: {}'.format(id)) + module_log.error("Cannot re-register id: {}".format(id)) raise RuntimeError self.module_specs[id] = ModuleSpec(id, **kwargs) @@ -96,5 +104,6 @@ def make(id, **kwargs): def spec(id): return module_registry.spec(id) + def list_registered_modules(): return list(module_registry.module_specs.keys()) diff --git a/jlab_datascience_toolkit/workflows/example_workflow_v0.py b/jlab_datascience_toolkit/workflows/example_workflow_v0.py index 8f0478f..4e99620 100644 --- a/jlab_datascience_toolkit/workflows/example_workflow_v0.py +++ b/jlab_datascience_toolkit/workflows/example_workflow_v0.py @@ -8,29 +8,39 @@ from jlab_datascience_toolkit.analysis import make as make_analysis -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--cfg_file", type=str, default="../cfgs/defaults/multiclass_cfg.yaml", help='Path to yaml configuration file') - parser.add_argument("--logdir", type=str, default="./", help='Path logging directory. If None, analysis figures will not be saved.') + parser.add_argument( + "--cfg_file", + type=str, + default="../cfgs/defaults/multiclass_cfg.yaml", + help="Path to yaml configuration file", + ) + parser.add_argument( + "--logdir", + type=str, + default="./", + help="Path logging directory. If None, analysis figures will not be saved.", + ) args = parser.parse_args() - args = vars(args) # convert args from argparse.Namespace to dict + args = vars(args) # convert args from argparse.Namespace to dict - with open(args['cfg_file'], 'r') as file: + with open(args["cfg_file"], "r") as file: configs = yaml.safe_load(file) - prep_configs = configs['prep_configs'] - model_configs = configs['model_configs'] - trainer_configs = configs['trainer_configs'] - analysis_configs = configs['analysis_configs'] + prep_configs = configs["prep_configs"] + model_configs = configs["model_configs"] + trainer_configs = configs["trainer_configs"] + analysis_configs = configs["analysis_configs"] # 1) Load Data - df = sns.load_dataset('iris') - classes_list = [(c, i) for i, c in enumerate(df['species'].unique().tolist())] - df['species_int'] = df['species'].map(dict(classes_list)) - + df = sns.load_dataset("iris") + classes_list = [(c, i) for i, c in enumerate(df["species"].unique().tolist())] + df["species_int"] = df["species"].map(dict(classes_list)) + # 2) Split Data - prep = make_prep(prep_configs['registered_name'], configs=prep_configs) + prep = make_prep(prep_configs["registered_name"], configs=prep_configs) x_train, x_val, x_test, y_train, y_val, y_test = prep.run(df) - + # 3) Scaling scaler = StandardScaler() x_train = scaler.fit_transform(x_train) @@ -38,21 +48,25 @@ x_test = scaler.transform(x_test) # 4) Define Model - model = make_model(model_configs['registered_name'], configs=model_configs) - + model = make_model(model_configs["registered_name"], configs=model_configs) + # 5) Train Model - trainer = make_trainer(trainer_configs['registered_name'], configs=trainer_configs) - history = trainer.fit(model=model, x=x_train, y=y_train, validation_data=(x_val, y_val)) + trainer = make_trainer(trainer_configs["registered_name"], configs=trainer_configs) + history = trainer.fit( + model=model, x=x_train, y=y_train, validation_data=(x_val, y_val) + ) # 6) Analyze Model on test dataset - y_pred = model.predict(x_test) # (n_samples, c_classes) - y_pred = y_pred.argmax(axis=1) # (n_samples) - multiclass_ana = make_analysis(analysis_configs["registered_name"], configs=analysis_configs) + y_pred = model.predict(x_test) # (n_samples, c_classes) + y_pred = y_pred.argmax(axis=1) # (n_samples) + multiclass_ana = make_analysis( + analysis_configs["registered_name"], configs=analysis_configs + ) results = multiclass_ana.run( y_test, y_pred, - labels = [tup[1] for tup in classes_list], - target_names = [tup[0] for tup in classes_list], - logdir = args['logdir'] + labels=[tup[1] for tup in classes_list], + target_names=[tup[0] for tup in classes_list], + logdir=args["logdir"], ) - print(results) \ No newline at end of file + print(results) diff --git a/setup.py b/setup.py index 52b5113..2b637a9 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ from setuptools import setup, find_packages setup( - name='jlab_datascience_toolkit', - version='0.1', - description='JLab datascience toolkit for composable workflows', - author='JLab DataScience Department', - author_email='schram@jlab.org, kishan@jlab.org, dianam@jlab.org, dlersch@jlab.org', + name="jlab_datascience_toolkit", + version="0.1", + description="JLab datascience toolkit for composable workflows", + author="JLab DataScience Department", + author_email="schram@jlab.org, kishan@jlab.org, dianam@jlab.org, dlersch@jlab.org", packages=find_packages(), - install_requires=[] + install_requires=[], ) diff --git a/utests/utest_csv_parser.py b/utests/utest_csv_parser.py index 3bc3822..1ee1344 100644 --- a/utests/utest_csv_parser.py +++ b/utests/utest_csv_parser.py @@ -9,10 +9,10 @@ import sys import os -test_log = logging.Logger('test_logger') +test_log = logging.Logger("test_logger") rng = np.random.default_rng(seed=42) -parser_id = 'CSVParser_v0' +parser_id = "CSVParser_v0" class TestCSVParserv0(unittest.TestCase): @@ -24,42 +24,44 @@ def __init__(self, *args, **kwargs): @classmethod def setUpClass(self) -> None: - print('Setting up all tests...') - self.columns = ['R121GMES', 'R122GMES', - 'R123GMES', 'R121GSET', 'R122GSET', 'R123GSET'] - self.path = './csv_parser_utest.csv' + print("Setting up all tests...") + self.columns = [ + "R121GMES", + "R122GMES", + "R123GMES", + "R121GSET", + "R122GSET", + "R123GSET", + ] + self.path = "./csv_parser_utest.csv" self.samples = 100 - data = rng.normal(loc=5, scale=1, size=( - self.samples, len(self.columns))) + data = rng.normal(loc=5, scale=1, size=(self.samples, len(self.columns))) dates = [] - for i in range(1, self.samples+1): - dates.append(np.datetime64( - f'2010-03-24 10:{i//60:02d}:{i % 60:02d}')) + for i in range(1, self.samples + 1): + dates.append(np.datetime64(f"2010-03-24 10:{i//60:02d}:{i % 60:02d}")) test_data = pd.DataFrame(data, columns=self.columns, index=dates) - test_data.index.name = 'Date' + test_data.index.name = "Date" test_data test_data.to_csv(self.path) - self.path2 = './csv_parser_utest2.csv' - data = rng.normal(loc=9, scale=2, size=( - self.samples, len(self.columns))) + self.path2 = "./csv_parser_utest2.csv" + data = rng.normal(loc=9, scale=2, size=(self.samples, len(self.columns))) dates = [] - for i in range(1, self.samples+1): - dates.append(np.datetime64( - f'2010-03-25 09:{i//60:02d}:{i % 60:02d}')) + for i in range(1, self.samples + 1): + dates.append(np.datetime64(f"2010-03-25 09:{i//60:02d}:{i % 60:02d}")) test_data = pd.DataFrame(data, columns=self.columns, index=dates) - test_data.index.name = 'Date' + test_data.index.name = "Date" test_data test_data.to_csv(self.path2) @classmethod def tearDownClass(self) -> None: - print('Removing temporary files...') + print("Removing temporary files...") os.remove(self.path) os.remove(self.path2) - print('Have a good day!') + print("Have a good day!") def setUp(self) -> None: print() @@ -70,70 +72,75 @@ def tearDown(self) -> None: return super().tearDown() def test_no_config(self): - print('*****No Config Test*****\n') + print("*****No Config Test*****\n") parser = make(parser_id) output = parser.load_data() self.assertIsNone(output) def test_string_filepaths(self): - print('*****String Filepaths Test*****\n') + print("*****String Filepaths Test*****\n") parser = make(parser_id, config=dict(filepaths=self.path)) output = parser.load_data() - print('Output Head:\n', output.head()) + print("Output Head:\n", output.head()) - self.assertEqual(output.shape, (self.samples, len(self.columns)+1)) + self.assertEqual(output.shape, (self.samples, len(self.columns) + 1)) def test_one_item_list_filepaths(self): - print('*****One Item List Test*****\n') + print("*****One Item List Test*****\n") parser = make(parser_id, config=dict(filepaths=[self.path])) output = parser.load_data() - print('Output Head:\n', output.head()) - self.assertEqual(output.shape, (self.samples, len(self.columns)+1)) + print("Output Head:\n", output.head()) + self.assertEqual(output.shape, (self.samples, len(self.columns) + 1)) def test_two_filepaths(self): - print('*****Two Filepaths Test*****\n') + print("*****Two Filepaths Test*****\n") parser = make(parser_id, config=dict(filepaths=[self.path, self.path2])) output = parser.load_data() - print('Output Head:\n', output.head()) - print('Output shape:', output.shape) - self.assertEqual(output.shape, (2*self.samples, len(self.columns)+1)) + print("Output Head:\n", output.head()) + print("Output shape:", output.shape) + self.assertEqual(output.shape, (2 * self.samples, len(self.columns) + 1)) def test_usecols_read_arg(self): - print('*****Usecols Read Arg Test*****\n') + print("*****Usecols Read Arg Test*****\n") - two_columns = ['R121GMES', 'R121GSET'] - parser = make(parser_id, config=dict( - filepaths=self.path, read_kwargs=dict(usecols=two_columns))) + two_columns = ["R121GMES", "R121GSET"] + parser = make( + parser_id, + config=dict(filepaths=self.path, read_kwargs=dict(usecols=two_columns)), + ) output = parser.load_data() - print('Output Head:\n', output.head()) + print("Output Head:\n", output.head()) self.assertEqual(output.shape, (self.samples, 2)) self.assertEqual(set(output.columns), set(two_columns)) def test_use_datetime_index(self): - print('*****Use Datetime Index Test*****\n') - - def column_lambda(x): return ('GMES' in x) or (x == 'Date') - read_kwargs = dict(usecols=column_lambda, - index_col='Date', parse_dates=True) - parser = make(parser_id, - config=dict( - filepaths=self.path, read_kwargs=read_kwargs) - ) + print("*****Use Datetime Index Test*****\n") + + def column_lambda(x): + return ("GMES" in x) or (x == "Date") + + read_kwargs = dict(usecols=column_lambda, index_col="Date", parse_dates=True) + parser = make( + parser_id, config=dict(filepaths=self.path, read_kwargs=read_kwargs) + ) output = parser.load_data() - print('Output Head:\n', output.head()) + print("Output Head:\n", output.head()) self.assertEqual(output.shape, (self.samples, 3)) for column in output.columns: - self.assertTrue('GMES' in column) + self.assertTrue("GMES" in column) self.assertIsInstance(output.index, pd.DatetimeIndex) def test_save_load(self): - print('*****Save/Load Test*****\n') + print("*****Save/Load Test*****\n") - parser = make(parser_id, config=dict(filepaths=self.path, read_kwargs={'usecols': self.columns})) + parser = make( + parser_id, + config=dict(filepaths=self.path, read_kwargs={"usecols": self.columns}), + ) output = parser.load_data() - save_path = './temp_parser' + save_path = "./temp_parser" try: parser.save(save_path) new_parser = make(parser_id) @@ -146,9 +153,10 @@ def test_save_load(self): shutil.rmtree(save_path) pass + # Run this file via: python utest_csv_parser_v0.py if __name__ == "__main__": argv = len(sys.argv) > 1 and sys.argv[1] - loglevel = logging.DEBUG if argv == '-v' else logging.WARNING + loglevel = logging.DEBUG if argv == "-v" else logging.WARNING logging.basicConfig(stream=sys.stdout, level=loglevel) unittest.main() diff --git a/utests/utest_io_utils.py b/utests/utest_io_utils.py index 17acfb6..e1ba8b4 100644 --- a/utests/utest_io_utils.py +++ b/utests/utest_io_utils.py @@ -11,9 +11,11 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + def generate_random_string(length): alphanumeric = string.ascii_letters + string.digits - return ''.join(random.choice(alphanumeric) for _ in range(length)) + return "".join(random.choice(alphanumeric) for _ in range(length)) + class TestIOUtils(unittest.TestCase): @@ -24,31 +26,33 @@ def __init__(self, *args, **kwargs): @classmethod def setUpClass(self) -> None: - print('Setting up all tests...') - self.config = {'name': 'test', 'scale': 1, 'list_example': [0.1, 1.2, 2.3]} - self.test_path = Path('./temp_dir_' + generate_random_string(6)) - self.existing_file = self.test_path.joinpath('existing_file.yaml') + print("Setting up all tests...") + self.config = {"name": "test", "scale": 1, "list_example": [0.1, 1.2, 2.3]} + self.test_path = Path("./temp_dir_" + generate_random_string(6)) + self.existing_file = self.test_path.joinpath("existing_file.yaml") @classmethod def tearDownClass(self) -> None: - print('\nHave a good day!') + print("\nHave a good day!") def setUp(self) -> None: - print('\n----------------------------------------------------------------------') + print( + "\n----------------------------------------------------------------------" + ) os.makedirs(self.test_path) - with open(self.existing_file, 'w'): + with open(self.existing_file, "w"): pass return super().setUp() def tearDown(self) -> None: # print('\nEnd of Test') - print('----------------------------------------------------------------------') + print("----------------------------------------------------------------------") shutil.rmtree(self.test_path) return super().tearDown() def test_save_load_with_dir(self): save_yaml_config(self.config, self.test_path) - self.assertTrue(self.test_path.joinpath('config.yaml').exists()) + self.assertTrue(self.test_path.joinpath("config.yaml").exists()) config = load_yaml_config(self.test_path) for k in self.config: self.assertEqual(self.config[k], config[k]) @@ -59,27 +63,28 @@ def test_save_existing_no_overwrite(self): def test_load_not_existing(self): with self.assertRaises(FileNotFoundError): - load_yaml_config(self.test_path.joinpath('no_file_exists_here.yaml')) + load_yaml_config(self.test_path.joinpath("no_file_exists_here.yaml")) def test_save_load_filename(self): - new_filename = self.test_path.joinpath('new_config.yaml') + new_filename = self.test_path.joinpath("new_config.yaml") save_yaml_config(self.config, new_filename) load_yaml_config(new_filename) def test_overwrite_filename(self): - # We will simply try saving the same thing three times. First should succeed, - # second should fail with overwrite==False, third should succeed with overwrite==True - new_filename = self.test_path.joinpath('new_config.yaml') + # We will simply try saving the same thing three times. First should succeed, + # second should fail with overwrite==False, third should succeed with overwrite==True + new_filename = self.test_path.joinpath("new_config.yaml") save_yaml_config(self.config, new_filename) with self.assertRaises(FileExistsError): save_yaml_config(self.config, new_filename) config = self.config.copy() - config['name'] = 'train' + config["name"] = "train" save_yaml_config(config, new_filename, overwrite=True) loaded_config = load_yaml_config(new_filename) for k in config: self.assertEqual(config[k], loaded_config[k]) + # Run this file via: python utest_io_utils.py if __name__ == "__main__": unittest.main() diff --git a/utests/utest_keras_mlp_v0.py b/utests/utest_keras_mlp_v0.py index 1f2159b..f74d7bf 100644 --- a/utests/utest_keras_mlp_v0.py +++ b/utests/utest_keras_mlp_v0.py @@ -11,24 +11,30 @@ def setUp(cls): "registered_name": "KerasMLP_v0", "input_dim": 4, "layers_dicts": [ - {"layer_type": "Dense", "layer_configs": {"units": 10, "activation": "relu"}}, + { + "layer_type": "Dense", + "layer_configs": {"units": 10, "activation": "relu"}, + }, {"layer_type": "BatchNormalization"}, {"layer_type": "Dropout", "layer_configs": {"rate": 0.05}}, - {"layer_type": "Dense", "layer_configs": {"units": 3, "activation": "softmax"}} - ] + { + "layer_type": "Dense", + "layer_configs": {"units": 3, "activation": "softmax"}, + }, + ], } cls.model = make_model(cls.configs["registered_name"], configs=cls.configs) cls.x = np.random.rand(100, 4) - cls.model_folder = './model_folder/' - + cls.model_folder = "./model_folder/" + def test_predict(self): y_pred = self.model.predict(self.x) self.assertTrue(y_pred.shape == (100, 3)) - + def test_save_and_load(self): y_pred_old = self.model.predict(self.x) self.model.save(self.model_folder) - model_new = make_model(self.configs['registered_name'], configs=self.configs) + model_new = make_model(self.configs["registered_name"], configs=self.configs) model_new.load(self.model_folder) y_pred_new = model_new.predict(self.x) self.assertTrue(np.array_equal(y_pred_old, y_pred_new)) @@ -38,5 +44,5 @@ def tearDownClass(cls): shutil.rmtree(cls.model_folder) -if __name__ == '__main__': - unittest.main() \ No newline at end of file +if __name__ == "__main__": + unittest.main() diff --git a/utests/utest_numpy_minmax_scaler.py b/utests/utest_numpy_minmax_scaler.py index d00fcb3..f7332e1 100644 --- a/utests/utest_numpy_minmax_scaler.py +++ b/utests/utest_numpy_minmax_scaler.py @@ -5,12 +5,13 @@ import os import shutil + class UTestNumpyMinMaxScaler(unittest.TestCase): # Initialize: - #***************************************** - def __init__(self,*args, **kwargs): - super(UTestNumpyMinMaxScaler,self).__init__(*args, **kwargs) + # ***************************************** + def __init__(self, *args, **kwargs): + super(UTestNumpyMinMaxScaler, self).__init__(*args, **kwargs) # Get an into: print(" ") @@ -20,37 +21,43 @@ def __init__(self,*args, **kwargs): print("* *") print("***************************************") print(" ") - #***************************************** + + # ***************************************** # Test the min max scaler: - #***************************************** + # ***************************************** def test_drive_numpy_minmax_scaler(self): # Create some data first, that we wish to scale: print("Create test data...") - test_data = np.random.uniform(5.0,10.0,size=(5000,1)) + test_data = np.random.uniform(5.0, 10.0, size=(5000, 1)) print("...done!") print(" ") - #Now load the scaler by defining a user config first: + # Now load the scaler by defining a user config first: print("Load numpy min max scaler...") this_file_loc = os.path.dirname(__file__) - cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/numpy_minmax_scaler_cfg.yaml') - param_store_loc = this_file_loc + '/numpy_minmax_scaler_params' - scaler_cfg = {'feature_range':(-1.0,1.0),'store_loc':param_store_loc} - npy_scaler = preps.make("NumpyMinMaxScaler_v0",path_to_cfg=cfg_loc,user_config=scaler_cfg) - + cfg_loc = os.path.join( + this_file_loc, + "../jlab_datascience_toolkit/cfgs/defaults/numpy_minmax_scaler_cfg.yaml", + ) + param_store_loc = this_file_loc + "/numpy_minmax_scaler_params" + scaler_cfg = {"feature_range": (-1.0, 1.0), "store_loc": param_store_loc} + npy_scaler = preps.make( + "NumpyMinMaxScaler_v0", path_to_cfg=cfg_loc, user_config=scaler_cfg + ) + # Print the module info: npy_scaler.get_info() - + print("...done!") print(" ") # Run the scaler: print("Scale data...") - + scaled_data = npy_scaler.run(test_data) print("...done!") @@ -69,21 +76,23 @@ def test_drive_numpy_minmax_scaler(self): pass_range_check_2 = False print("Run sanity checks...") - + # Check scaled data: - if round(np.min(scaled_data),1) == -1.0 and round(np.max(scaled_data)) == 1.0: + if round(np.min(scaled_data), 1) == -1.0 and round(np.max(scaled_data)) == 1.0: pass_range_check_1 = True # Check if the unscaled data has the same limits as the original test data: - if round(np.min(test_data),1) == round(np.min(unscaled_data),1) and round(np.max(test_data),1) == round(np.max(unscaled_data),1): - pass_range_check_2 = True + if round(np.min(test_data), 1) == round(np.min(unscaled_data), 1) and round( + np.max(test_data), 1 + ) == round(np.max(unscaled_data), 1): + pass_range_check_2 = True print("...done!") print(" ") # Store and load the scaler parameters --> We want to see that the module checkpointing is working print("Store and retreive scaler parameters...") - + pass_checkpointing = False # Store the params: npy_scaler.save() @@ -93,7 +102,7 @@ def test_drive_numpy_minmax_scaler(self): # If everything went right, there should be a file with scaling parameters and the param dictionary # should not be empty: - if os.path.exists(scaler_cfg['store_loc']) and bool(param_dict): + if os.path.exists(scaler_cfg["store_loc"]) and bool(param_dict): pass_checkpointing = True print("...done!") @@ -102,7 +111,7 @@ def test_drive_numpy_minmax_scaler(self): # Clean up: print("Remove created data...") - shutil.rmtree('numpy_minmax_scaler_params') + shutil.rmtree("numpy_minmax_scaler_params") print("...done!") print(" ") @@ -111,7 +120,7 @@ def test_drive_numpy_minmax_scaler(self): passTypeChecker = False print("Test type checker (an error message should show up below this line)...") - val = npy_scaler.run([1,2,3,4]) + val = npy_scaler.run([1, 2, 3, 4]) if val is None: passTypeChecker = True @@ -119,10 +128,16 @@ def test_drive_numpy_minmax_scaler(self): print("...done!") print(" ") - self.assertTrue(pass_range_check_1 & pass_range_check_2 & pass_checkpointing & passTypeChecker) + self.assertTrue( + pass_range_check_1 + & pass_range_check_2 + & pass_checkpointing + & passTypeChecker + ) print("Have a great day!") - #***************************************** + + # ***************************************** # Run this file via: python utest_numpy_parser.py diff --git a/utests/utest_numpy_parser.py b/utests/utest_numpy_parser.py index b82c7da..0bafd2d 100644 --- a/utests/utest_numpy_parser.py +++ b/utests/utest_numpy_parser.py @@ -4,12 +4,13 @@ import matplotlib.pyplot as plt import os + class UTestNumpyParser(unittest.TestCase): # Initialize: - #***************************************** - def __init__(self,*args, **kwargs): - super(UTestNumpyParser,self).__init__(*args, **kwargs) + # ***************************************** + def __init__(self, *args, **kwargs): + super(UTestNumpyParser, self).__init__(*args, **kwargs) # Get an into: print(" ") @@ -19,10 +20,11 @@ def __init__(self,*args, **kwargs): print("* *") print("*******************************") print(" ") - #***************************************** - + + # ***************************************** + # Test drive the parser: - #***************************************** + # ***************************************** def test_drive_numpy_parser(self): print("Create test data set(s)...") @@ -30,16 +32,19 @@ def test_drive_numpy_parser(self): n_sets = 3 n_events = 5000 # Start with the names that we will use later for the numpy parser: - data_locs = ['data_'+str(i)+'.npy' for i in range(n_sets)] + data_locs = ["data_" + str(i) + ".npy" for i in range(n_sets)] # Features of the data, that are not important for this test: - data_means = [-5.0,0.0,5.0] - data_widths = [0.5]*n_sets + data_means = [-5.0, 0.0, 5.0] + data_widths = [0.5] * n_sets # Create and store the test data set(s) - #+++++++++++++++++++ + # +++++++++++++++++++ for i in range(n_sets): - np.save(data_locs[i],np.random.normal(data_means[i],data_widths[i],size=n_events)) - #+++++++++++++++++++ + np.save( + data_locs[i], + np.random.normal(data_means[i], data_widths[i], size=n_events), + ) + # +++++++++++++++++++ print("...done!") print(" ") @@ -48,10 +53,15 @@ def test_drive_numpy_parser(self): # so we need to provide an additional config that allows us to overwrite the default setting (which is simply "") print("Load numpy parser...") - parser_cfg = {'data_loc':data_locs} + parser_cfg = {"data_loc": data_locs} this_file_loc = os.path.dirname(__file__) - cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/numpy_parser_cfg.yaml') - npy_parser = parsers.make("NumpyParser_v0",path_to_cfg=cfg_loc,user_config=parser_cfg) + cfg_loc = os.path.join( + this_file_loc, + "../jlab_datascience_toolkit/cfgs/defaults/numpy_parser_cfg.yaml", + ) + npy_parser = parsers.make( + "NumpyParser_v0", path_to_cfg=cfg_loc, user_config=parser_cfg + ) # Lets see if we can call the information about this module: npy_parser.get_info() @@ -72,8 +82,8 @@ def test_drive_numpy_parser(self): print("Run dimensional check on parsed data...") passDimensionCheck = False - if test_data.shape[0] == n_events*n_sets: - passDimensionCheck = True + if test_data.shape[0] == n_events * n_sets: + passDimensionCheck = True print("...done!") print(" ") @@ -81,15 +91,15 @@ def test_drive_numpy_parser(self): # Plot the test data: print("Visualize data...") - plt.rcParams.update({'font.size':20}) - fig, ax = plt.subplots(figsize=(12,8)) + plt.rcParams.update({"font.size": 20}) + fig, ax = plt.subplots(figsize=(12, 8)) - ax.hist(test_data,100) - ax.set_xlabel('Data') - ax.set_ylabel('Entries') + ax.hist(test_data, 100) + ax.set_xlabel("Data") + ax.set_ylabel("Entries") ax.grid(True) - fig.savefig('numpy_parser_data.png') + fig.savefig("numpy_parser_data.png") plt.close(fig) print("...done!") @@ -98,19 +108,21 @@ def test_drive_numpy_parser(self): # Clean up everything: print("Remove test data set(s)...") - #+++++++++++++++++++ + # +++++++++++++++++++ for i in range(n_sets): - os.remove(data_locs[i]) - #+++++++++++++++++++ + os.remove(data_locs[i]) + # +++++++++++++++++++ print("...done!") print(" ") - + # Check that we passed the dimension test: self.assertTrue(passDimensionCheck) print("Have a great day!") - #***************************************** + + # ***************************************** + # Run this file via: python utest_numpy_parser.py if __name__ == "__main__": diff --git a/utests/utest_pandas_standard_scaler.py b/utests/utest_pandas_standard_scaler.py index 562d580..1024609 100644 --- a/utests/utest_pandas_standard_scaler.py +++ b/utests/utest_pandas_standard_scaler.py @@ -11,7 +11,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) rng = np.random.default_rng(seed=42) -prep_id = 'PandasStandardScaler_v0' +prep_id = "PandasStandardScaler_v0" class TestPandasStandardScalerv0(unittest.TestCase): @@ -23,65 +23,81 @@ def __init__(self, *args, **kwargs): @classmethod def setUpClass(self) -> None: - print('Setting up all tests...') + print("Setting up all tests...") @classmethod def tearDownClass(self) -> None: - print('\nHave a good day!') + print("\nHave a good day!") def setUp(self) -> None: - x1 = pd.Series(rng.normal(loc=1, scale=2, size=(100,)), name='X1') - x2 = pd.Series(rng.normal(loc=3, scale=4, size=(100,)), name='X2') - x3 = pd.Series(rng.uniform(low=3, high=10, size=(100,)), name='X3') - x4 = pd.Series(rng.uniform(low=-4, high=1, size=(100,)), name='X4') - data = pd.concat([x1,x2,x3,x4], axis=1) + x1 = pd.Series(rng.normal(loc=1, scale=2, size=(100,)), name="X1") + x2 = pd.Series(rng.normal(loc=3, scale=4, size=(100,)), name="X2") + x3 = pd.Series(rng.uniform(low=3, high=10, size=(100,)), name="X3") + x4 = pd.Series(rng.uniform(low=-4, high=1, size=(100,)), name="X4") + data = pd.concat([x1, x2, x3, x4], axis=1) self.data = data - print('\n----------------------------------------------------------------------') + print( + "\n----------------------------------------------------------------------" + ) return super().setUp() def tearDown(self) -> None: # print('\nEnd of Test') - print('----------------------------------------------------------------------') + print("----------------------------------------------------------------------") return super().tearDown() def test_output_types(self): - - prep = make(prep_id, config={'inplace': True}) + + prep = make(prep_id, config={"inplace": True}) output = prep.run(self.data) self.assertIsNone(output) - prep = make(prep_id, config={'inplace': False}) + prep = make(prep_id, config={"inplace": False}) output = prep.run(self.data) - self.assertEqual(type(output), pd.DataFrame, msg='Output not DataFrame when inplace==False') - + self.assertEqual( + type(output), pd.DataFrame, msg="Output not DataFrame when inplace==False" + ) + def test_axis_zero(self): - prep = make(prep_id, config={'axis': 0}) + prep = make(prep_id, config={"axis": 0}) scaled_data = prep.run(self.data) mean = scaled_data.mean(axis=0) var = scaled_data.var(axis=0, ddof=0) - self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero') - self.assertTrue(np.allclose(var, np.ones_like(var)), msg='Column variance not equal to one') + self.assertTrue( + np.allclose(mean, np.zeros_like(mean)), msg="Column mean not equal to zero" + ) + self.assertTrue( + np.allclose(var, np.ones_like(var)), msg="Column variance not equal to one" + ) def test_inplace_run(self): - prep = make(prep_id, config={'inplace': True}) + prep = make(prep_id, config={"inplace": True}) out = prep.run(self.data) mean = self.data.mean(axis=0) var = self.data.var(axis=0, ddof=0) - self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero') - self.assertTrue(np.allclose(var, np.ones_like(var)), msg='Column variance not equal to one') + self.assertTrue( + np.allclose(mean, np.zeros_like(mean)), msg="Column mean not equal to zero" + ) + self.assertTrue( + np.allclose(var, np.ones_like(var)), msg="Column variance not equal to one" + ) def test_zero_variance(self): original_shape = self.data.shape - self.data['X5'] = pd.Series(4*np.ones(shape=(100,)), name='X5') + self.data["X5"] = pd.Series(4 * np.ones(shape=(100,)), name="X5") prep = make(prep_id) scaled_data = prep.run(self.data) mean = scaled_data.mean(axis=0) var = scaled_data.var(axis=0, ddof=0) - self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero') + self.assertTrue( + np.allclose(mean, np.zeros_like(mean)), msg="Column mean not equal to zero" + ) theory_var = np.ones_like(var) theory_var[-1] = 0 - self.assertTrue(np.allclose(var, theory_var), msg='Scaled variance is incorrect') + self.assertTrue( + np.allclose(var, theory_var), msg="Scaled variance is incorrect" + ) def test_multi_run(self): # Should set mean and scale only based on first dataset called with run @@ -91,23 +107,33 @@ def test_multi_run(self): saved_scale = prep.scale scaled_data2 = prep.run(self.data + 5) - self.assertTrue((saved_mean == prep.mean).all(), msg='Mean has changed after second run()') - self.assertTrue((saved_scale == prep.scale).all(), msg='Scale has changed after second run()') + self.assertTrue( + (saved_mean == prep.mean).all(), msg="Mean has changed after second run()" + ) + self.assertTrue( + (saved_scale == prep.scale).all(), + msg="Scale has changed after second run()", + ) # Mean in data+5 after scaling should be 5 / scale - self.assertTrue(np.allclose(scaled_data2.mean(), 5/prep.scale), msg='Mean of second run() is incorrect.') - + self.assertTrue( + np.allclose(scaled_data2.mean(), 5 / prep.scale), + msg="Mean of second run() is incorrect.", + ) def test_save_load(self): prep = make(prep_id) scaled_data = prep.run(self.data) - save_path = './test_saved_prep' + save_path = "./test_saved_prep" try: prep.save(save_path) new_prep = make(prep_id) new_prep.load(save_path) new_scaled_data = new_prep.run(self.data) - self.assertTrue(np.allclose(new_scaled_data, scaled_data), msg='Scaled data after load() does not match') + self.assertTrue( + np.allclose(new_scaled_data, scaled_data), + msg="Scaled data after load() does not match", + ) finally: shutil.rmtree(save_path) @@ -118,6 +144,7 @@ def test_reverse_scaling(self): self.assertTrue(np.allclose(self.data, unscaled_data)) + # Run this file via: python utest_csv_parser_v0.py if __name__ == "__main__": unittest.main() diff --git a/utests/utest_split_dataframe_v0.py b/utests/utest_split_dataframe_v0.py index 4684c7c..4e377fa 100644 --- a/utests/utest_split_dataframe_v0.py +++ b/utests/utest_split_dataframe_v0.py @@ -20,22 +20,24 @@ def test_split_by_columns(self): arrays = SplitDataFrame.split_by_columns( df=self.df, feature_columns=self.feature_columns, - target_columns=self.target_columns + target_columns=self.target_columns, ) self.assertTrue(len(arrays) == 2) self.assertTrue(np.array_equal(arrays[0], np.stack([self.a, self.b], axis=-1))) self.assertTrue(np.array_equal(arrays[1], np.stack([self.c, self.d], axis=-1))) - + def test_split_array(self): arr = np.stack([self.a, self.b], axis=-1) idxs = np.array([0, 4, 3, 8, 1, 5, 2, 7, 6, 9]) rows_fractions = [0.6, 0.2, 0.2] - arrays = SplitDataFrame.split_array(arr=arr, idxs=idxs, rows_fractions=rows_fractions) + arrays = SplitDataFrame.split_array( + arr=arr, idxs=idxs, rows_fractions=rows_fractions + ) self.assertTrue(len(arrays) == len(rows_fractions)) - self.assertTrue(np.array_equal(arrays[0], arr[idxs[: 6], :])) - self.assertTrue(np.array_equal(arrays[1], arr[idxs[6 : 8], :])) - self.assertTrue(np.array_equal(arrays[2], arr[idxs[8 :], :])) + self.assertTrue(np.array_equal(arrays[0], arr[idxs[:6], :])) + self.assertTrue(np.array_equal(arrays[1], arr[idxs[6:8], :])) + self.assertTrue(np.array_equal(arrays[2], arr[idxs[8:], :])) -if __name__ == '__main__': - unittest.main() \ No newline at end of file +if __name__ == "__main__": + unittest.main()