diff --git a/CHANGES.rst b/CHANGES.rst
index b1a15ed..758b18a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -5,6 +5,10 @@
 preprocessor
 ------------
 - explicitly pass `encoding=bytes` in transform.hypersonic_pliers for numpy 2 compatibility where this will no longer be the default for np.loadtxt [#92]
 
+builder
+-------
+- various minor fixes related to CNN 2D model usage [#93]
+
 1.1.1 (2024-07-11)
 ==================
diff --git a/spacekit/builder/architect.py b/spacekit/builder/architect.py
index 8f9e0b6..a400a6b 100644
--- a/spacekit/builder/architect.py
+++ b/spacekit/builder/architect.py
@@ -898,7 +898,7 @@ def ensemble_cnn(self):
         self.cnn.output_name = "svm_image_output"
         self.cnn.name = "svm_cnn"
         self.cnn.ensemble = True
-        self.cnn.input_shape = self.X_train[1].shape[1:] if self.X_train else None
+        self.cnn.input_shape = self.X_train[1].shape[1:] if self.X_train is not None else None
         self.cnn.output_shape = 1
         self.cnn.layers = [18, 32, 64, 32, 18]
         self.cnn.activation = "leaky_relu"
@@ -1016,7 +1016,7 @@ def __init__(
             **builder_kwargs,
         )
         self.blueprint = blueprint
-        self.input_shape = self.X_train.shape[1:] if self.X_train else None
+        self.input_shape = self.X_train.shape[1:] if self.X_train is not None else None
         self.output_shape = 1
         self.input_name = "cnn2d_inputs"
         self.output_name = "cnn2d_output"
@@ -1035,7 +1035,7 @@
         self.early_stopping = None
         self.batch_size = 32
         self.cost_function = "sigmoid"
-        self.step_size = X_train.shape[1] if X_train else None
-        self.steps_per_epoch = self.step_size // self.batch_size
+        self.step_size = X_train.shape[1] if X_train is not None else None
+        self.steps_per_epoch = self.step_size // self.batch_size if self.step_size is not None else None
         self.batch_maker = self.batch
 
@@ -1054,17 +1054,15 @@ def build(self):
         )(inputs)
         x = MaxPool1D(strides=self.strides)(x)
         x = BatchNormalization()(x)
-        count = 1
-        for f in self.filters[1:]:
+        for f in range(1, len(self.filters)):
             x = Conv1D(
                 filters=self.filters[f],
                 kernel_size=self.kernel,
                 activation=self.activation,
             )(x)
             x = MaxPool1D(strides=self.strides)(x)
-            if count < len(self.filters):
+            if f < len(self.filters) - 1:
                 x = BatchNormalization()(x)
-            count += 1
         else:
             x = Flatten()(x)
         self.log.info("DROPOUT")
diff --git a/spacekit/extractor/scrape.py b/spacekit/extractor/scrape.py
index 06fbdd1..d76dd17 100644
--- a/spacekit/extractor/scrape.py
+++ b/spacekit/extractor/scrape.py
@@ -325,12 +325,12 @@ def __init__(
         self.fpaths = []
 
     def scrape(self):
-        """Using the key-pair values in `dataset` dictionary attribute, download the files from a github
-        repo and check the hash keys match before extracting. Extraction and hash-key checking is handled
-        externally by the `keras.utils.data_utils.get_file` method. If extraction is successful, the
-        archive file will be deleted.
+        """Using the key-value pairs in the `dataset` dictionary attribute, download the files from a website
+        (such as Zenodo) and check that the hash keys match before extracting. Extraction and hash-key checking
+        is handled externally by the `keras.utils.data_utils.get_file` method. If extraction is successful,
+        the archive file will be deleted.
 
         See spacekit.datasets.meta for dictionary formatting examples.
 
         Returns
         -------
diff --git a/spacekit/preprocessor/transform.py b/spacekit/preprocessor/transform.py
index a9e6d06..d7f7732 100644
--- a/spacekit/preprocessor/transform.py
+++ b/spacekit/preprocessor/transform.py
@@ -869,7 +869,7 @@ def tensors_to_arrays(X_train, y_train, X_test, y_test):
 
 
 def hypersonic_pliers(
-    path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", encoding=bytes, subtract_y=0.0
+    path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", encoding="bytes", subtract_y=0.0, reshape=False
 ):
     """Extracts data into 1-dimensional arrays, using separate target classes (y) for training and test data.
     Assumes y (target) is first column in dataframe. If the target (y) classes in the raw data are 0 and 2, but you'd like them to be binaries (0
@@ -900,16 +900,15 @@
     Train = np.loadtxt(path_to_train, skiprows=skip, delimiter=dlm, encoding=encoding)
     cols = list(range(Train.shape[1]))
     xcols = [c for c in cols if c not in y_col]
-    # X_train = Train[:, 1:]
     X_train = Train[:, xcols]
-    # y_train = Train[:, 0, np.newaxis] - subtract_y
     y_train = Train[:, y_col, np.newaxis] - subtract_y
 
     Test = np.loadtxt(path_to_test, skiprows=skip, delimiter=dlm, encoding=encoding)
     X_test = Test[:, xcols]
     y_test = Test[:, y_col, np.newaxis] - subtract_y
-    # X_test = Test[:, 1:]
-    # y_test = Test[:, 0, np.newaxis] - subtract_y
+    if reshape:
+        y_train = y_train.reshape(y_train.shape[0], 1)
+        y_test = y_test.reshape(y_test.shape[0], 1)
 
     del Train, Test
     print("X_train: ", X_train.shape)
diff --git a/spacekit/skopes/kepler/light_curves.py b/spacekit/skopes/kepler/light_curves.py
index ebb7163..2a9a68e 100644
--- a/spacekit/skopes/kepler/light_curves.py
+++ b/spacekit/skopes/kepler/light_curves.py
@@ -5,9 +5,21 @@
     babel_fish_dispenser,
 )
 from spacekit.builder.architect import BuilderCNN2D
-from spacekit.datasets.k2_exo import k2_uri, k2_data
+from spacekit.datasets.meta import k2 as k2meta
 from spacekit.extractor.scrape import WebScraper
 
 
+def downloads_exist(scraper, k2_meta):
+    base_path = os.path.join(scraper.cache_dir, scraper.cache_subdir)
+    filepaths = []
+    for v in k2_meta.values():
+        fpath = os.path.join(base_path, v["key"])
+        filepaths.append(fpath)
+    for fp in filepaths:
+        if not os.path.exists(fp):
+            return []
+    print("Found existing datasets, skipping download.")
+    return filepaths
+
 class LaunchK2:
     def __init__(self, fpaths):
@@ -20,54 +32,50 @@
         self.history = None
 
     def launch_prep(self):
-        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data()
-        self.X_train, self.X_test = self.scale_data()
-        self.X_train, self.X_test = self.add_filter()
-        return self.X_train, self.X_test, self.y_train, self.y_test
+        self.split_data()
+        self.scale_data()
+        self.add_filter()
 
     def split_data(self):
         print("Splitting train-test feature and target data...")
         for fpath in self.fpaths:
-            if fpath.endswith("Train"):
+            if "Train" in fpath:
                 train = fpath
             else:
                 test = fpath
         self.X_train, self.X_test, self.y_train, self.y_test = hypersonic_pliers(
-            train, test
+            train, test, subtract_y=1.0, reshape=True
         )
         print("Data split successful")
-        return self.X_train, self.X_test, self.y_train, self.y_test
 
     def scale_data(self):
         print("Scaling data to Zero Mean and Unit Variance...")
         self.X_train, self.X_test = thermo_fusion_chisel(self.X_train, self.X_test)
         print("Data scaling successful.")
-        return self.X_train, self.X_test
 
     def add_filter(self):
         print("Adding noise filter...")
         self.X_train, self.X_test = babel_fish_dispenser(self.X_train, self.X_test)
         print("Noise filter added successfully.")
filter added successfully.") - return self.X_train, self.X_test def deploy(self): self.builder = BuilderCNN2D( - self.X_train, self.y_train, self.X_test, self.y_test + X_train=self.X_train, y_train=self.y_train, X_test=self.X_test, y_test=self.y_test ) self.builder.build() - return self.builder def takeoff(self): self.history = self.builder.batch_fit() if __name__ == "__main__": - home = os.getcwd() - data = os.path.join(home, "data") print("Extracting data...") - fpaths = WebScraper(k2_uri, k2_data).scrape_repo() - print("Data extraction successful.") - k2 = LaunchK2(fpaths) + scraper = WebScraper(k2meta['uri'], k2meta['data']) + scraper.fpaths = downloads_exist(scraper, k2meta['data']) + if not scraper.fpaths: + scraper.scrape() + print("Data extraction successful.") + k2 = LaunchK2(scraper.fpaths) k2.launch_prep() - k2.builder = k2.deploy() - k2.history = k2.takeoff() + k2.deploy() + k2.takeoff()