diff --git a/src/triage/component/architect/builders.py b/src/triage/component/architect/builders.py
index db5d028bd..3224c38f1 100644
--- a/src/triage/component/architect/builders.py
+++ b/src/triage/component/architect/builders.py
@@ -318,14 +318,17 @@ def build_matrix(
             )
             output, labels = self.stitch_csvs(feature_queries, label_query, matrix_store, matrix_uuid)
-            logger.debug(f"matrix stitched, pandas DF returned")
+            logger.info("matrix stitched, pandas DF returned")
             matrix_store.metadata = matrix_metadata
             #labels = output.pop(matrix_store.label_column_name)
             matrix_store.matrix_label_tuple = output, labels
-            matrix_store.save()
+            #matrix_store.save()
+            logger.info(f"Saving matrix metadata (yaml) for matrix {matrix_uuid}")
+            matrix_store.save_matrix_metadata()
             # If completely archived, save its information to matrices table
             # At this point, existence of matrix already tested, so no need to delete from db
+            logger.info(f"Getting all matrix metadata for matrix {matrix_uuid}")
             if matrix_type == "train":
                 lookback = matrix_metadata["max_training_history"]
             else:
@@ -351,6 +354,7 @@ def build_matrix(
                 matrix_metadata=matrix_metadata,
                 built_by_experiment=self.experiment_hash
             )
+            logger.info(f"About to save all matrix metadata on DB for matrix {matrix_uuid}")
             # before saving the matrix metadata we need to cast datetimes to str
             matrix_metadata = change_datetimes_on_metadata(matrix_metadata)
             session = self.sessionmaker()
@@ -576,13 +580,30 @@ def stitch_csvs(self, features_queries, label_query, matrix_store, matrix_uuid):
         logger.debug(f"df data types: {df.dtypes}")
         logger.spam(f"Pandas DF memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")
 
+        logger.debug("Generating gzip from full matrix csv")
+        self.generate_gzip(path_, matrix_uuid)
+
         logger.debug(f"removing csvs files for matrix {matrix_uuid}")
         # addinig _sorted and _fixed files to list of files to rm
         rm_filenames = generate_list_of_files_to_remove(filenames, matrix_uuid)
         self.remove_unnecessary_files(rm_filenames, path_, matrix_uuid)
 
         return df, labels_series
+
+    def generate_gzip(self, path, matrix_uuid):
+        """
+        Generates a gzip from the csv file with all the features (doesn't include the label)
+
+        Args:
+            path (string): directory where the matrix csv file lives
+            matrix_uuid (string): uuid of the matrix; the csv file is named <matrix_uuid>.csv
+        """
+        # -k keeps the original csv so later cleanup/inspection still works.
+        # Argument-list form (shell=False) avoids quoting problems with paths
+        # containing spaces, and check=True surfaces a failed compression.
+        cmd_line = ["gzip", "-k", f"{path}/{matrix_uuid}.csv"]
+        logger.debug(f"Generating gzip of full matrix on cmd line with command: {cmd_line}")
+        subprocess.run(cmd_line, check=True)
+        logger.debug(f"Full matrix {matrix_uuid} compressed and saved!")
+
 
     def remove_unnecessary_files(self, filenames, path_, matrix_uuid):
         """
diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py
index 7af0b7221..58a2d3fd0 100644
--- a/src/triage/component/catwalk/storage.py
+++ b/src/triage/component/catwalk/storage.py
@@ -679,6 +679,10 @@ def _load_as_df(self):
         with self.matrix_base_store.open("rb") as fd:
             return pd.read_csv(fd, compression="gzip", parse_dates=["as_of_date"])
 
+    def save_matrix_metadata(self):
+        with self.metadata_base_store.open("wb") as fd:
+            yaml.dump(self.metadata, fd, encoding="utf-8")
+
     def save(self):
         self.matrix_base_store.write(gzip.compress(self.full_matrix_for_saving.to_csv(None).encode("utf-8")))
         with self.metadata_base_store.open("wb") as fd: