Skip to content

Commit

Permalink
generating gzip with command line
Browse files Browse the repository at this point in the history
  • Loading branch information
silil committed Nov 14, 2023
1 parent a062b3f commit 014b8fb
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
25 changes: 23 additions & 2 deletions src/triage/component/architect/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,17 @@ def build_matrix(
)

output, labels = self.stitch_csvs(feature_queries, label_query, matrix_store, matrix_uuid)
logger.debug(f"matrix stitched, pandas DF returned")
logger.info(f"matrix stitched, pandas DF returned")
matrix_store.metadata = matrix_metadata
#labels = output.pop(matrix_store.label_column_name)
matrix_store.matrix_label_tuple = output, labels
matrix_store.save()
#matrix_store.save()
logger.info(f"Saving matrix metadata (yaml) for matrix {matrix_uuid}")
matrix_store.save_matrix_metadata()

# If completely archived, save its information to matrices table
# At this point, existence of matrix already tested, so no need to delete from db
logging.info(f"Getting all matrix metadata for matrix {matrix_uuid}")
if matrix_type == "train":
lookback = matrix_metadata["max_training_history"]
else:
Expand All @@ -351,6 +354,7 @@ def build_matrix(
matrix_metadata=matrix_metadata,
built_by_experiment=self.experiment_hash
)
logger.info(f"About to save all metrix metadata on DB for matrix {matrix_uuid}")
# before saving the matrix metadata we need to cast datetimes to str
matrix_metadata = change_datetimes_on_metadata(matrix_metadata)
session = self.sessionmaker()
Expand Down Expand Up @@ -576,13 +580,30 @@ def stitch_csvs(self, features_queries, label_query, matrix_store, matrix_uuid):
logger.debug(f"df data types: {df.dtypes}")
logger.spam(f"Pandas DF memory usage: {df.memory_usage(deep=True).sum()/1000000} MB")

logger.debug(f"Generating gzip from full matrix csv")
self.generate_gzip(path_, matrix_uuid)

logger.debug(f"removing csvs files for matrix {matrix_uuid}")
# adding _sorted and _fixed files to list of files to rm
rm_filenames = generate_list_of_files_to_remove(filenames, matrix_uuid)
self.remove_unnecessary_files(rm_filenames, path_, matrix_uuid)

return df, labels_series


def generate_gzip(self, path, matrix_uuid):
    """
    Generates a gzip from the csv file with all the features (doesn't include the label)

    Runs ``gzip -k <path>/<matrix_uuid>.csv`` on the command line. The ``-k``
    flag asks gzip to keep the input csv next to the generated ``.csv.gz``.

    Args:
        path (string): directory that contains the matrix csv file
        matrix_uuid (string): matrix identifier; the csv is expected to be
            named ``<matrix_uuid>.csv`` inside ``path``

    Raises:
        subprocess.CalledProcessError: if gzip exits with a non-zero status
        FileNotFoundError: if the ``gzip`` executable is not available
    """
    csv_file = f"{path}/{matrix_uuid}.csv"
    # Pass the command as an argument list (no shell): "-k" and the file name
    # stay separate arguments and paths with spaces or shell metacharacters
    # are handled safely.
    cmd = ["gzip", "-k", csv_file]
    logger.debug(f"Generating gzip of full matrix on cmd line with command: {' '.join(cmd)}")
    # check=True: a failed compression must surface here, otherwise the caller
    # goes on to delete the csv files without a compressed matrix on disk.
    subprocess.run(cmd, check=True)
    logger.debug(f"Full matrix {matrix_uuid} compressed and saved!")


def remove_unnecessary_files(self, filenames, path_, matrix_uuid):
"""
Expand Down
4 changes: 4 additions & 0 deletions src/triage/component/catwalk/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,10 @@ def _load_as_df(self):
with self.matrix_base_store.open("rb") as fd:
return pd.read_csv(fd, compression="gzip", parse_dates=["as_of_date"])

def save_matrix_metadata(self):
    """Dump ``self.metadata`` as UTF-8 encoded YAML into the metadata base store."""
    metadata_store = self.metadata_base_store
    with metadata_store.open("wb") as yaml_file:
        yaml.dump(self.metadata, yaml_file, encoding="utf-8")

def save(self):
self.matrix_base_store.write(gzip.compress(self.full_matrix_for_saving.to_csv(None).encode("utf-8")))
with self.metadata_base_store.open("wb") as fd:
Expand Down

0 comments on commit 014b8fb

Please sign in to comment.