From 3ef9c543d842e01ce5fbe1f1976838d132dc043b Mon Sep 17 00:00:00 2001
From: Brian Healy <42810347+bfhealy@users.noreply.github.com>
Date: Fri, 29 Sep 2023 09:50:16 -0500
Subject: [PATCH] Allow more path customization for feature generation/inference (#493)

* Add path_to_features to config
* Extend path_to_preds and path_to_features to inference code
* Update GCN cronjob
* Update generate_features.py
* Update config default for ids_skipgaia file
* Remove code.interact()
---
 config.defaults.yaml                      |  4 +++-
 gcn_cronjob.py                            |  6 +++---
 tools/generate_features.py                | 16 +++++++++++----
 tools/generate_features_job_submission.py |  3 +++
 tools/generate_features_slurm.py          |  4 ++++
 tools/inference.py                        | 25 +++++++++++++++--------
 tools/run_inference_job_submission.py     | 11 +++++++---
 tools/run_inference_slurm.py              |  6 +++++-
 8 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/config.defaults.yaml b/config.defaults.yaml
index bf2aca3c..81e8c2a1 100644
--- a/config.defaults.yaml
+++ b/config.defaults.yaml
@@ -1358,13 +1358,15 @@ feature_stats:
 # Below, specify algorithms, external catalogs and features to include in generated feature lists:
 feature_generation:
+  # Path to save generated features
+  path_to_features:
   # If --doSpecificIDs is set in generate_features.py, the script will generate features for the below dataset instead of a field/ccd/quad.
   # Dataset must contain columns named "ztf_id" and "coordinates" with data in the format of these fields on Kowalski
   # Default dataset is the training set downloadable from Fritz
   dataset: tools/fritzDownload/merged_classifications_features.parquet
   # Once generate_features.py is run with --doSpecificIDs, a file will be saved with the default name below.
   # Set --skipCloseSources to load the file below and skip the idenfication of close sources:
-  ids_skipGaia: tools/fritzDownload/specific_ids_dropCloseSources.json
+  ids_skipGaia: specific_ids_dropCloseSources.json
   period_algorithms:
     CPU:
       - LS
diff --git a/gcn_cronjob.py b/gcn_cronjob.py
index 7486c5b3..3c89cb69 100755
--- a/gcn_cronjob.py
+++ b/gcn_cronjob.py
@@ -201,18 +201,18 @@ def query_gcn_events(
         if (not features_file.exists()) | (has_new_sources):
             print("Generating features on Expanse...")
             os.system(
-                f"scp {filepath} {username}@login.expanse.sdsc.edu:/home/{username}/scope/tools/fritzDownload/."
+                f"scp {filepath} {username}@login.expanse.sdsc.edu:/expanse/lustre/projects/umn131/{username}/{generated_features_dirname}/fg_sources/."
             )
             os.system(
                 f'ssh -tt {username}@login.expanse.sdsc.edu \
                     "source .bash_profile && \
-                    cd scope/{generated_features_dirname}/slurm && \
+                    cd /expanse/lustre/projects/umn131/{username}/{generated_features_dirname}/slurm && \
                     sbatch --wait --export=DOBS={save_dateobs},DS={filepath.name} {partition}_slurm.sub"'
             )
             print("Finished generating features on Expanse.")

             os.system(
-                f"rsync -avh {username}@login.expanse.sdsc.edu:/home/{username}/scope/{generated_features_dirname} {BASE_DIR}/."
+                f"rsync -avh {username}@login.expanse.sdsc.edu:/expanse/lustre/projects/umn131/{username}/{generated_features_dirname} {BASE_DIR}/."
             )

             if features_file.exists():
diff --git a/tools/generate_features.py b/tools/generate_features.py
index 633a56f6..85663a16 100755
--- a/tools/generate_features.py
+++ b/tools/generate_features.py
@@ -73,6 +73,10 @@
 ext_catalog_info = config['feature_generation']['external_catalog_features']
 cesium_feature_list = config['feature_generation']['cesium_features']
 period_algorithms = config['feature_generation']['period_algorithms']
+path_to_features = config['feature_generation']['path_to_features']
+
+if path_to_features is not None:
+    BASE_DIR = pathlib.Path(path_to_features)

 kowalski_instances = Kowalski(timeout=timeout, instances=instances)

@@ -87,7 +91,8 @@ def drop_close_bright_stars(
     limit: int = 10000,
     Ncore: int = 8,
     save: bool = False,
-    save_filename: str = 'tools/fritzDownload/specific_ids_dropCloseSources.json',
+    save_directory: str = 'generated_features',
+    save_filename: str = 'specific_ids_dropCloseSources.json',
 ):
     """
     Use Gaia to identify and drop sources that are too close to bright stars
@@ -103,7 +108,8 @@ def drop_close_bright_stars(
     :param limit: if doSpecificIDs is set, max number of sources to be queries in one batch (int)
     :param Ncore: if doSpecificIDs is set, number of cores over which to parallelize queries (int)
     :param save: if set, save sources passing the close source analysis (bool)
-    :param save_filename: path/name from BASE_DIR to save sources (str)
+    :param save_directory: directory within BASE_DIR to save sources (str)
+    :param save_filename: filename to use when saving sources (str)

     :return id_dct_keep: dictionary containing subset of input sources far enough away from bright stars
     """
@@ -383,7 +389,8 @@ def drop_close_bright_stars(
         id_dct_keep = id_dct

     if save:
-        with open(str(BASE_DIR / save_filename), 'w') as f:
+        os.makedirs(BASE_DIR / save_directory, exist_ok=True)
+        with open(str(BASE_DIR / save_directory / save_filename), 'w') as f:
             json.dump(id_dct_keep, f)

     print(f"Dropped {len(id_dct) - len(id_dct_keep)} sources.")
@@ -591,7 +598,7 @@ def generate_features(
         else:
             # Load pre-saved dataset if Gaia analysis already complete
             fg_sources_config = config['feature_generation']['ids_skipGaia']
-            fg_sources_path = str(BASE_DIR / fg_sources_config)
+            fg_sources_path = str(BASE_DIR / dirname / fg_sources_config)

             if fg_sources_path.endswith('.json'):
                 with open(fg_sources_path, 'r') as f:
@@ -643,6 +650,7 @@ def generate_features(
                 limit=limit,
                 Ncore=Ncore,
                 save=not doNotSave,
+                save_directory=dirname,
             )

         else:
diff --git a/tools/generate_features_job_submission.py b/tools/generate_features_job_submission.py
index da818515..7ad6a507 100755
--- a/tools/generate_features_job_submission.py
+++ b/tools/generate_features_job_submission.py
@@ -16,6 +16,9 @@
     config = yaml.load(config_yaml, Loader=yaml.FullLoader)

 fields_to_run = config['feature_generation']['fields_to_run']
+path_to_features = config['feature_generation']['path_to_features']
+if path_to_features is not None:
+    BASE_DIR = pathlib.Path(path_to_features)


 def parse_commandline():
diff --git a/tools/generate_features_slurm.py b/tools/generate_features_slurm.py
index 2be6ef8a..e66970e3 100755
--- a/tools/generate_features_slurm.py
+++ b/tools/generate_features_slurm.py
@@ -52,6 +52,10 @@
 gaia_catalog = config['kowalski']['collections']['gaia']
 ext_catalog_info = config['feature_generation']['external_catalog_features']
 cesium_feature_list = config['feature_generation']['cesium_features']
+path_to_features = config['feature_generation']['path_to_features']
+
+if path_to_features is not None:
+    BASE_DIR = pathlib.Path(path_to_features)

 def check_quads_for_sources(
diff --git a/tools/inference.py b/tools/inference.py
index d5546a03..e6d2f1be 100755
--- a/tools/inference.py
+++ b/tools/inference.py
@@ -26,13 +26,22 @@
 warnings.filterwarnings('ignore')

 BASE_DIR = pathlib.Path(__file__).parent.parent.absolute()
+BASE_DIR_FEATS = pathlib.Path(__file__).parent.parent.absolute()
+BASE_DIR_PREDS = pathlib.Path(__file__).parent.parent.absolute()
 JUST = 50
-
 config_path = BASE_DIR / "config.yaml"
 with open(config_path) as config_yaml:
     config = yaml.load(config_yaml, Loader=yaml.FullLoader)

+path_to_features = config['feature_generation']['path_to_features']
+path_to_preds = config['inference']['path_to_preds']
+
+if path_to_features is not None:
+    BASE_DIR_FEATS = pathlib.Path(path_to_features)
+if path_to_preds is not None:
+    BASE_DIR_PREDS = pathlib.Path(path_to_preds)
+
 period_suffix_config = config['features']['info']['period_suffix']

 # Load training set
@@ -114,7 +123,7 @@ def clean_data(
     # file to store flagged ids and features with missing values
     if not whole_field:
         filename = (
-            str(BASE_DIR)
+            str(BASE_DIR_PREDS)
             + f"/preds_{algorithm}/field_"
             + str(field)
             + "/ccd_"
@@ -125,7 +134,7 @@ def clean_data(
         )
     else:
         filename = (
-            str(BASE_DIR)
+            str(BASE_DIR_PREDS)
             + f"/preds_{algorithm}/field_"
             + str(field)
             + f"/field_{field}_flagged.json"
@@ -263,19 +272,19 @@ def run_inference(
     if not int_field:
         if 'specific_ids' in field:
             default_features_file = str(
-                BASE_DIR
+                BASE_DIR_FEATS
                 / f"{feature_directory}/specific_ids/gen_gcn_features_{field}.parquet"
             )
         else:
             # default file location for source ids
             if whole_field:
                 default_features_file = (
-                    str(BASE_DIR) + f"/{feature_directory}/field_" + str(field)
+                    str(BASE_DIR_FEATS) + f"/{feature_directory}/field_" + str(field)
                 )
             else:
                 if feature_directory == 'features':
                     default_features_file = (
-                        str(BASE_DIR)
+                        str(BASE_DIR_FEATS)
                         + f"/{feature_directory}/field_"
                         + str(field)
                         + "/ccd_"
@@ -286,7 +295,7 @@ def run_inference(
                     )
                 else:
                     default_features_file = (
-                        str(BASE_DIR)
+                        str(BASE_DIR_FEATS)
                         + f"/{feature_directory}/field_"
                         + str(field)
                         + f"/{feature_file_prefix}_"
@@ -302,7 +311,7 @@ def run_inference(
     features_filename = kwargs.get("features_filename", default_features_file)

     out_dir = os.path.join(
-        os.path.dirname(__file__), f"{str(BASE_DIR)}/preds_{algorithm}/"
+        os.path.dirname(__file__), f"{str(BASE_DIR_PREDS)}/preds_{algorithm}/"
     )

     if not whole_field:
diff --git a/tools/run_inference_job_submission.py b/tools/run_inference_job_submission.py
index fbcb2d5a..438d9516 100755
--- a/tools/run_inference_job_submission.py
+++ b/tools/run_inference_job_submission.py
@@ -6,12 +6,17 @@
 import numpy as np

 BASE_DIR = pathlib.Path(__file__).parent.parent.absolute()
+BASE_DIR_PREDS = pathlib.Path(__file__).parent.parent.absolute()

 # Read config file
 config_path = BASE_DIR / "config.yaml"
 with open(config_path) as config_yaml:
     config = yaml.load(config_yaml, Loader=yaml.FullLoader)

+path_to_preds = config['inference']['path_to_preds']
+if path_to_preds is not None:
+    BASE_DIR_PREDS = pathlib.Path(path_to_preds)
+

 def parse_commandline():
     """
@@ -23,7 +28,7 @@ def parse_commandline():
         "--dirname",
         type=str,
         default='inference',
-        help="Directory name for training slurm scripts",
+        help="Directory name for inference slurm scripts",
     )
     parser.add_argument(
         "-f", "--filetype", default="slurm", help="Type of job submission file"
     )
@@ -50,7 +55,7 @@ def filter_completed(fields, algorithm):
     fields_copy = fields.copy()

     for field in fields:
-        searchDir = BASE_DIR / f'preds_{algorithm}' / f'field_{field}'
+        searchDir = BASE_DIR_PREDS / f'preds_{algorithm}' / f'field_{field}'
         searchDir.mkdir(parents=True, exist_ok=True)
         generator = searchDir.iterdir()
         has_parquet = np.sum([x.suffix == '.parquet' for x in generator]) > 0
@@ -78,7 +83,7 @@ def run_job(field):
     filetype = args.filetype
     dirname = args.dirname

-    slurmDir = str(BASE_DIR / dirname)
+    slurmDir = str(BASE_DIR_PREDS / dirname)

     fields = config['inference']['fields_to_run']
     algorithm = args.algorithm
diff --git a/tools/run_inference_slurm.py b/tools/run_inference_slurm.py
index edc12b38..c5925c05 100755
--- a/tools/run_inference_slurm.py
+++ b/tools/run_inference_slurm.py
@@ -6,11 +6,15 @@
 BASE_DIR = pathlib.Path(__file__).parent.parent.absolute()
+BASE_DIR_PREDS = pathlib.Path(__file__).parent.parent.absolute()

 config_path = BASE_DIR / "config.yaml"
 with open(config_path) as config_yaml:
     config = yaml.load(config_yaml, Loader=yaml.FullLoader)

+path_to_preds = config['inference']['path_to_preds']
+if path_to_preds is not None:
+    BASE_DIR_PREDS = pathlib.Path(path_to_preds)

 if __name__ == "__main__":
@@ -141,7 +145,7 @@
     dirname = f"{algorithm}_{args.dirname}"
     jobname = f"{args.job_name}_{algorithm}"

-    dirpath = BASE_DIR / dirname
+    dirpath = BASE_DIR_PREDS / dirname
     os.makedirs(dirpath, exist_ok=True)

     slurmDir = os.path.join(dirpath, 'slurm')
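
Usage sketch: with this patch, the feature-generation and inference scripts fall back to the repository directory (BASE_DIR) when the new keys are left empty, and write to the configured paths otherwise. A minimal config.yaml fragment is shown below; the directory values are placeholders chosen for illustration, not project defaults.

    feature_generation:
      # Generated features are written under this directory when set;
      # leave blank to keep writing under the scope code directory (BASE_DIR)
      path_to_features: /data/scope/generated_features

    inference:
      # preds_<algorithm> output directories are created under this path when set
      path_to_preds: /data/scope/preds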