From 4043a64a0733bcd99615b6f3d330928fb0f8e17e Mon Sep 17 00:00:00 2001
From: David Rodriguez
Date: Fri, 2 Aug 2024 14:13:32 -0400
Subject: [PATCH] Refactor to have references in separate sub-directory;
 closes #49 (#77)

* First pass at having references in separate directory
* Iterating on reference directory use
* Using shutil to fully remove data directory and any sub-directories
* Safer use of shutil for reference directory
* Minor updates
* Updating documentation
* Saving source JSON files to source sub-directory
* Updating documentation
* Further updates
* Apply suggestions from code review

Co-authored-by: Kelle Cruz

* Print out path when saving source and reference tables

---------

Co-authored-by: Kelle Cruz
---
 astrodbkit2/astrodb.py            | 67 +++++++++++++++++++++++++------
 astrodbkit2/tests/test_astrodb.py | 35 ++++++++++------
 docs/index.rst                    | 21 ++++++----
 3 files changed, 90 insertions(+), 33 deletions(-)

diff --git a/astrodbkit2/astrodb.py b/astrodbkit2/astrodb.py
index c705958..ff22024 100644
--- a/astrodbkit2/astrodb.py
+++ b/astrodbkit2/astrodb.py
@@ -5,6 +5,7 @@
 import json
 import os
 import sqlite3
+import shutil
 
 import numpy as np
 import pandas as pd
@@ -736,8 +737,9 @@ def save_json(self, name, directory):
         with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
             f.write(json.dumps(data, indent=4, default=json_serializer))
 
-    def save_reference_table(self, table, directory):
+    def save_reference_table(self, table: str, directory: str, reference_directory: str = "reference"):
         """
+        Save the reference table to disk
 
         Parameters
         ----------
@@ -745,16 +747,22 @@ def save_reference_table(self, table, directory):
             Name of reference table to output
         directory : str
             Name of directory in which to save the output JSON
+        reference_directory : str
+            Name of sub-directory to use for reference JSON files (e.g., data/reference)
         """
 
+        # Create directory if not already present
+        if not os.path.isdir(os.path.join(directory, reference_directory)):
+            os.makedirs(os.path.join(directory, reference_directory))
+
         results = self.session.query(self.metadata.tables[table]).all()
         data = [row._asdict() for row in results]
         filename = table + ".json"
         if len(data) > 0:
-            with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
+            with open(os.path.join(directory, reference_directory, filename), "w", encoding="utf-8") as f:
                 f.write(json.dumps(data, indent=4, default=json_serializer))
 
-    def save_database(self, directory, clear_first=True):
+    def save_database(self, directory: str, clear_first: bool = True, reference_directory: str = "reference", source_directory: str = "source"):
         """
         Output contents of the database into the specified directory as JSON files.
         Source objects have individual JSON files with all data for that object.
@@ -763,28 +771,45 @@
         Parameters
         ----------
         directory : str
-            Name of directory in which to save the output JSON
+            Name of top-level directory in which to save the output JSON
         clear_first : bool
             First clear the directory of all existing JSON (useful to capture DB deletions).
             Default: True
+        reference_directory : str
+            Name of sub-directory to use for reference JSON files (e.g., data/reference)
+        source_directory : str
+            Name of sub-directory to use for source JSON files (e.g., data/source)
         """
 
         # Clear existing files first from that directory
         if clear_first:
             print("Clearing existing JSON files...")
-            for filename in os.listdir(directory):
-                os.remove(os.path.join(directory, filename))
+            for file in os.listdir(directory):
+                file_path = os.path.join(directory, file)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+                elif os.path.isdir(file_path):
+                    # This handles the reference and source sub-directories
+                    shutil.rmtree(file_path)
+
+        # Create sub-directories if not already present
+        if not os.path.isdir(os.path.join(directory, reference_directory)):
+            os.makedirs(os.path.join(directory, reference_directory))
+        if not os.path.isdir(os.path.join(directory, source_directory)):
+            os.makedirs(os.path.join(directory, source_directory))
 
         # Output reference tables
+        print(f"Storing reference tables to {os.path.join(directory, reference_directory)}...")
         for table in self._reference_tables:
             # Skip reference tables that are not actually in the database
             if table not in self.metadata.tables.keys():
                 continue
-            self.save_reference_table(table, directory)
+            self.save_reference_table(table, directory, reference_directory=reference_directory)
 
         # Output primary objects
+        print(f"Storing individual sources to {os.path.join(directory, source_directory)}...")
         for row in tqdm(self.query(self.metadata.tables[self._primary_table])):
-            self.save_json(row, directory)
+            self.save_json(row, os.path.join(directory, source_directory))
 
     # Object input methods
     def add_table_data(self, data, table, fmt="csv"):
@@ -892,7 +917,7 @@ def load_json(self, filename):
             temp_dict[self._foreign_key] = source
             conn.execute(self.metadata.tables[key].insert().values(temp_dict))
 
-    def load_database(self, directory, verbose=False):
+    def load_database(self, directory: str, verbose: bool = False, reference_directory: str = "reference", source_directory: str = "source"):
         """
         Reload entire database from a directory of JSON files.
         Note that this will first clear existing tables.
@@ -900,9 +925,13 @@
         Parameters
         ----------
         directory : str
-            Name of directory containing the JSON files
+            Name of top-level directory containing the JSON files
         verbose : bool
             Flag to enable diagnostic messages
+        reference_directory : str
+            Relative path to sub-directory to use for reference JSON files (e.g., data/reference)
+        source_directory : str
+            Relative path to sub-directory to use for source JSON files (e.g., data/source)
         """
 
         # Clear existing database contents
@@ -917,12 +946,24 @@
         for table in self._reference_tables:
             if verbose:
                 print(f"Loading {table} table")
-            self.load_table(table, directory, verbose=verbose)
+            # Check if the reference table is in the sub-directory
+            if os.path.exists(os.path.join(directory, reference_directory, table + ".json")):
+                self.load_table(table, os.path.join(directory, reference_directory), verbose=verbose)
+            else:
+                self.load_table(table, directory, verbose=verbose)
 
         # Load object data
         if verbose:
             print("Loading object tables")
-        for file in tqdm(os.listdir(directory)):
+
+        # Check if the sources are in the sub-directory
+        if os.path.exists(os.path.join(directory, source_directory)):
+            directory_of_sources = os.path.join(directory, source_directory)
+        else:
+            directory_of_sources = directory
+
+        # Scan selected directory for JSON source files
+        for file in tqdm(os.listdir(directory_of_sources)):
             # Skip reference tables
             core_name = file.replace(".json", "")
             if core_name in self._reference_tables:
@@ -932,7 +973,7 @@
             if not file.endswith(".json") or file.startswith("."):
                 continue
 
-            self.load_json(os.path.join(directory, file))
+            self.load_json(os.path.join(directory_of_sources, file))
 
     def dump_sqlite(self, database_name):
         """Output database as a sqlite file"""
diff --git a/astrodbkit2/tests/test_astrodb.py b/astrodbkit2/tests/test_astrodb.py
index 40cf76a..bfa75ca 100644
--- a/astrodbkit2/tests/test_astrodb.py
+++ b/astrodbkit2/tests/test_astrodb.py
@@ -3,6 +3,7 @@
 import io
 import json
 import os
+import shutil
 
 import pandas as pd
 import pytest
@@ -413,31 +414,35 @@ def test_views(db):
 
 def test_save_reference_table(db, db_dir):
     # Test saving a reference table
-    if os.path.exists(os.path.join(db_dir, 'Publications.json')):
-        os.remove(os.path.join(db_dir, 'Publications.json'))
-    db.save_reference_table('Publications', db_dir)
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
-    os.remove(os.path.join(db_dir, 'Publications.json'))  # explicitly removing so that the next step will get verified
+    ref_dir = "reference"
+    if os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json')):
+        os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))
+    db.save_reference_table('Publications', db_dir, reference_directory=ref_dir)
+    assert os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json'))
+    os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))  # explicitly removing so that the next step will get verified
 
 
 def test_save_database(db, db_dir):
     # Test saving the database to JSON files
 
     # Clear temporary directory first
-    # if not os.path.exists(DB_DIR):
-    #     os.mkdir(DB_DIR)
     for file in os.listdir(db_dir):
-        os.remove(os.path.join(db_dir, file))
+        file_path = os.path.join(db_dir, file)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+        elif os.path.isdir(file_path):
+            shutil.rmtree(file_path)
 
     db.save_database(db_dir)
 
     # Check JSON data
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
-    assert os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398.json'))
+    assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
+    assert os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'))
     assert not os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398 2.json'))
+    assert not os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398 2.json'))
 
     # Load source and confirm it is the same
-    with open(os.path.join(db_dir, '2mass_j13571237+1428398.json'), 'r') as f:
+    with open(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'), 'r') as f:
         data = json.load(f)
     assert data == db.inventory('2MASS J13571237+1428398')
 
@@ -457,7 +462,7 @@ def test_load_database(db, db_dir):
 
     # Reload the database and check DB contents
     assert os.path.exists(db_dir)
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
+    assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
     db.load_database(db_dir, verbose=True)
     assert db.query(db.Publications).count() == 2
     assert db.query(db.Photometry).count() == 3
@@ -466,7 +471,11 @@
 
     # Clear temporary directory and files
     for file in os.listdir(db_dir):
-        os.remove(os.path.join(db_dir, file))
+        file_path = os.path.join(db_dir, file)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+        elif os.path.isdir(file_path):
+            shutil.rmtree(file_path)
 
 
 def test_copy_database_schema():
diff --git a/docs/index.rst b/docs/index.rst
index 81037a3..f4981a4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -83,15 +83,18 @@ Loading the Database
 --------------------
 
 **Astrodbkit2** contains methods to output the full contents of the database as a list of JSON files.
-It can likewise read in a directory of these files to populate the database.
-This is how SIMPLE is currently version controlled. To load a database of this form, do the following::
+It can likewise read in a directory of these files to populate the database.
+By default, reference tables (e.g., Publications, Telescopes) and source tables are stored in the `reference/` and `source/` sub-directories of `data/`, respectively.
+This is how SIMPLE is currently version controlled.
+
+To load a database of this form, do the following::
 
     from astrodbkit2.astrodb import Database
 
     connection_string = 'sqlite:///SIMPLE.db'  # SQLite connection string
     db_dir = 'data'  # directory where JSON files are located
     db = Database(connection_string)
-    db.load_database(db_dir)
+    db.load_database(directory=db_dir, reference_directory="reference")
 
 .. note:: Database contents are cleared when loading from JSON files to ensure that the database
     only contains sources from on-disk files. We describe later how to use the :py:meth:`~astrodbkit2.astrodb.Database.save_db` method
@@ -406,8 +409,12 @@ Saving the Database
 ===================
 
 If users perform changes to a database, they will want to output this to disk to be version controlled.
-**Astrodbkit2** provides methods to save an individual source or reference table as well as the entire data.
-We recommend the later to output the entire contents to disk::
+**Astrodbkit2** provides methods to save an individual source or reference table as well as all of the data stored in the database.
+By default, reference tables are stored in a sub-directory of `data/` called "reference"; this can be overridden by
+supplying a `reference_directory` argument to `save_database` or `save_reference_table`.
+Similarly, source/object tables are stored in a sub-directory of `data/` called "source", which can be overridden by supplying a `source_directory` argument.
+
+We recommend using `save_database`, which outputs the entire database contents to disk::
 
     # Save single object
     db.save_json('2MASS J13571237+1428398', 'data')
 
@@ -415,8 +422,8 @@ We recommend the later to output the entire contents to disk::
 
     # Save single reference table
     db.save_reference_table('Publications', 'data')
 
-    # Save entire database to directory 'data'
-    db.save_database('data')
+    # Save entire database to directory 'data/' with 'reference/' and 'source/' sub-directories
+    db.save_database(directory='data', reference_directory='reference', source_directory='source')
 
 .. note:: To properly capture database deletions, the contents of the specified directory are first cleared before creating JSON files representing the current state of the database.
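
For reviewers, a minimal round-trip sketch of the workflow this patch introduces; the connection
string and `data/` layout follow the docs example above rather than anything in this diff, and the
sub-directory names are simply the new defaults::

    from astrodbkit2.astrodb import Database

    db = Database('sqlite:///SIMPLE.db')

    # Writes reference tables to data/reference/ and one JSON file per source to
    # data/source/; with clear_first=True (the default), existing JSON under data/
    # is removed first
    db.save_database(directory='data', reference_directory='reference', source_directory='source')

    # Reloads from the sub-directories; load_database() falls back to the top-level
    # directory when reference/ or source/ is absent, so pre-refactor dumps still load
    db.load_database(directory='data', reference_directory='reference', source_directory='source')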