From 4043a64a0733bcd99615b6f3d330928fb0f8e17e Mon Sep 17 00:00:00 2001
From: David Rodriguez
Date: Fri, 2 Aug 2024 14:13:32 -0400
Subject: [PATCH] Refactor to have references in separate sub-directory;
 closes #49 (#77)

* First pass at having references in separate directory
* Iterating on reference directory use
* Using shutil to fully remove data directory and any sub-directories
* Safer use of shutil for reference directory
* Minor updates
* Updating documentation
* Saving source JSON files to source sub-directory
* Updating documentation
* Further updates
* Apply suggestions from code review

Co-authored-by: Kelle Cruz

* Print out path when saving source and reference tables

---------

Co-authored-by: Kelle Cruz
---
 astrodbkit2/astrodb.py            | 67 +++++++++++++++++++++++++------
 astrodbkit2/tests/test_astrodb.py | 35 ++++++++++------
 docs/index.rst                    | 21 ++++++----
 3 files changed, 90 insertions(+), 33 deletions(-)

diff --git a/astrodbkit2/astrodb.py b/astrodbkit2/astrodb.py
index c705958..ff22024 100644
--- a/astrodbkit2/astrodb.py
+++ b/astrodbkit2/astrodb.py
@@ -5,6 +5,7 @@
 import json
 import os
 import sqlite3
+import shutil
 
 import numpy as np
 import pandas as pd
@@ -736,8 +737,9 @@ def save_json(self, name, directory):
         with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
             f.write(json.dumps(data, indent=4, default=json_serializer))
 
-    def save_reference_table(self, table, directory):
+    def save_reference_table(self, table: str, directory: str, reference_directory: str = "reference"):
         """
+        Save the reference table to disk
 
         Parameters
         ----------
@@ -745,16 +747,22 @@ def save_reference_table(self, table, directory):
             Name of reference table to output
         directory : str
             Name of directory in which to save the output JSON
+        reference_directory : str
+            Name of sub-directory to use for reference JSON files (e.g., data/reference)
         """
 
+        # Create directory if not already present
+        if not os.path.isdir(os.path.join(directory, reference_directory)):
+            os.makedirs(os.path.join(directory, reference_directory))
+
         results = self.session.query(self.metadata.tables[table]).all()
         data = [row._asdict() for row in results]
         filename = table + ".json"
         if len(data) > 0:
-            with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
+            with open(os.path.join(directory, reference_directory, filename), "w", encoding="utf-8") as f:
                 f.write(json.dumps(data, indent=4, default=json_serializer))
 
-    def save_database(self, directory, clear_first=True):
+    def save_database(self, directory: str, clear_first: bool = True, reference_directory: str = "reference", source_directory: str = "source"):
         """
         Output contents of the database into the specified directory as JSON files.
         Source objects have individual JSON files with all data for that object.
@@ -763,28 +771,45 @@
         Parameters
         ----------
         directory : str
-            Name of directory in which to save the output JSON
+            Name of top-level directory in which to save the output JSON
         clear_first : bool
             First clear the directory of all existing JSON (useful to capture DB deletions).
             Default: True
+        reference_directory : str
+            Name of sub-directory to use for reference JSON files (e.g., data/reference)
+        source_directory : str
+            Name of sub-directory to use for source JSON files (e.g., data/source)
         """
 
         # Clear existing files first from that directory
         if clear_first:
             print("Clearing existing JSON files...")
-            for filename in os.listdir(directory):
-                os.remove(os.path.join(directory, filename))
+            for file in os.listdir(directory):
+                file_path = os.path.join(directory, file)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+                elif os.path.isdir(file_path):
+                    # This handles the reference and source sub-directories
+                    shutil.rmtree(file_path)
+
+        # Create sub-directories if not already present
+        if not os.path.isdir(os.path.join(directory, reference_directory)):
+            os.makedirs(os.path.join(directory, reference_directory))
+        if not os.path.isdir(os.path.join(directory, source_directory)):
+            os.makedirs(os.path.join(directory, source_directory))
 
         # Output reference tables
+        print(f"Storing reference tables to {os.path.join(directory, reference_directory)}...")
         for table in self._reference_tables:
             # Skip reference tables that are not actually in the database
             if table not in self.metadata.tables.keys():
                 continue
-            self.save_reference_table(table, directory)
+            self.save_reference_table(table, directory, reference_directory=reference_directory)
 
         # Output primary objects
+        print(f"Storing individual sources to {os.path.join(directory, source_directory)}...")
         for row in tqdm(self.query(self.metadata.tables[self._primary_table])):
-            self.save_json(row, directory)
+            self.save_json(row, os.path.join(directory, source_directory))
 
     # Object input methods
     def add_table_data(self, data, table, fmt="csv"):
@@ -892,7 +917,7 @@ def load_json(self, filename):
             temp_dict[self._foreign_key] = source
             conn.execute(self.metadata.tables[key].insert().values(temp_dict))
 
-    def load_database(self, directory, verbose=False):
+    def load_database(self, directory: str, verbose: bool = False, reference_directory: str = "reference", source_directory: str = "source"):
         """
         Reload entire database from a directory of JSON files.
         Note that this will first clear existing tables.
@@ -900,9 +925,13 @@
         Parameters
         ----------
         directory : str
-            Name of directory containing the JSON files
+            Name of top-level directory containing the JSON files
         verbose : bool
             Flag to enable diagnostic messages
+        reference_directory : str
+            Relative path to sub-directory to use for reference JSON files (e.g., data/reference)
+        source_directory : str
+            Relative path to sub-directory to use for source JSON files (e.g., data/source)
         """
 
         # Clear existing database contents
@@ -917,12 +946,24 @@
         for table in self._reference_tables:
             if verbose:
                 print(f"Loading {table} table")
-            self.load_table(table, directory, verbose=verbose)
+            # Check if the reference table is in the sub-directory
+            if os.path.exists(os.path.join(directory, reference_directory, table + ".json")):
+                self.load_table(table, os.path.join(directory, reference_directory), verbose=verbose)
+            else:
+                self.load_table(table, directory, verbose=verbose)
 
         # Load object data
         if verbose:
             print("Loading object tables")
-        for file in tqdm(os.listdir(directory)):
+
+        # Check if the sources are in the sub-directory
+        if os.path.exists(os.path.join(directory, source_directory)):
+            directory_of_sources = os.path.join(directory, source_directory)
+        else:
+            directory_of_sources = directory
+
+        # Scan selected directory for JSON source files
+        for file in tqdm(os.listdir(directory_of_sources)):
             # Skip reference tables
             core_name = file.replace(".json", "")
             if core_name in self._reference_tables:
@@ -932,7 +973,7 @@
             if not file.endswith(".json") or file.startswith("."):
                 continue
 
-            self.load_json(os.path.join(directory, file))
+            self.load_json(os.path.join(directory_of_sources, file))
 
     def dump_sqlite(self, database_name):
         """Output database as a sqlite file"""
diff --git a/astrodbkit2/tests/test_astrodb.py b/astrodbkit2/tests/test_astrodb.py
index 40cf76a..bfa75ca 100644
--- a/astrodbkit2/tests/test_astrodb.py
+++ b/astrodbkit2/tests/test_astrodb.py
@@ -3,6 +3,7 @@
 import io
 import json
 import os
+import shutil
 
 import pandas as pd
 import pytest
@@ -413,31 +414,35 @@ def test_views(db):
 
 def test_save_reference_table(db, db_dir):
     # Test saving a reference table
-    if os.path.exists(os.path.join(db_dir, 'Publications.json')):
-        os.remove(os.path.join(db_dir, 'Publications.json'))
-    db.save_reference_table('Publications', db_dir)
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
-    os.remove(os.path.join(db_dir, 'Publications.json'))  # explicitly removing so that the next step will get verified
+    ref_dir = "reference"
+    if os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json')):
+        os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))
+    db.save_reference_table('Publications', db_dir, reference_directory=ref_dir)
+    assert os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json'))
+    os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))  # explicitly removing so that the next step will get verified
 
 
 def test_save_database(db, db_dir):
     # Test saving the database to JSON files
 
     # Clear temporary directory first
-    # if not os.path.exists(DB_DIR):
-    #     os.mkdir(DB_DIR)
     for file in os.listdir(db_dir):
-        os.remove(os.path.join(db_dir, file))
+        file_path = os.path.join(db_dir, file)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+        elif os.path.isdir(file_path):
+            shutil.rmtree(file_path)
 
     db.save_database(db_dir)
 
     # Check JSON data
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
-    assert os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398.json'))
+    assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
+    assert os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'))
     assert not os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398 2.json'))
+    assert not os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398 2.json'))
 
     # Load source and confirm it is the same
-    with open(os.path.join(db_dir, '2mass_j13571237+1428398.json'), 'r') as f:
+    with open(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'), 'r') as f:
         data = json.load(f)
     assert data == db.inventory('2MASS J13571237+1428398')
 
@@ -457,7 +462,7 @@ def test_load_database(db, db_dir):
 
     # Reload the database and check DB contents
     assert os.path.exists(db_dir)
-    assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
+    assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
     db.load_database(db_dir, verbose=True)
     assert db.query(db.Publications).count() == 2
     assert db.query(db.Photometry).count() == 3
@@ -466,7 +471,11 @@
 
     # Clear temporary directory and files
     for file in os.listdir(db_dir):
-        os.remove(os.path.join(db_dir, file))
+        file_path = os.path.join(db_dir, file)
+        if os.path.isfile(file_path):
+            os.remove(file_path)
+        elif os.path.isdir(file_path):
+            shutil.rmtree(file_path)
 
 
 def test_copy_database_schema():
diff --git a/docs/index.rst b/docs/index.rst
index 81037a3..f4981a4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -83,15 +83,18 @@ Loading the Database
 --------------------
 
 **Astrodbkit2** contains methods to output the full contents of the database as a list of JSON files.
-It can likewise read in a directory of these files to populate the database.
-This is how SIMPLE is currently version controlled. To load a database of this form, do the following::
+It can likewise read in a directory of these files to populate the database.
+By default, reference tables (e.g., Publications, Telescopes) and source tables are stored in the `reference/` and `source/` sub-directories of `data/`, respectively.
+This is how SIMPLE is currently version controlled.
+
+To load a database of this form, do the following::
 
     from astrodbkit2.astrodb import Database
 
     connection_string = 'sqlite:///SIMPLE.db'  # SQLite connection string
     db_dir = 'data'  # directory where JSON files are located
     db = Database(connection_string)
-    db.load_database(db_dir)
+    db.load_database(directory=db_dir, reference_directory="reference")
 
 .. note:: Database contents are cleared when loading from JSON files to ensure that the database
     only contains sources from on-disk files. We describe later how to use the :py:meth:`~astrodbkit2.astrodb.Database.save_db` method
@@ -406,8 +409,12 @@ Saving the Database
 ===================
 
 If users perform changes to a database, they will want to output this to disk to be version controlled.
-**Astrodbkit2** provides methods to save an individual source or reference table as well as the entire data.
-We recommend the later to output the entire contents to disk::
+**Astrodbkit2** provides methods to save an individual source or reference table as well as all of the data stored in the database.
+By default, reference tables are stored in a sub-directory of `data/` called "reference"; this can be overridden by
+supplying a `reference_directory` argument to `save_database` or `save_reference_table`.
+Similarly, source/object tables are stored in a sub-directory of `data/` called "source", which can be overridden by supplying a `source_directory` argument.
+
+We recommend using `save_database`, which outputs the entire database contents to disk::
 
     # Save single object
     db.save_json('2MASS J13571237+1428398', 'data')
 
@@ -415,8 +422,8 @@ We recommend the later to output the entire contents to disk::
 
     # Save single reference table
     db.save_reference_table('Publications', 'data')
 
-    # Save entire database to directory 'data'
-    db.save_database('data')
+    # Save entire database to directory 'data/' with 'reference/' and 'source/' sub-directories
+    db.save_database(directory='data', reference_directory='reference', source_directory='source')
 
 .. note:: To properly capture database deletions, the contents of the specified directory are first cleared before creating JSON files representing the current state of the database.
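
For reviewers, a minimal round-trip sketch of the workflow this patch introduces; the connection
string and `data/` layout follow the docs example above rather than anything in this diff, and the
sub-directory names are simply the new defaults::

    from astrodbkit2.astrodb import Database

    db = Database('sqlite:///SIMPLE.db')

    # Writes reference tables to data/reference/ and one JSON file per source to
    # data/source/; with clear_first=True (the default), existing JSON under data/
    # is removed first
    db.save_database(directory='data', reference_directory='reference', source_directory='source')

    # Reloads from the sub-directories; load_database() falls back to the top-level
    # directory when reference/ or source/ is absent, so pre-refactor dumps still load
    db.load_database(directory='data', reference_directory='reference', source_directory='source')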