Refactor to have references in separate sub-directory; closes #49 (#77)
* First pass at having references in separate directory

* Iterating on reference directory use

* Using shutil to fully remove data directory and any sub-directories

* Safer use of shutil for reference directory

* Minor updates

* Updating documentation

* Saving source JSON files to source sub-directory

* Updating documentation

* Further updates

* Apply suggestions from code review

Co-authored-by: Kelle Cruz <[email protected]>

* Print out path when saving source and reference tables

---------

Co-authored-by: Kelle Cruz <[email protected]>
dr-rodriguez and kelle authored Aug 2, 2024
1 parent a178170 commit 4043a64
Showing 3 changed files with 90 additions and 33 deletions.
67 changes: 54 additions & 13 deletions astrodbkit2/astrodb.py
@@ -5,6 +5,7 @@
import json
import os
import sqlite3
+ import shutil

import numpy as np
import pandas as pd
@@ -736,25 +737,32 @@ def save_json(self, name, directory):
with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
f.write(json.dumps(data, indent=4, default=json_serializer))

- def save_reference_table(self, table, directory):
+ def save_reference_table(self, table: str, directory: str, reference_directory: str="reference"):
"""
Save the reference table to disk
Parameters
----------
table : str
Name of reference table to output
directory : str
Name of directory in which to save the output JSON
+ reference_directory : str
+ Name of sub-directory to use for reference JSON files (eg, data/reference)
"""

+ # Create directory if not already present
+ if not os.path.isdir(os.path.join(directory, reference_directory)):
+ os.makedirs(os.path.join(directory, reference_directory))

results = self.session.query(self.metadata.tables[table]).all()
data = [row._asdict() for row in results]
filename = table + ".json"
if len(data) > 0:
- with open(os.path.join(directory, filename), "w", encoding="utf-8") as f:
+ with open(os.path.join(directory, reference_directory, filename), "w", encoding="utf-8") as f:
f.write(json.dumps(data, indent=4, default=json_serializer))

- def save_database(self, directory, clear_first=True):
+ def save_database(self, directory: str, clear_first: bool=True, reference_directory: str="reference", source_directory: str="source"):
"""
Output contents of the database into the specified directory as JSON files.
Source objects have individual JSON files with all data for that object.
@@ -763,28 +771,45 @@
Parameters
----------
directory : str
- Name of directory in which to save the output JSON
+ Name of top-level directory in which to save the output JSON
clear_first : bool
First clear the directory of all existing JSON (useful to capture DB deletions). Default: True
+ reference_directory : str
+ Name of sub-directory to use for reference JSON files (eg, data/reference)
+ source_directory : str
+ Name of sub-directory to use for source JSON files (eg, data/source)
"""

# Clear existing files first from that directory
if clear_first:
print("Clearing existing JSON files...")
- for filename in os.listdir(directory):
- os.remove(os.path.join(directory, filename))
+ for file in os.listdir(directory):
+ file_path = os.path.join(directory, file)
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+ elif os.path.isdir(file_path):
+ # This is to handle the reference and source directories
+ shutil.rmtree(file_path)

+ # Create sub-directories if not already present
+ if not os.path.isdir(os.path.join(directory, reference_directory)):
+ os.makedirs(os.path.join(directory, reference_directory))
+ if not os.path.isdir(os.path.join(directory, source_directory)):
+ os.makedirs(os.path.join(directory, source_directory))

# Output reference tables
print(f"Storing reference tables to {os.path.join(directory, reference_directory)}...")
for table in self._reference_tables:
# Skip reference tables that are not actually in the database
if table not in self.metadata.tables.keys():
continue

- self.save_reference_table(table, directory)
+ self.save_reference_table(table, directory, reference_directory=reference_directory)

# Output primary objects
print(f"Storing individual sources to {os.path.join(directory, source_directory)}...")
for row in tqdm(self.query(self.metadata.tables[self._primary_table])):
- self.save_json(row, directory)
+ self.save_json(row, os.path.join(directory, source_directory))

# Object input methods
def add_table_data(self, data, table, fmt="csv"):
@@ -892,17 +917,21 @@ def load_json(self, filename):
temp_dict[self._foreign_key] = source
conn.execute(self.metadata.tables[key].insert().values(temp_dict))

- def load_database(self, directory, verbose=False):
+ def load_database(self, directory: str, verbose: bool=False, reference_directory: str="reference", source_directory: str="source"):
"""
Reload entire database from a directory of JSON files.
Note that this will first clear existing tables.
Parameters
----------
directory : str
- Name of directory containing the JSON files
+ Name of top-level directory containing the JSON files
verbose : bool
Flag to enable diagnostic messages
+ reference_directory : str
+ Relative path to sub-directory to use for reference JSON files (eg, data/reference)
+ source_directory : str
+ Relative path to sub-directory to use for source JSON files (eg, data/source)
"""

# Clear existing database contents
@@ -917,12 +946,24 @@
for table in self._reference_tables:
if verbose:
print(f"Loading {table} table")
- self.load_table(table, directory, verbose=verbose)
+ # Check if the reference table is in the sub-directory
+ if os.path.exists(os.path.join(directory, reference_directory, table+".json")):
+ self.load_table(table, os.path.join(directory, reference_directory), verbose=verbose)
+ else:
+ self.load_table(table, directory, verbose=verbose)

# Load object data
if verbose:
print("Loading object tables")
- for file in tqdm(os.listdir(directory)):

+ # Check if the sources are in the sub-directory
+ if os.path.exists(os.path.join(directory, source_directory)):
+ directory_of_sources = os.path.join(directory, source_directory)
+ else:
+ directory_of_sources = directory

+ # Scan selected directory for JSON source files
+ for file in tqdm(os.listdir(directory_of_sources)):
# Skip reference tables
core_name = file.replace(".json", "")
if core_name in self._reference_tables:
@@ -932,7 +973,7 @@
if not file.endswith(".json") or file.startswith("."):
continue

- self.load_json(os.path.join(directory, file))
+ self.load_json(os.path.join(directory_of_sources, file))

def dump_sqlite(self, database_name):
"""Output database as a sqlite file"""
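Taken together, the astrodb.py changes can be exercised like this (a minimal sketch; the connection string and directory names are illustrative defaults, not taken from the diff):

    from astrodbkit2.astrodb import Database

    # Hypothetical SQLite database; any SQLAlchemy connection string works here.
    db = Database("sqlite:///SIMPLE.db")

    # Writes reference tables to data/reference/ and one JSON file per source to
    # data/source/, first clearing existing JSON files and sub-directories (clear_first=True).
    db.save_database(directory="data", reference_directory="reference", source_directory="source")

    # Reads from the sub-directories when they exist and falls back to the
    # top-level directory otherwise, so flat pre-refactor layouts still load.
    db.load_database(directory="data", reference_directory="reference", source_directory="source")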
35 changes: 22 additions & 13 deletions astrodbkit2/tests/test_astrodb.py
@@ -3,6 +3,7 @@
import io
import json
import os
+ import shutil

import pandas as pd
import pytest
@@ -413,31 +414,35 @@ def test_views(db):

def test_save_reference_table(db, db_dir):
# Test saving a reference table
- if os.path.exists(os.path.join(db_dir, 'Publications.json')):
- os.remove(os.path.join(db_dir, 'Publications.json'))
- db.save_reference_table('Publications', db_dir)
- assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
- os.remove(os.path.join(db_dir, 'Publications.json')) # explicitly removing so that the next step will get verified
+ ref_dir = "reference"
+ if os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json')):
+ os.remove(os.path.join(db_dir, ref_dir, 'Publications.json'))
+ db.save_reference_table('Publications', db_dir, reference_directory=ref_dir)
+ assert os.path.exists(os.path.join(db_dir, ref_dir, 'Publications.json'))
+ os.remove(os.path.join(db_dir, ref_dir, 'Publications.json')) # explicitly removing so that the next step will get verified


def test_save_database(db, db_dir):
# Test saving the database to JSON files

# Clear temporary directory first
# if not os.path.exists(DB_DIR):
# os.mkdir(DB_DIR)
for file in os.listdir(db_dir):
- os.remove(os.path.join(db_dir, file))
+ file_path = os.path.join(db_dir, file)
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+ elif os.path.isdir(file_path):
+ shutil.rmtree(file_path)

db.save_database(db_dir)

# Check JSON data
- assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
- assert os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398.json'))
+ assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
+ assert os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'))
- assert not os.path.exists(os.path.join(db_dir, '2mass_j13571237+1428398 2.json'))
+ assert not os.path.exists(os.path.join(db_dir, "source", '2mass_j13571237+1428398 2.json'))

# Load source and confirm it is the same
- with open(os.path.join(db_dir, '2mass_j13571237+1428398.json'), 'r') as f:
+ with open(os.path.join(db_dir, "source", '2mass_j13571237+1428398.json'), 'r') as f:
data = json.load(f)
assert data == db.inventory('2MASS J13571237+1428398')

@@ -457,7 +462,7 @@ def test_load_database(db, db_dir):

# Reload the database and check DB contents
assert os.path.exists(db_dir)
- assert os.path.exists(os.path.join(db_dir, 'Publications.json'))
+ assert os.path.exists(os.path.join(db_dir, "reference", 'Publications.json'))
db.load_database(db_dir, verbose=True)
assert db.query(db.Publications).count() == 2
assert db.query(db.Photometry).count() == 3
@@ -466,7 +471,11 @@

# Clear temporary directory and files
for file in os.listdir(db_dir):
- os.remove(os.path.join(db_dir, file))
+ file_path = os.path.join(db_dir, file)
+ if os.path.isfile(file_path):
+ os.remove(file_path)
+ elif os.path.isdir(file_path):
+ shutil.rmtree(file_path)


def test_copy_database_schema():
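In the same spirit as the tests above, a save/load roundtrip could be checked as follows (a sketch only, not part of this commit; it reuses the db and db_dir fixtures and the counts asserted in test_load_database):

    def test_save_load_roundtrip(db, db_dir):
        # Hypothetical test: save with the new sub-directory layout, then reload from it.
        db.save_database(db_dir)
        assert os.path.exists(os.path.join(db_dir, "reference", "Publications.json"))
        assert os.path.exists(os.path.join(db_dir, "source"))
        db.load_database(db_dir)
        assert db.query(db.Publications).count() == 2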
21 changes: 14 additions & 7 deletions docs/index.rst
@@ -83,15 +83,18 @@ Loading the Database
--------------------

**Astrodbkit2** contains methods to output the full contents of the database as a list of JSON files.
- It can likewise read in a directory of these files to populate the database.
- This is how SIMPLE is currently version controlled. To load a database of this form, do the following::
+ It can likewise read in a directory of these files to populate the database.
+ By default, reference tables (eg, Publications, Telescopes, etc) and source tables are respectively stored in `reference/` and `source/` sub-directories of `data/`.
+ This is how SIMPLE is currently version controlled.

+ To load a database of this form, do the following::

from astrodbkit2.astrodb import Database

connection_string = 'sqlite:///SIMPLE.db' # SQLite connection string
db_dir = 'data' # directory where JSON files are located
db = Database(connection_string)
- db.load_database(db_dir)
+ db.load_database(directory=db_dir, reference_directory="reference")

.. note:: Database contents are cleared when loading from JSON files to ensure that the database only contains
sources from on-disk files. We describe later how to use the :py:meth:`~astrodbkit2.astrodb.Database.save_db` method
@@ -406,17 +409,21 @@ Saving the Database
===================

If users perform changes to a database, they will want to output this to disk to be version controlled.
- **Astrodbkit2** provides methods to save an individual source or reference table as well as the entire data.
- We recommend the later to output the entire contents to disk::
+ **Astrodbkit2** provides methods to save an individual source or reference table as well as all of the data stored in the database.
+ By default, reference tables are stored in a sub-directory of `data/` called "reference"; this can be overwritten by
+ supplying a `reference_directory` variable into `save_database` or `save_reference_table`.
+ Similarly, source/object tables are stored in a sub-directory of `data/` called "source" which can be overwritten by supplying a `source_directory` variable.

+ We recommend using `save_database` as that outputs the entire database contents to disk::

# Save single object
db.save_json('2MASS J13571237+1428398', 'data')

# Save single reference table
db.save_reference_table('Publications', 'data')

- # Save entire database to directory 'data'
- db.save_database('data')
+ # Save entire database to directory 'data/' with 'reference/' and 'source/' subdirectories.
+ db.save_database(directory='data', reference_directory='reference', source_directory='source')

.. note:: To properly capture database deletes, the contents of the specified directory is first cleared before
creating JSON files representing the current state of the database.
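For orientation, the default layout produced by save_database('data') looks like this (file names are illustrative, borrowed from the tests)::

    data/
    ├── reference/
    │   ├── Publications.json
    │   └── Telescopes.json
    └── source/
        └── 2mass_j13571237+1428398.json

Because load_database checks for these sub-directories and falls back to the top-level directory when they are absent, data directories saved with the older flat layout continue to load unchanged.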
