feature/scrape-dataframe (#52)
* scrape_dataframe method added for scraping FITS data from a dataframe

* update tests
alphasentaurii authored Apr 3, 2024
1 parent 0638f12 commit 10c8f12
Showing 4 changed files with 42 additions and 58 deletions.
2 changes: 2 additions & 0 deletions CHANGES.rst
@@ -8,6 +8,8 @@ new features

- `extractor.radio.JwstCalRadio` subclass for scraping datasets from MAST using ASN metadata [#51]

- `extractor.scrape.FitsScraper.scrape_dataframe` method added for scraping FITS data from a dataframe [#52]


1.0.1 (2024-04-03)
==================
90 changes: 33 additions & 57 deletions spacekit/extractor/scrape.py
@@ -828,6 +828,25 @@ def scrape_fits_headers(self, fpaths=None, **kwargs):
continue
return exp_headers

def scrape_dataframe(self, dnames=None, dname_col="dname"):
    """Scrapes general and science header keys for each dataset directly from
    the ``self.df`` attribute instead of from FITS files on disk. Keys not found
    in the dataframe columns are recorded as the string "NaN".

    Parameters
    ----------
    dnames : list, optional
        dataset names matching index values of ``self.df``, by default None
    dname_col : str, optional
        dataframe column containing dataset names, by default "dname"

    Returns
    -------
    dict
        general and science header key-value pairs keyed by dataset name
    """
    if dnames is None:
        dnames = list(self.df[dname_col])
    exp_headers = {}
    for name in dnames:
        try:
            data = self.df.loc[name]
            exp_headers[name] = dict()
            if self.genkeys:
                for g in self.genkeys:
                    exp_headers[name][g] = data[g] if g in self.df.columns else "NaN"
            if self.scikeys:
                for s in self.scikeys:
                    exp_headers[name][s] = data[s] if s in self.df.columns else "NaN"
        except Exception:
            # pop instead of del: the index lookup may fail before the entry
            # for this dataset is created, and del would raise a KeyError here
            exp_headers.pop(name, None)
            continue
    return exp_headers
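
A minimal usage sketch of the new method (assumptions: the scraper keeps the dataframe on ``self.df`` indexed by dataset name, and the constructor matches the ``__init__`` signature shown further down, which may not apply to ``FitsScraper`` exactly; the dataset name and values are hypothetical):

    import pandas as pd
    from spacekit.extractor.scrape import FitsScraper

    # dataframe indexed by dataset name, with header keys as columns
    df = pd.DataFrame(
        {
            "dname": ["jw01018006001_nrca1"],  # hypothetical dataset name
            "PROGRAM": ["01018"],              # a general header key
            "RA_REF": [80.3457],               # a science header key
        }
    ).set_index("dname", drop=False)

    scraper = FitsScraper("data/inputs", data=df)  # hypothetical constructor args
    exp_headers = scraper.scrape_dataframe(dname_col="dname")
    # any genkeys/scikeys missing from df.columns come back as the string "NaN"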

def find_drz_paths(self, dname_col="dataset", drzimg_col="imgname"):
"""Looks for SVM input files based on information contained in the ``self.df`` attribute.
Input paths for files are constructed using the ``dname_col`` and ``drzimg_col`` along with
@@ -929,6 +948,12 @@ def __init__(self, input_path, data=None, pfx="", sfx="_uncal.fits", **log_kws):
self.exp_headers = None

def general_header_keys(self):
    """General header key names to scrape from input exposure FITS files.

    Returns
    -------
    list
        list of key names to scrape from FITS header extension 0.
    """
return [
"PROGRAM", # Program number
"OBSERVTN", # Observation number
@@ -945,8 +970,10 @@
"FILTER", # Name of the filter element used
"PUPIL", # Name of the pupil element used
"GRATING", # Name of the grating element used (SPEC)
"FXD_SLIT", # Name of fixed slit aperture used
"EXP_TYPE", # Type of data in the exposure
"CHANNEL", # Instrument channel
"BAND", # MRS wavelength band
"SUBARRAY", # Subarray used
"NUMDTHPT", # Total number of points in pattern
"GS_RA", # guide star right ascension
@@ -956,6 +983,12 @@
]
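
For reference, the general keys above live in the primary (extension 0) header of each exposure file, and the science keys below in its science (SCI) extension header; a stand-alone astropy sketch (illustrative only, not the scraper's internal code, and the file name is hypothetical):

    from astropy.io import fits

    # primary (ext 0) header keys, using the scraper's "NaN" fallback convention
    hdr = fits.getheader("jw01018006001_nrca1_uncal.fits", ext=0)
    general = {k: hdr.get(k, "NaN") for k in ("PROGRAM", "FILTER", "FXD_SLIT", "BAND")}

    # science extension header keys
    sci = fits.getheader("jw01018006001_nrca1_uncal.fits", "SCI")
    science = {k: sci.get(k, "NaN") for k in ("RA_REF", "DEC_REF")}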

def science_header_keys(self):
    """Science header key names to scrape from the science extension headers of input exposure FITS files.

    Returns
    -------
    list
        list of key names to scrape from FITS science extension headers.
    """
return [
"RA_REF",
"DEC_REF",
@@ -1417,60 +1450,3 @@ def make_dataframe_line(self, json_filename_list):
class ImageScraper(Scraper):
def __init__(self):
super().__init__()


# def extract_archives(zipfiles, extract_to="data", delete_archive=False):
# fpaths = []
# os.makedirs(extract_to, exist_ok=True)
# for z in zipfiles:
# fname = os.path.basename(z).split(".")[0]
# fpath = os.path.join(extract_to, fname)
# with ZipFile(z, "r") as zip_ref:
# zip_ref.extractall(extract_to)
# # check just in case
# if os.path.exists(fpath):
# fpaths.append(fpath)
# if delete_archive is True:
# os.remove(z)
# return fpaths


# def unzip_images(zip_file):
# basedir = os.path.dirname(zip_file)
# key = os.path.basename(zip_file).split(".")[0]
# image_folder = os.path.join(basedir, key + "/")
# os.makedirs(image_folder, exist_ok=True)
# with ZipFile(zip_file, "r") as zip_ref:
# zip_ref.extractall(basedir)
# print(len(os.listdir(image_folder)))
# return image_folder


# def scrape_web(key, uri):
# fname = key["fname"]
# origin = f"{uri}/{fname}"
# hash = key["hash"]
# fpath = get_file(
# origin=origin,
# file_hash=hash,
# hash_algorithm="sha256", # auto
# cache_dir="~",
# cache_subdir="data",
# extract=True,
# archive_format="zip",
# )
# if os.path.exists(fpath):
# os.remove(f"data/{fname}")
# return fpath


# def get_training_data(dataset=None, uri=None):
# if uri is None:
# print("Please enter a uri.")
# return None
# keys = list(dataset.keys())
# fpaths = []
# for key in keys:
# fpath = scrape_web(key, uri)
# fpaths.append(fpath)
# return fpaths
6 changes: 6 additions & 0 deletions tests/extractor/test_scrape.py
@@ -48,8 +48,10 @@
'FILTER': 'F150W',
'PUPIL': 'CLEAR',
'GRATING': 'N/A',
'FXD_SLIT': 'NaN',
'EXP_TYPE': 'NRC_IMAGE',
'CHANNEL': 'SHORT',
'BAND': 'NaN',
'SUBARRAY': 'FULL',
'NUMDTHPT': 5,
'GS_RA': 339.0885699921705,
@@ -78,7 +80,9 @@
'PUPIL': 'N/A',
'GRATING': 'N/A',
'EXP_TYPE': 'MIR_IMAGE',
'FXD_SLIT': 'NaN',
'CHANNEL': 'N/A',
'BAND': 'NaN',
'SUBARRAY': 'FULL',
'NUMDTHPT': 2,
'GS_RA': 339.061734363325,
@@ -106,8 +110,10 @@
'FILTER': 'CLEAR',
'PUPIL': 'F150W',
'GRATING': 'N/A',
'FXD_SLIT': 'NaN',
'EXP_TYPE': 'NIS_IMAGE',
'CHANNEL': 'NONE',
'BAND': 'NaN',
'SUBARRAY': 'FULL',
'NUMDTHPT': 4,
'GS_RA': 80.3457270042323,
2 changes: 1 addition & 1 deletion tests/preprocessor/test_scrub.py
@@ -131,7 +131,7 @@ def test_jwst_cal_scrubber(jwstcal_input_path):
assert len(scrubber.imgpix) == 3
imgpix_products = list(scrubber.imgpix.keys())
for product in imgpix_products:
-        assert len(scrubber.imgpix[product].keys()) == 46
+        assert len(scrubber.imgpix[product].keys()) == 48  # +2 for the new FXD_SLIT and BAND keys
image_inputs = scrubber.scrub_inputs(exp_type="IMAGE")
assert len(image_inputs) == 3
assert list(image_inputs.columns) == JWST_SCRUBBED_COLS
