Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enchancing Metadata Support for SOAR #118

Merged
merged 24 commits into from
Jul 6, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
122e964
detector added
NucleonGodX May 27, 2024
0e3307b
test fixes/minor fixes
NucleonGodX May 28, 2024
70c4c32
added tests for detector and joins / combined join operations in a si…
NucleonGodX May 29, 2024
a7f7102
detector tests added, useful comments added to contruct methods
NucleonGodX May 30, 2024
3d15042
suggestions applied
NucleonGodX May 31, 2024
4748035
cleaned the code, all suggestions applied
NucleonGodX May 31, 2024
f5ac3ec
comment added for join condition
NucleonGodX Jun 1, 2024
8496fed
pre-commit fix
NucleonGodX Jun 1, 2024
f4340d8
code cleanup
NucleonGodX Jun 2, 2024
074bb5c
code cleanup
NucleonGodX Jun 2, 2024
f9a789c
wavelength added with testcases
NucleonGodX Jun 10, 2024
cec907d
updated changelog
NucleonGodX Jun 13, 2024
1873850
test case updated
NucleonGodX Jun 14, 2024
76e1900
Apply suggestions from code review
NucleonGodX Jun 20, 2024
d920659
suggestions applied
NucleonGodX Jun 20, 2024
d1b4510
Merge branch 'main' into gsoc24-soar
NucleonGodX Jun 20, 2024
6fc6921
test cases simplified
NucleonGodX Jun 22, 2024
f9352f1
Merge branch 'main' into gsoc24-soar
NucleonGodX Jun 27, 2024
52366cc
Update sunpy_soar/client.py
NucleonGodX Jun 27, 2024
6b838c3
tests updated
NucleonGodX Jun 27, 2024
183c2d8
phi and it's tests removed
NucleonGodX Jul 4, 2024
645aca5
removed low_latency detector test
NucleonGodX Jul 4, 2024
bd8bc31
code cleanup, h2. prefix removed from attrs file
NucleonGodX Jul 6, 2024
616f96b
Merge branch 'main' into gsoc24-soar
NucleonGodX Jul 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/118.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added support for ``detector`` and ``wavelength`` search attributes and the ability to filter with them.
12 changes: 12 additions & 0 deletions sunpy_soar/attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,15 @@ def _(wlk, attr, params): # NOQA: ARG001
@walker.add_applier(SOOP)
def _(wlk, attr, params): # NOQA: ARG001
params.append(f"soop_name='{attr.value}'")


@walker.add_applier(a.Detector)
def _(wlk, attr, params): # NOQA: ARG001
params.append(f"Detector='{attr.value}'")


@walker.add_applier(a.Wavelength)
def _(wlk, attr, params): # NOQA: ARG001
wavemin = attr.min.value
wavemax = attr.max.value
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
params.append(f"Wavemin='{wavemin}'+AND+h2.Wavemax='{wavemax}'")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm missing something, but isn't wavemin from h2 too?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if it is just parsed as a pattern in client.py, shouldn't it be consistent with wavemax?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think as we are giving both wavemin and wavemax as a single parameter for query, I think its one of the simpler ways to implement it. If its still necessary to try something else, let me know :)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it was just a thought about consistency between Wavemin and Wavemax.

144 changes: 125 additions & 19 deletions sunpy_soar/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import pathlib
import re

import astropy.table
import astropy.units as u
Expand Down Expand Up @@ -35,35 +36,132 @@ def search(self, *query, **kwargs): # NOQA: ARG002
qrt.hide_keys = ["Data item ID", "Filename"]
return qrt

def add_join_to_query(query: list[str], data_table: str, instrument_table: str):
"""
Construct the WHERE, FROM, and SELECT parts of the ADQL query.
nabobalis marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
query : list[str]
List of query items.
data_table : str
Name of the data table.
instrument_table : str
Name of the instrument table.

Returns
-------
tuple[str, str, str]
WHERE, FROM, and SELECT parts of the query.
"""
final_query = ""
# Extract wavemin and wavemax individually
wavemin_pattern = re.compile(r"Wavemin='(\d+\.\d+)'")
wavemax_pattern = re.compile(r"h2.Wavemax='(\d+\.\d+)'")
for parameter in query:
wavemin_match = wavemin_pattern.search(parameter)
wavemax_match = wavemax_pattern.search(parameter)
# If the wavemin and wavemax are same that means only one wavelength is given in query.
if wavemin_match and wavemax_match and float(wavemin_match.group(1)) == float(wavemax_match.group(1)):
# For PHI, we specify the wavemin as a parameter.
# This enables us to retrieve columns containing wavemin and their corresponding wavemax values.
# For SPICE we get data in form of wavemin/wavemax columns, but only for the first spectral window.
# Therefore, to make sure this data is not misleading to the user we do not return any values.
if "phi" in instrument_table:
parameter = f"Wavemin='{wavemin_match.group(1)}'"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment we don't consider wavelength for PHI and SPICE (see this comment).

# For other instruments, we provide the wavemin value to the Wavelength column.
# This gives us the Wavelength column in the output table.
else:
parameter = f"Wavelength='{wavemin_match.group(1)}'"
prefix = (
"h1."
if not parameter.startswith("Detector")
and not parameter.startswith("Wavelength")
and not parameter.startswith("Wavemin")
else "h2."
)
if parameter.startswith("begin_time"):
time_list = parameter.split("+AND+")
begin_start = f"h1.{time_list[0]}"
begin_end = f"h1.{time_list[1]}"
final_query += f"{begin_start}+AND+{begin_end}+AND+"
hayesla marked this conversation as resolved.
Show resolved Hide resolved
# As there are no dimensions in STIX, the dimension index need not be included in the query for STIX.
if "stx" not in instrument_table:
# To avoid duplicate rows in the output table, the dimension index is set to 1.
final_query += "h2.dimension_index='1'+AND+"
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
else:
final_query += f"{prefix}{parameter}+AND+"

where_part = final_query[:-5]
from_part = f"{data_table} AS h1"
select_part = (
"h1.instrument, h1.descriptor, h1.level, h1.begin_time, h1.end_time, "
"h1.data_item_id, h1.filesize, h1.filename, h1.soop_name"
)
if instrument_table:
from_part += f" JOIN {instrument_table} AS h2 USING (data_item_oid)"
# For EUI, METIS and SOLOHI, we query on basis of wavelength.
if "phi" not in instrument_table:
select_part += ", h2.detector, h2.wavelength, h2.dimension_index"
# For PHI, we query on basis of wavemin and wavemax.
else:
select_part += ", h2.detector, h2.wavemin, h2.wavemax, h2.dimension_index"
return where_part, from_part, select_part

@staticmethod
def _construct_payload(query):
"""
Construct search payload.

Parameters
----------
payload : dict[str]
Payload to send to the TAP server.
query : list[str]
List of query items.

Returns
-------
dict
Payload dictionary to be sent with the query.
"""
# Construct ADQL query
url_query = {}
url_query["SELECT"] = "*"
# Assume science data by default
url_query["FROM"] = "v_sc_data_item"
# Default data table
data_table = "v_sc_data_item"
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
instrument_table = None
# Mapping is established between the SOAR instrument names and its corresponding SOAR instrument table alias.
instrument_mapping = {
"SOLOHI": "SHI",
"EUI": "EUI",
"STIX": "STX",
"SPICE": "SPI",
"PHI": "PHI",
"METIS": "MET",
}

instrument_name = None
for q in query:
if q.startswith("level") and q.split("=")[1][1:3] == "LL":
# Low latency data
url_query["FROM"] = "v_ll_data_item"
if q.startswith("instrument") or q.startswith("descriptor") and not instrument_name:
instrument_name = q.split("=")[1][1:-1].split("-")[0].upper()
elif q.startswith("level") and q.split("=")[1][1:3] == "LL":
data_table = "v_ll_data_item"

url_query["WHERE"] = "+AND+".join(query)
adql_query = "+".join([f"{item}+{url_query[item]}" for item in url_query])
if instrument_name:
if instrument_name in instrument_mapping:
instrument_name = instrument_mapping[instrument_name]
instrument_table = f"v_{instrument_name.lower()}_sc_fits"
if data_table == "v_ll_data_item" and instrument_table:
instrument_table = instrument_table.replace("_sc_", "_ll_")

return {
"REQUEST": "doQuery",
"LANG": "ADQL",
"FORMAT": "json",
"QUERY": adql_query,
}
# Need to establish join for remote sensing instruments as they have instrument tables in SOAR.
if instrument_name in ["EUI", "STX", "MET", "SPI", "PHI", "SHI"]:
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
NucleonGodX marked this conversation as resolved.
Show resolved Hide resolved
where_part, from_part, select_part = SOARClient.add_join_to_query(query, data_table, instrument_table)
else:
from_part = data_table
select_part = "*"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need * here, as we select not all columns when a join is needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for instruments where we don't need the join.

where_part = "+AND+".join(query)

adql_query = {"SELECT": select_part, "FROM": from_part, "WHERE": where_part}

adql_query_str = "+".join([f"{key}+{value}" for key, value in adql_query.items()])
hayesla marked this conversation as resolved.
Show resolved Hide resolved
return {"REQUEST": "doQuery", "LANG": "ADQL", "FORMAT": "json", "QUERY": adql_query_str}

@staticmethod
def _do_search(query):
Expand Down Expand Up @@ -92,6 +190,7 @@ def _do_search(query):
# Do some list/dict wrangling
names = [m["name"] for m in r.json()["metadata"]]
info = {name: [] for name in names}

for entry in r.json()["data"]:
for i, name in enumerate(names):
info[name].append(entry[i])
Expand All @@ -113,6 +212,13 @@ def _do_search(query):
"SOOP Name": info["soop_name"],
},
)
if "detector" in info:
result_table["Detector"] = info["detector"]
if "wavelength" in info:
result_table["Wavelength"] = info["wavelength"]
if "wavemin" in info:
result_table["Wavemin"] = info["wavemin"]
result_table["Wavemax"] = info["wavemax"]
result_table.sort("Start time")
return result_table

Expand Down Expand Up @@ -160,7 +266,7 @@ def _can_handle_query(cls, *query):
True if this client can handle the given query.
"""
required = {a.Time}
optional = {a.Instrument, a.Level, a.Provider, Product, SOOP}
optional = {a.Instrument, a.Detector, a.Wavelength, a.Level, a.Provider, Product, SOOP}
if not cls.check_attr_types_in_query(query, required, optional):
return False
# check to make sure the instrument attr passed is one provided by the SOAR.
Expand Down
156 changes: 156 additions & 0 deletions sunpy_soar/tests/test_sunpy_soar.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,159 @@ def test_when_wrong_provider_passed():
provider = a.Provider.noaa
res = Fido.search(time & instrument & provider)
assert len(res) == 0


def test_search_detector_instrument_dimension_0():
# Since STIX data has no dimensions, there is no detector information available in the SOAR data.
# Therefore, the search results returned by the SOARClient will contain "None" as the value for the
# "Detector" column, indicating that no detector information is available for the given data.
instrument = a.Instrument("STIX")
time = a.Time("2023-03-03 00:00", "2023-03-03 23:59")
hayesla marked this conversation as resolved.
Show resolved Hide resolved
level = a.Level(1)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also a test for LL data?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there is test_search_low_latency() (for MAG data; executes the query), and the new test_join_low_latency_query() (for EUI; only tests the query string, without executing it). So maybe this is enough?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, will reverse this change!

res = Fido.search(instrument & time & level)
assert "Detector" in res[0].columns
assert res.file_num >= 35


def test_search_detector_instrument_dimension_2():
# Instruments "EUI","METIS","PHI" and "SOLOHI" have two dimensions in the SOAR data.
# Selecting no dimension index in the query results in two identical output rows.
# To avoid repeating data, we have methods to take dimension index=1, which avoids any repetition.
instrument = a.Instrument("EUI")
time = a.Time("2020-03-03", "2020-03-04")
level = a.Level(1)
detector = a.Detector("HRI_EUV")
res = Fido.search(instrument & time & level & detector)
assert "Detector" in res[0].columns
assert res.file_num == 266


def test_search_detector_instrument_dimension_4():
# The "SPICE" instrument has four dimensions in the SOAR data. As a result,
# selecting no dimension index in the query results in four identical output rows.
# To avoid repeating data, we have methods to take dimension index=1, which avoids any repetition.
instrument = a.Instrument("SPICE")
time = a.Time("2023-03-03 15:00", "2023-03-03 16:00")
level = a.Level(1)
detector = a.Detector("SW")
res = Fido.search(instrument & time & level & detector)
assert "Detector" in res[0].columns
assert res.file_num == 11


def test_invalid_detector():
instrument = a.Instrument("SPICE")
time = a.Time("2023-03-03 15:00", "2023-03-03 16:00")
level = a.Level(1)
detector = a.Detector("hello")
res = Fido.search(instrument & time & level & detector)
assert "Detector" in res[0].columns
assert res.file_num == 0


def test_wavelength_column_wavelength_exists():
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
# For instruments EUI, METIS and SOLOHI "wavelength" column is available.
# Test to check if the "Wavelength" column exists in the search results.
instrument = a.Instrument("EUI")
time = a.Time("2023-04-03 15:00", "2023-04-03 16:00")
level = a.Level(1)
wavelength = a.Wavelength(304 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
assert "Wavelength" in res[0].columns
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
assert res.file_num == 12


def test_wavelength_column_wavelength_single():
# Test to check if the wavelength value is filtered for a single value provided.
instrument = a.Instrument("EUI")
time = a.Time("2023-04-03 15:00", "2023-04-03 16:00")
level = a.Level(1)
wavelength = a.Wavelength(304 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
for table in res:
assert all(table["Wavelength"] == 304)


def test_wavelength_column_wavelength_range():
# Test to check if the wavelength value is filtered for wavemin and wavemax provided.
instrument = a.Instrument("EUI")
time = a.Time("2023-04-03 15:00", "2023-04-03 16:00")
level = a.Level(1)
wavelength = a.Wavelength(171 * u.AA, 185 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
for table in res:
assert all(table["Wavelength"] == 174)


def test_wavelength_column_wavemin_wavemax_exists():
# For Instrument PHI "Wavemin" and "Wavemax" columns are available.
# Test to check if the "wavemin" and "wavemax" columns exists in the search results.
instrument = a.Instrument("PHI")
time = a.Time("2023-02-01", "2023-02-02")
level = a.Level(2)
wavelength = a.Wavelength(6173.065 * u.AA, 6173.501 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
assert "Wavemin" in res[0].columns
nabobalis marked this conversation as resolved.
Show resolved Hide resolved
assert "Wavemax" in res[0].columns
assert res.file_num == 2


def test_wavelength_column_wavemin_wavemax_range():
# Test to check if the wavemin and wavemax value is filtered for a range of wavelength provided.
instrument = a.Instrument("PHI")
time = a.Time("2023-02-01", "2023-02-02")
level = a.Level(2)
wavelength = a.Wavelength(6173.065 * u.AA, 6173.501 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
for table in res:
assert all(table["Wavemin"] == 6173.065)
assert all(table["Wavemax"] == 6173.501)


def test_wavelength_column_wavemin_wavemax_single():
# Test to check if the wavemin and wavemax value is filtered for a value of wavemin provided.
instrument = a.Instrument("PHI")
time = a.Time("2023-02-01", "2023-02-02")
level = a.Level(2)
wavelength = a.Wavelength(6173.065 * u.AA)
res = Fido.search(instrument & time & level & wavelength)
for table in res:
assert all(table["Wavemin"] == 6173.065)


def test_join_science_query():
result = SOARClient._construct_payload( # NOQA: SLF001
[
"instrument='EUI'",
"begin_time>='2021-02-01+00:00:00'+AND+begin_time<='2021-02-02+00:00:00'",
"level='L1'",
"descriptor='eui-fsi174-image'",
]
)

assert result["QUERY"] == (
"SELECT+h1.instrument, h1.descriptor, h1.level, h1.begin_time, h1.end_time, "
"h1.data_item_id, h1.filesize, h1.filename, h1.soop_name, h2.detector, h2.wavelength, "
"h2.dimension_index+FROM+v_sc_data_item AS h1 JOIN v_eui_sc_fits AS h2 USING (data_item_oid)"
"+WHERE+h1.instrument='EUI'+AND+h1.begin_time>='2021-02-01+00:00:00'+AND+h1.begin_time<='2021-02-02+00:00:00'"
"+AND+h2.dimension_index='1'+AND+h1.level='L1'+AND+h1.descriptor='eui-fsi174-image'"
)


def test_join_low_latency_query():
result = SOARClient._construct_payload( # NOQA: SLF001
[
"instrument='EUI'",
"begin_time>='2021-02-01+00:00:00'+AND+begin_time<='2021-02-02+00:00:00'",
"level='LL01'",
"descriptor='eui-fsi174-image'",
]
)

assert result["QUERY"] == (
"SELECT+h1.instrument, h1.descriptor, h1.level, h1.begin_time, h1.end_time, "
"h1.data_item_id, h1.filesize, h1.filename, h1.soop_name, h2.detector, h2.wavelength, "
"h2.dimension_index+FROM+v_ll_data_item AS h1 JOIN v_eui_ll_fits AS h2 USING (data_item_oid)"
"+WHERE+h1.instrument='EUI'+AND+h1.begin_time>='2021-02-01+00:00:00'+AND+h1.begin_time<='2021-02-02+00:00:00'"
"+AND+h2.dimension_index='1'+AND+h1.level='LL01'+AND+h1.descriptor='eui-fsi174-image'"
)