Commit 2f21ae0

tons of changes

kthyng committed Feb 23, 2023
1 parent f03dd8e commit 2f21ae0
Showing 5 changed files with 270 additions and 197 deletions.
70 changes: 44 additions & 26 deletions intake_axds/axds.py
@@ -6,6 +6,7 @@
 
 from . import __version__
 from .utils import (
+    check_station,
     load_metadata,
     make_data_url,
     make_filter,
@@ -22,9 +23,9 @@ class AXDSSensorSource(base.DataSource):
     Parameters
     ----------
     internal_id : Optional[int], optional
-        Internal station id for Axiom, by default None. Not the UUID or dataset_id. Need to input internal_id or dataset_id. If both are input, be sure they are for the same station.
-    dataset_id : Optional[str], optional
-        The UUID for the station, by default None. Not the internal_id. Need to input internal_id or dataset_id. If both are input, be sure they are for the same station.
+        Internal station id for Axiom, by default None. Not the UUID. Need to input internal_id or UUID. If both are input, be sure they are for the same station.
+    uuid : Optional[str], optional
+        The UUID for the station, by default None. Not the internal_id. Need to input internal_id or UUID. If both are input, be sure they are for the same station. Note that there may also be a "datasetId" parameter which is sometimes but not always the same as the UUID.
     start_time : Optional[str], optional
         At what datetime for data to start, by default None. Must be interpretable by pandas ``Timestamp``. If not input, the datetime at which the dataset starts will be used.
     end_time : Optional[str], optional
@@ -72,7 +73,7 @@ class AXDSSensorSource(base.DataSource):
     def __init__(
         self,
         internal_id: Optional[int] = None,
-        dataset_id: Optional[str] = None,
+        uuid: Optional[str] = None,
         start_time: Optional[str] = None,
         end_time: Optional[str] = None,
         qartod: Union[int, List[int], bool] = False,
@@ -83,12 +84,12 @@ def __init__(
         only_pgids: Optional[List[int]] = None,
     ):
 
-        if internal_id is None and dataset_id is None:
+        if internal_id is None and uuid is None:
             raise ValueError(
-                "internal_id and dataset_id cannot both be None. Input one of them."
+                "internal_id and uuid cannot both be None. Input one of them."
             )
 
-        self.dataset_id = dataset_id
+        self.uuid = uuid
         self.start_time = start_time
         self.end_time = end_time
         self.internal_id = internal_id
@@ -109,31 +110,48 @@ def __init__(
 
         metadata = metadata or {}
 
+        if self.internal_id is None or self.uuid is None:
+            # uses whichever id is not None
+            url = make_search_docs_url(internal_id=self.internal_id, uuid=self.uuid)
+            result = response_from_url(url)[0]
+            assert isinstance(result, dict)  # for mypy
+            metadata.update(load_metadata("sensor_station", result))
+            self.internal_id = metadata["internal_id"]
+            self.uuid = metadata["uuid"]
+            self.search_docs_url = url
+
-        # need dataset_id to get metadata
-        if self.dataset_id is None:
-            assert self.internal_id is not None
-            res = response_from_url(make_metadata_url(make_filter(self.internal_id)))
-            assert isinstance(res, dict)
-            self.dataset_id = res["data"]["stations"][0]["uuid"]
-            metadata["version"] = res["data"]["stations"][0]["version"]
-
-        # need internal_id to get data
-        elif self.internal_id is None:
-            assert self.dataset_id is not None
-            res = response_from_url(make_search_docs_url(self.dataset_id))[0]
-            assert isinstance(res, dict)  # for mypy
-            self.internal_id = res["id"]
-            metadata["version"] = res["data"]["version"]
+        # or use UUID as another approach like if you have the dataset_id. Not sure why they are
+        # sometimes different
+        # if self.dataset_id is None or self.uuid is None:
+        #     assert self.internal_id is not None
+        #     # this works but maybe better to match in the two cases
+        #     result = response_from_url(make_metadata_url(make_filter(self.internal_id)))
+        #     assert isinstance(result, dict)
+        #     metadata.update(load_metadata("sensor_station", result))
+        #     self.dataset_id = metadata["datasetID"]
+        #     self.uuid = metadata["uuid"]
+
+        # # need internal_id to get data
+        # elif self.internal_id is None:
+        #     assert self.dataset_id is not None or self.uuid is not None
+        #     result = response_from_url(make_search_docs_url(self.dataset_id))[0]
+        #     assert isinstance(result, dict)  # for mypy
+        #     metadata.update(load_metadata("sensor_station", result))
+        #     self.internal_id = metadata["internal_id"]
+        #     self.uuid = metadata["uuid"]
+
+        # not checking for now
+        # # check station for if we want the output or not — for when source is used directly.
+        # _ = check_station(metadata, verbose=True)
 
         self._dataframe = None
 
-        metadata["dataset_id"] = self.dataset_id
 
         # this is what shows in the source if you print it
         self._captured_init_kwargs.update(
             {
                 "internal_id": self.internal_id,
-                "dataset_id": self.dataset_id,
+                "uuid": self.uuid,
                 "start_time": self.start_time,
                 "end_time": self.end_time,
                 "qartod": self.qartod,
@@ -358,7 +376,7 @@ def data_urls(self):
         if not hasattr(self, "_data_urls"):
 
             # get extended metadata which we need both for reading the data and as metadata
-            result = response_from_url(make_search_docs_url(self.dataset_id))[0]
+            result = response_from_url(make_search_docs_url(uuid=self.uuid))[0]
             self.metadata.update(load_metadata("sensor_station", result))
 
             start_time = self.start_time or self.metadata["minTime"]
@@ -374,7 +392,7 @@ def data_urls(self):
         return self._data_urls
 
     def _load(self):
-        """How to load in a specific station once you know it by dataset_id"""
+        """How to load in a specific station once you know it by uuid"""
 
         dfs = [self._load_to_dataframe(url) for url in self.data_urls]
 
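For orientation, a minimal usage sketch of the renamed constructor argument, based only on the signature and error message shown in this diff; the UUID below is a placeholder, not a real Axiom station.

from intake_axds.axds import AXDSSensorSource

# Either id is enough on its own; __init__ fills in the missing one via
# make_search_docs_url / response_from_url (a network call), so a real
# Axiom station UUID or internal id is required here, not this placeholder.
source = AXDSSensorSource(uuid="00000000-0000-0000-0000-000000000000")
# source = AXDSSensorSource(internal_id=123456)  # placeholder internal id

# Passing neither id raises the ValueError added in this commit (no network needed).
try:
    AXDSSensorSource()
except ValueError as err:
    print(err)  # internal_id and uuid cannot both be None. Input one of them.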
29 changes: 16 additions & 13 deletions intake_axds/axds_cat.py
@@ -21,6 +21,7 @@
     match_key_to_parameter,
     match_std_names_to_parameter,
     response_from_url,
+    check_station,
 )
 
 
@@ -441,20 +442,20 @@ def _load_all_results(self) -> list:
         return combined_results
 
     def _load(self):
-        """Find all dataset ids and create catalog."""
+        """Find all UUIDs and create catalog."""
 
         results = self._load_all_results()
 
         self._entries = {}
         for result in results:
-            dataset_id = result["uuid"]
+            uuid = result["uuid"]
 
             # don't repeat an entry (it won't actually allow you to, but probably saves time not to try)
-            if dataset_id in self._entries:
+            if uuid in self._entries:
                 continue
 
             if self.verbose:
-                print(f"Dataset ID: {dataset_id}")
+                print(f"Dataset ID: {uuid}")
 
             # # quick check if OPENDAP is in the access methods for this uuid, otherwise move on
             # if self.datatype == "module":
@@ -476,14 +477,12 @@ def _load(self):
             # )
             # continue
 
-            description = f"AXDS dataset_id {dataset_id} of datatype {self.datatype}"
+            description = f"AXDS dataset_id {uuid} of datatype {self.datatype}"
 
             metadata = load_metadata(self.datatype, result)
 
-            # don't save Camera sensor data for now
-            if "webcam" in metadata["variables"]:
-                if self.verbose:
-                    print(f"Dataset_id {dataset_id} is a webcam so is being skipped.")
-
+            keep_station = check_station(metadata, verbose=self.verbose)
+            if not keep_station:
                 continue
 
             # Find urlpath
@@ -508,8 +507,8 @@ def _load(self):
             # this Source has different arg requirements
             elif self.datatype == "sensor_station":
                 args = {
-                    "dataset_id": dataset_id,
                     "internal_id": metadata["internal_id"],
+                    "uuid": uuid,
                     "start_time": self.kwargs_search.get("min_time", None),
                     "end_time": self.kwargs_search.get("max_time", None),
                     # "kwargs_search": self.kwargs_search,
@@ -564,7 +563,7 @@ def _load(self):
             # urlpath = urlpaths[0]
 
             entry = LocalCatalogEntry(
-                name=dataset_id,
+                name=uuid,
                 description=description,
                 driver=plugin,
                 direct_access="allow",
@@ -582,4 +581,8 @@
 
             entry._plugin = [plugin]
 
-            self._entries[dataset_id] = entry
+            self._entries[uuid] = entry
+
+        # final tally
+        if self.verbose:
+            print(f"Final number of stations found after removing some: {len(self._entries)}.")