Commit 2f21ae0

tons of changes

kthyng committed Feb 23, 2023
1 parent f03dd8e commit 2f21ae0
Showing 5 changed files with 270 additions and 197 deletions.
70 changes: 44 additions & 26 deletions intake_axds/axds.py
@@ -6,6 +6,7 @@
 
 from . import __version__
 from .utils import (
+    check_station,
     load_metadata,
     make_data_url,
     make_filter,
@@ -22,9 +23,9 @@ class AXDSSensorSource(base.DataSource):
     Parameters
     ----------
     internal_id : Optional[int], optional
-        Internal station id for Axiom, by default None. Not the UUID or dataset_id. Need to input internal_id or dataset_id. If both are input, be sure they are for the same station.
-    dataset_id : Optional[str], optional
-        The UUID for the station, by default None. Not the internal_id. Need to input internal_id or dataset_id. If both are input, be sure they are for the same station.
+        Internal station id for Axiom, by default None. Not the UUID. Need to input internal_id or UUID. If both are input, be sure they are for the same station.
+    uuid : Optional[str], optional
+        The UUID for the station, by default None. Not the internal_id. Need to input internal_id or UUID. If both are input, be sure they are for the same station. Note that there may also be a "datasetId" parameter which is sometimes but not always the same as the UUID.
     start_time : Optional[str], optional
         At what datetime for data to start, by default None. Must be interpretable by pandas ``Timestamp``. If not input, the datetime at which the dataset starts will be used.
     end_time : Optional[str], optional
@@ -72,7 +73,7 @@ class AXDSSensorSource(base.DataSource):
     def __init__(
         self,
         internal_id: Optional[int] = None,
-        dataset_id: Optional[str] = None,
+        uuid: Optional[str] = None,
         start_time: Optional[str] = None,
         end_time: Optional[str] = None,
         qartod: Union[int, List[int], bool] = False,
@@ -83,12 +84,12 @@ def __init__(
         only_pgids: Optional[List[int]] = None,
     ):
 
-        if internal_id is None and dataset_id is None:
+        if internal_id is None and uuid is None:
             raise ValueError(
-                "internal_id and dataset_id cannot both be None. Input one of them."
+                "internal_id and uuid cannot both be None. Input one of them."
             )
 
-        self.dataset_id = dataset_id
+        self.uuid = uuid
         self.start_time = start_time
         self.end_time = end_time
         self.internal_id = internal_id
@@ -109,31 +110,48 @@ def __init__(
 
         metadata = metadata or {}
 
+        if self.internal_id is None or self.uuid is None:
+            # uses whichever id is not None
+            url = make_search_docs_url(internal_id=self.internal_id, uuid=self.uuid)
+            result = response_from_url(url)[0]
+            assert isinstance(result, dict)  # for mypy
+            metadata.update(load_metadata("sensor_station", result))
+            self.internal_id = metadata["internal_id"]
+            self.uuid = metadata["uuid"]
+            self.search_docs_url = url
+
-        # need dataset_id to get metadata
-        if self.dataset_id is None:
-            assert self.internal_id is not None
-            res = response_from_url(make_metadata_url(make_filter(self.internal_id)))
-            assert isinstance(res, dict)
-            self.dataset_id = res["data"]["stations"][0]["uuid"]
-            metadata["version"] = res["data"]["stations"][0]["version"]
-
-        # need internal_id to get data
-        elif self.internal_id is None:
-            assert self.dataset_id is not None
-            res = response_from_url(make_search_docs_url(self.dataset_id))[0]
-            assert isinstance(res, dict)  # for mypy
-            self.internal_id = res["id"]
-            metadata["version"] = res["data"]["version"]
+        # or use UUID as another approach like if you have the dataset_id. Not sure why they are
+        # sometimes different
+        # if self.dataset_id is None or self.uuid is None:
+        #     assert self.internal_id is not None
+        #     # this works but maybe better to match in the two cases
+        #     result = response_from_url(make_metadata_url(make_filter(self.internal_id)))
+        #     assert isinstance(result, dict)
+        #     metadata.update(load_metadata("sensor_station", result))
+        #     self.dataset_id = metadata["datasetID"]
+        #     self.uuid = metadata["uuid"]
+
+        # # need internal_id to get data
+        # elif self.internal_id is None:
+        #     assert self.dataset_id is not None or self.uuid is not None
+        #     result = response_from_url(make_search_docs_url(self.dataset_id))[0]
+        #     assert isinstance(result, dict)  # for mypy
+        #     metadata.update(load_metadata("sensor_station", result))
+        #     self.internal_id = metadata["internal_id"]
+        #     self.uuid = metadata["uuid"]
+
+        # not checking for now
+        # # check station for if we want the output or not — for when source is used directly.
+        # _ = check_station(metadata, verbose=True)
 
         self._dataframe = None
 
-        metadata["dataset_id"] = self.dataset_id
 
         # this is what shows in the source if you print it
         self._captured_init_kwargs.update(
             {
                 "internal_id": self.internal_id,
-                "dataset_id": self.dataset_id,
+                "uuid": self.uuid,
                 "start_time": self.start_time,
                 "end_time": self.end_time,
                 "qartod": self.qartod,
@@ -358,7 +376,7 @@ def data_urls(self):
         if not hasattr(self, "_data_urls"):
 
             # get extended metadata which we need both for reading the data and as metadata
-            result = response_from_url(make_search_docs_url(self.dataset_id))[0]
+            result = response_from_url(make_search_docs_url(uuid=self.uuid))[0]
             self.metadata.update(load_metadata("sensor_station", result))
 
             start_time = self.start_time or self.metadata["minTime"]
@@ -374,7 +392,7 @@ def data_urls(self):
         return self._data_urls
 
     def _load(self):
-        """How to load in a specific station once you know it by dataset_id"""
+        """How to load in a specific station once you know it by uuid"""
 
         dfs = [self._load_to_dataframe(url) for url in self.data_urls]
 
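For orientation, a minimal usage sketch of the renamed constructor argument, based only on the signature and error message shown in this diff; the UUID below is a placeholder, not a real Axiom station.

from intake_axds.axds import AXDSSensorSource

# Either id is enough on its own; __init__ fills in the missing one via
# make_search_docs_url / response_from_url (a network call), so a real
# Axiom station UUID or internal id is required here, not this placeholder.
source = AXDSSensorSource(uuid="00000000-0000-0000-0000-000000000000")
# source = AXDSSensorSource(internal_id=123456)  # placeholder internal id

# Passing neither id raises the ValueError added in this commit (no network needed).
try:
    AXDSSensorSource()
except ValueError as err:
    print(err)  # internal_id and uuid cannot both be None. Input one of them.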
29 changes: 16 additions & 13 deletions intake_axds/axds_cat.py
@@ -21,6 +21,7 @@
     match_key_to_parameter,
     match_std_names_to_parameter,
     response_from_url,
+    check_station,
 )
 
 
@@ -441,20 +442,20 @@ def _load_all_results(self) -> list:
         return combined_results
 
     def _load(self):
-        """Find all dataset ids and create catalog."""
+        """Find all UUIDs and create catalog."""
 
         results = self._load_all_results()
 
         self._entries = {}
         for result in results:
-            dataset_id = result["uuid"]
+            uuid = result["uuid"]
 
             # don't repeat an entry (it won't actually allow you to, but probably saves time not to try)
-            if dataset_id in self._entries:
+            if uuid in self._entries:
                 continue
 
             if self.verbose:
-                print(f"Dataset ID: {dataset_id}")
+                print(f"Dataset ID: {uuid}")
 
             # # quick check if OPENDAP is in the access methods for this uuid, otherwise move on
             # if self.datatype == "module":
@@ -476,14 +477,12 @@ def _load(self):
             # )
             # continue
 
-            description = f"AXDS dataset_id {dataset_id} of datatype {self.datatype}"
+            description = f"AXDS dataset_id {uuid} of datatype {self.datatype}"
 
             metadata = load_metadata(self.datatype, result)
 
-            # don't save Camera sensor data for now
-            if "webcam" in metadata["variables"]:
-                if self.verbose:
-                    print(f"Dataset_id {dataset_id} is a webcam so is being skipped.")
-
+            keep_station = check_station(metadata, verbose=self.verbose)
+            if not keep_station:
                 continue
 
             # Find urlpath
@@ -508,8 +507,8 @@ def _load(self):
             # this Source has different arg requirements
             elif self.datatype == "sensor_station":
                 args = {
-                    "dataset_id": dataset_id,
                     "internal_id": metadata["internal_id"],
+                    "uuid": uuid,
                     "start_time": self.kwargs_search.get("min_time", None),
                     "end_time": self.kwargs_search.get("max_time", None),
                     # "kwargs_search": self.kwargs_search,
@@ -564,7 +563,7 @@ def _load(self):
             # urlpath = urlpaths[0]
 
             entry = LocalCatalogEntry(
-                name=dataset_id,
+                name=uuid,
                 description=description,
                 driver=plugin,
                 direct_access="allow",
@@ -582,4 +581,8 @@
 
             entry._plugin = [plugin]
 
-            self._entries[dataset_id] = entry
+            self._entries[uuid] = entry
+
+        # final tally
+        if self.verbose:
+            print(f"Final number of stations found after removing some: {len(self._entries)}.")