Skip to content

Commit

Permalink
Merge pull request #12 from tgrandje/dev
Browse files Browse the repository at this point in the history
Add superficial waterbodies quality coverage
  • Loading branch information
tgrandje authored Oct 1, 2024
2 parents 1b5dd10 + 9db2080 commit 3001aa4
Show file tree
Hide file tree
Showing 17 changed files with 2,435 additions and 348 deletions.
81 changes: 74 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ due time.
At this stage, the following APIs are covered by cl-hubeau:
* [piezometry/piézométrie](https://hubeau.eaufrance.fr/page/api-piezometrie)
* [hydrometry/hydrométrie](https://hubeau.eaufrance.fr/page/api-hydrometrie)
* [drinking water quality/qualité de l'eau potable](https://hubeau.eaufrance.fr/page/api-qualite-eau-potable#/qualite_eau_potable/communes)
* [drinking water quality/qualité de l'eau potable](https://hubeau.eaufrance.fr/page/api-qualite-eau-potable)
* [superficial waterbodies quality/qualité physico-chimique des cours d'eau](https://hubeau.eaufrance.fr/page/api-qualite-cours-deau)

For any help on available kwargs for each endpoint, please refer
directly to the documentation on hubeau (this will not be covered
Expand Down Expand Up @@ -159,11 +160,6 @@ Note that this query is heavy, even if this was already restricted to nitrates.
In theory, you could also query the API without specifying the substance you're tracking,
but you may hit the 20k threshold and trigger an exception.

As it is, the `get_control_results` function already implements a double loop:

* on networks' codes (20 codes maximum) ;
* on periods, requesting only yearly datasets (which should be scalable over time **and** should work nicely with the cache algorithm).

You can also call the same function, using official city codes directly:
```python
df = drinking_water_quality.get_control_results(
Expand All @@ -185,4 +181,75 @@ with drinking_water_quality.DrinkingWaterQualitySession() as session:
df = session.get_cities_networks(nom_commune="LILLE")
df = session.get_control_results(code_departement='02', code_parametre="1340")

```
```

### Superficial waterbodies quality

4 high-level functions are available (and one class for low-level operations).


Get all stations (uses a 30-day cache):

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_stations()
```

Get all operations (uses a 30-day cache):

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_operations()
```

Note that this query is heavy, so users should restrict it to a given territory.
For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_operations(code_region="11")
```

Get all environmental conditions:

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_environmental_conditions()
```

Note that this query is heavy, so users should restrict it to a given territory.
For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_environmental_conditions(code_region="11")
```

Get all physicochemical analyses:
```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_analysis()
```

Note that this query is heavy, so users should restrict it to a given territory
and given parameters. For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_analysis(
code_departement="59",
code_parametre="1313"
)
```


Low-level class to perform the same tasks:


Note that:

* the API forbids results of more than 20k rows, so you may need inner loops
* cache handling will be your responsibility

```python
with superficial_waterbodies_quality.SuperficialWaterbodiesQualitySession() as session:
df = session.get_stations(code_commune="59183")
df = session.get_operations(code_commune="59183")
df = session.get_environmental_conditions(code_commune="59183")
df = session.get_analysis(code_commune='59183', code_parametre="1340")

```
59 changes: 17 additions & 42 deletions cl_hubeau/drinking_water_quality/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Convenience functions for hydrometry consumption
"""

from datetime import date, datetime
from datetime import date
from itertools import product

import pandas as pd
Expand All @@ -13,7 +13,7 @@

from cl_hubeau.drinking_water_quality import DrinkingWaterQualitySession
from cl_hubeau import _config
from cl_hubeau.utils import get_cities
from cl_hubeau.utils import get_cities, prepare_kwargs_loops


def get_all_water_networks(**kwargs) -> pd.DataFrame:
Expand Down Expand Up @@ -66,8 +66,8 @@ def get_control_results(
Retrieve sanitary controls' results.
Uses a loop to avoid reaching 20k results threshold.
As queries may induce big datasets, loops are based on networks and years,
even if date_min_prelevement/date_max_prelevement are not set.
As queries may induce big datasets, loops are based on networks and 6 month
timeranges, even if date_min_prelevement/date_max_prelevement are not set.
Note that `codes_reseaux` and `codes_communes` are mutually exclusive!
Expand Down Expand Up @@ -112,41 +112,26 @@ def get_control_results(
if "date_max_prelevement" not in kwargs:
kwargs["date_max_prelevement"] = date.today().strftime("%Y-%m-%d")

ranges = pd.date_range(
start=datetime.strptime(
kwargs.pop("date_min_prelevement"), "%Y-%m-%d"
).date(),
end=datetime.strptime(
kwargs.pop("date_max_prelevement"), "%Y-%m-%d"
).date(),
kwargs_loop = prepare_kwargs_loops(
"date_min_prelevement",
"date_max_prelevement",
kwargs,
start_auto_determination,
)
dates = pd.Series(ranges).to_frame("date")
dates["year"] = dates["date"].dt.year
dates = dates.groupby("year")["date"].agg(["min", "max"])
for d in "min", "max":
dates[d] = dates[d].dt.strftime("%Y-%m-%d")
if start_auto_determination:
dates = pd.concat(
[
dates,
pd.DataFrame([{"min": "1900-01-01", "max": "2015-12-31"}]),
],
ignore_index=False,
).sort_index()

args = list(product(codes, dates.values.tolist()))

kwargs_loop = list(product(codes, kwargs_loop))
[kwargs.update({codes_names: chunk}) for chunk, kwargs in kwargs_loop]
kwargs_loop = [x[1] for x in kwargs_loop]

with DrinkingWaterQualitySession() as session:

results = [
session.get_control_results(
date_min_prelevement=date_min,
date_max_prelevement=date_max,
**{codes_names: chunk},
**kwargs
**kwargs,
**kw_loop,
)
for chunk, (date_min, date_max) in tqdm(
args,
for kw_loop in tqdm(
kwargs_loop,
desc="querying network/network and year/year",
leave=_config["TQDM_LEAVE"],
position=tqdm._get_free_pos(),
Expand All @@ -155,13 +140,3 @@ def get_control_results(
results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
results = pd.concat(results, ignore_index=True)
return results


# if __name__ == "__main__":
# df = get_control_results(
# codes_communes="59350",
# code_parametre="1340",
# date_min_prelevement="2023-01-01",
# date_max_prelevement="2023-12-31",
# )
# print(df)
2 changes: 2 additions & 0 deletions cl_hubeau/hydrometry/hydrometry_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def get_observations(self, **kwargs):
)
except KeyError:
pass

return df

def get_realtime_observations(self, **kwargs):
Expand Down Expand Up @@ -382,6 +383,7 @@ def get_realtime_observations(self, **kwargs):
df["date_obs"] = pd.to_datetime(df["date_obs"])
except KeyError:
pass

return df


Expand Down
42 changes: 22 additions & 20 deletions cl_hubeau/piezometry/piezometry_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ def get_stations(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/stations"

Expand Down Expand Up @@ -122,12 +129,7 @@ def get_chronicles(self, **kwargs):
kwargs.pop("code_bss"), 200
)
except KeyError:
# reset to default hubeau value, which is set even when code_bss is
# missing
code_bss = "07548X0009/F"
msg = f"code_bss is missing, will be set to {code_bss=} by hubeau"
logging.warning(msg)
params["code_bss"] = code_bss
pass

for arg in "date_debut_mesure", "date_fin_mesure":
try:
Expand All @@ -153,6 +155,13 @@ def get_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/chroniques"

Expand All @@ -165,13 +174,6 @@ def get_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

return df

def get_realtime_chronicles(self, **kwargs):
Expand Down Expand Up @@ -250,6 +252,13 @@ def get_realtime_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/chroniques_tr"

Expand All @@ -266,13 +275,6 @@ def get_realtime_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

return df


Expand Down
7 changes: 6 additions & 1 deletion cl_hubeau/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def request(
error = str(r.content)
raise ValueError(
f"Connection error on {method=} {url=} with {kwargs=}, "
f"got {error}"
f"got {error} : got result {r.status_code}"
)
return r

Expand Down Expand Up @@ -314,6 +314,11 @@ def get_result(
page = "page" if "page" in js["first"] else "cursor"

count_rows = js["count"]
if count_rows > 20_000:
raise ValueError(
"this request won't be handled by hubeau "
f"( {count_rows} > 20k results)"
)
msg = f"{count_rows} expected results"
logging.info(msg)
count_pages = count_rows // self.size + (
Expand Down
21 changes: 21 additions & 0 deletions cl_hubeau/superficial_waterbodies_quality/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

from .superficial_waterbodies_quality_scraper import (
SuperficialWaterbodiesQualitySession,
)

from .utils import (
get_all_stations,
get_all_operations,
get_all_environmental_conditions,
get_all_analysis,
)


__all__ = [
"get_all_stations",
"get_all_operations",
"get_all_environmental_conditions",
"get_all_analysis",
"SuperficialWaterbodiesQualitySession",
]
Loading

0 comments on commit 3001aa4

Please sign in to comment.