Skip to content

Commit

Permalink
Merge pull request #12 from tgrandje/dev
Browse files Browse the repository at this point in the history
Add superficial waterbodies quality coverage
  • Loading branch information
tgrandje authored Oct 1, 2024
2 parents 1b5dd10 + 9db2080 commit 3001aa4
Show file tree
Hide file tree
Showing 17 changed files with 2,435 additions and 348 deletions.
81 changes: 74 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ due time.
At this stage, the following APIs are covered by cl-hubeau:
* [piezometry/piézométrie](https://hubeau.eaufrance.fr/page/api-piezometrie)
* [hydrometry/hydrométrie](https://hubeau.eaufrance.fr/page/api-hydrometrie)
* [drinking water quality/qualité de l'eau potable](https://hubeau.eaufrance.fr/page/api-qualite-eau-potable#/qualite_eau_potable/communes)
* [drinking water quality/qualité de l'eau potable](https://hubeau.eaufrance.fr/page/api-qualite-eau-potable)
* [superficial waterbodies quality/qualité physico-chimique des cours d'eau](https://hubeau.eaufrance.fr/page/api-qualite-cours-deau)

For any help on available kwargs for each endpoint, please refer
directly to the documentation on hubeau (this will not be covered
Expand Down Expand Up @@ -159,11 +160,6 @@ Note that this query is heavy, even if this was already restricted to nitrates.
In theory, you could also query the API without specifying the substance you're tracking,
but you may hit the 20k threshold and trigger an exception.

As it is, the `get_control_results` function already implements a double loop:

* on networks' codes (20 codes maximum) ;
* on periods, requesting only yearly datasets (which should be scalable over time **and** should work nicely with the cache algorithm).

You can also call the same function, using official city codes directly:
```python
df = drinking_water_quality.get_control_results(
Expand All @@ -185,4 +181,75 @@ with drinking_water_quality.DrinkingWaterQualitySession() as session:
df = session.get_cities_networks(nom_commune="LILLE")
df = session.get_control_results(code_departement='02', code_parametre="1340")

```
```

### Superficial waterbodies quality

4 high-level functions are available (and one class for low-level operations).


Get all stations (uses a 30-day cache):

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_stations()
```

Get all operations (uses a 30-day cache):

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_operations()
```

Note that this query is heavy, so users should restrict it to a given territory.
For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_operations(code_region="11")
```

Get all environmental conditions:

```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_environmental_conditions()
```

Note that this query is heavy, so users should restrict it to a given territory.
For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_environmental_conditions(code_region="11")
```

Get all physicochemical analyses:
```python
from cl_hubeau import superficial_waterbodies_quality
df = superficial_waterbodies_quality.get_all_analysis()
```

Note that this query is heavy, so users should restrict it to a given territory
and given parameters. For instance, you could use:
```python
df = superficial_waterbodies_quality.get_all_analysis(
code_departement="59",
code_parametre="1313"
)
```


Low-level class to perform the same tasks:


Note that:

* the API forbids results of more than 20k rows, so you may need inner loops
* cache handling will be your responsibility

```python
with superficial_waterbodies_quality.SuperficialWaterbodiesQualitySession() as session:
df = session.get_stations(code_commune="59183")
df = session.get_operations(code_commune="59183")
df = session.get_environmental_conditions(code_commune="59183")
df = session.get_analysis(code_commune='59183', code_parametre="1340")

```
59 changes: 17 additions & 42 deletions cl_hubeau/drinking_water_quality/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Convenience functions for hydrometry consumption
"""

from datetime import date, datetime
from datetime import date
from itertools import product

import pandas as pd
Expand All @@ -13,7 +13,7 @@

from cl_hubeau.drinking_water_quality import DrinkingWaterQualitySession
from cl_hubeau import _config
from cl_hubeau.utils import get_cities
from cl_hubeau.utils import get_cities, prepare_kwargs_loops


def get_all_water_networks(**kwargs) -> pd.DataFrame:
Expand Down Expand Up @@ -66,8 +66,8 @@ def get_control_results(
Retrieve sanitary controls' results.
Uses a loop to avoid reaching 20k results threshold.
As queries may induce big datasets, loops are based on networks and years,
even if date_min_prelevement/date_max_prelevement are not set.
As queries may induce big datasets, loops are based on networks and 6 month
timeranges, even if date_min_prelevement/date_max_prelevement are not set.
Note that `codes_reseaux` and `codes_communes` are mutually exclusive!
Expand Down Expand Up @@ -112,41 +112,26 @@ def get_control_results(
if "date_max_prelevement" not in kwargs:
kwargs["date_max_prelevement"] = date.today().strftime("%Y-%m-%d")

ranges = pd.date_range(
start=datetime.strptime(
kwargs.pop("date_min_prelevement"), "%Y-%m-%d"
).date(),
end=datetime.strptime(
kwargs.pop("date_max_prelevement"), "%Y-%m-%d"
).date(),
kwargs_loop = prepare_kwargs_loops(
"date_min_prelevement",
"date_max_prelevement",
kwargs,
start_auto_determination,
)
dates = pd.Series(ranges).to_frame("date")
dates["year"] = dates["date"].dt.year
dates = dates.groupby("year")["date"].agg(["min", "max"])
for d in "min", "max":
dates[d] = dates[d].dt.strftime("%Y-%m-%d")
if start_auto_determination:
dates = pd.concat(
[
dates,
pd.DataFrame([{"min": "1900-01-01", "max": "2015-12-31"}]),
],
ignore_index=False,
).sort_index()

args = list(product(codes, dates.values.tolist()))

kwargs_loop = list(product(codes, kwargs_loop))
[kwargs.update({codes_names: chunk}) for chunk, kwargs in kwargs_loop]
kwargs_loop = [x[1] for x in kwargs_loop]

with DrinkingWaterQualitySession() as session:

results = [
session.get_control_results(
date_min_prelevement=date_min,
date_max_prelevement=date_max,
**{codes_names: chunk},
**kwargs
**kwargs,
**kw_loop,
)
for chunk, (date_min, date_max) in tqdm(
args,
for kw_loop in tqdm(
kwargs_loop,
desc="querying network/network and year/year",
leave=_config["TQDM_LEAVE"],
position=tqdm._get_free_pos(),
Expand All @@ -155,13 +140,3 @@ def get_control_results(
results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
results = pd.concat(results, ignore_index=True)
return results


# if __name__ == "__main__":
# df = get_control_results(
# codes_communes="59350",
# code_parametre="1340",
# date_min_prelevement="2023-01-01",
# date_max_prelevement="2023-12-31",
# )
# print(df)
2 changes: 2 additions & 0 deletions cl_hubeau/hydrometry/hydrometry_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def get_observations(self, **kwargs):
)
except KeyError:
pass

return df

def get_realtime_observations(self, **kwargs):
Expand Down Expand Up @@ -382,6 +383,7 @@ def get_realtime_observations(self, **kwargs):
df["date_obs"] = pd.to_datetime(df["date_obs"])
except KeyError:
pass

return df


Expand Down
42 changes: 22 additions & 20 deletions cl_hubeau/piezometry/piezometry_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ def get_stations(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/stations"

Expand Down Expand Up @@ -122,12 +129,7 @@ def get_chronicles(self, **kwargs):
kwargs.pop("code_bss"), 200
)
except KeyError:
# reset to default hubeau value, which is set even when code_bss is
# missing
code_bss = "07548X0009/F"
msg = f"code_bss is missing, will be set to {code_bss=} by hubeau"
logging.warning(msg)
params["code_bss"] = code_bss
pass

for arg in "date_debut_mesure", "date_fin_mesure":
try:
Expand All @@ -153,6 +155,13 @@ def get_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/chroniques"

Expand All @@ -165,13 +174,6 @@ def get_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

return df

def get_realtime_chronicles(self, **kwargs):
Expand Down Expand Up @@ -250,6 +252,13 @@ def get_realtime_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

method = "GET"
url = self.BASE_URL + "/v1/niveaux_nappes/chroniques_tr"

Expand All @@ -266,13 +275,6 @@ def get_realtime_chronicles(self, **kwargs):
except KeyError:
pass

if kwargs:
raise ValueError(
f"found unexpected arguments {kwargs}, "
"please have a look at the documentation on "
"https://hubeau.eaufrance.fr/page/api-piezometrie"
)

return df


Expand Down
7 changes: 6 additions & 1 deletion cl_hubeau/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def request(
error = str(r.content)
raise ValueError(
f"Connection error on {method=} {url=} with {kwargs=}, "
f"got {error}"
f"got {error} : got result {r.status_code}"
)
return r

Expand Down Expand Up @@ -314,6 +314,11 @@ def get_result(
page = "page" if "page" in js["first"] else "cursor"

count_rows = js["count"]
if count_rows > 20_000:
raise ValueError(
"this request won't be handled by hubeau "
f"( {count_rows} > 20k results)"
)
msg = f"{count_rows} expected results"
logging.info(msg)
count_pages = count_rows // self.size + (
Expand Down
21 changes: 21 additions & 0 deletions cl_hubeau/superficial_waterbodies_quality/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

from .superficial_waterbodies_quality_scraper import (
SuperficialWaterbodiesQualitySession,
)

from .utils import (
get_all_stations,
get_all_operations,
get_all_environmental_conditions,
get_all_analysis,
)


__all__ = [
"get_all_stations",
"get_all_operations",
"get_all_environmental_conditions",
"get_all_analysis",
"SuperficialWaterbodiesQualitySession",
]
Loading

0 comments on commit 3001aa4

Please sign in to comment.