Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ecoulement des cours d'eau #10

Merged
merged 7 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cl_hubeau/watercourses_flow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

from .watercourses_flow_scraper import WatercoursesFlowSession
from .utils import get_all_stations, get_all_observations, get_all_campagnes


# Public API of the watercourses_flow subpackage: the low-level session plus
# the "get everything" convenience helpers defined in utils.
__all__ = [
    "get_all_stations",
    "get_all_observations",
    "get_all_campagnes",
    "WatercoursesFlowSession",
]
189 changes: 189 additions & 0 deletions cl_hubeau/watercourses_flow/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
from datetime import date, datetime
from itertools import product

from cl_hubeau.watercourses_flow.watercourses_flow_scraper import (
WatercoursesFlowSession,
)
from cl_hubeau import _config
from cl_hubeau.utils import get_departements, prepare_kwargs_loops


def get_all_stations(**kwargs) -> gpd.GeoDataFrame:
    """
    Retrieve all stations from France.

    Stations are queried one department at a time and the per-department
    answers are concatenated into a single GeoDataFrame.

    Parameters
    ----------
    **kwargs :
        kwargs passed to WatercoursesFlowSession.get_stations (hence mostly
        intended for hub'eau API's arguments). Do not use `format` or
        `code_departement` as they are set by the current function.

    Returns
    -------
    results : gpd.GeoDataFrame
        GeoDataFrame of stations, de-duplicated on `code_station` when that
        column is present. An empty GeoDataFrame is returned when no
        department yields any station.

    """

    with WatercoursesFlowSession() as session:

        deps = get_departements()
        results = [
            session.get_stations(code_departement=dep, format="geojson", **kwargs)
            for dep in tqdm(
                deps,
                desc="querying dep/dep",
                leave=_config["TQDM_LEAVE"],
                position=tqdm._get_free_pos(),
            )
        ]

    # Discard empty answers and strip all-NaN columns before concatenating.
    results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
    if not results:
        # pd.concat raises ValueError on an empty list: return an empty
        # frame instead of crashing when nothing matched the query.
        return gpd.GeoDataFrame()

    results = pd.concat(results, ignore_index=True)

    # De-duplicate on the station code when the column is available.
    if "code_station" in results.columns:
        results = results.drop_duplicates("code_station")
    return results


def get_all_observations(**kwargs) -> gpd.GeoDataFrame:
    """
    Retrieve all observations from France.

    The query is split into several smaller requests (the dataset is big);
    the list of requests is built by `prepare_kwargs_loops` and the answers
    are concatenated into a single frame.

    Parameters
    ----------
    **kwargs :
        kwargs passed to WatercoursesFlowSession.get_observations (hence
        mostly intended for hub'eau API's arguments). Do not use `format`
        as it is set by the current function. `date_observation_min` and
        `date_observation_max` are optional ("%Y-%m-%d" strings); they
        default to "1960-01-01" and to today's date respectively.

    Returns
    -------
    results : gpd.GeoDataFrame
        GeoDataFrame of observations. An empty GeoDataFrame is returned
        when no request yields any observation.
    """

    # Remember whether the caller left the start date open: the flag is
    # forwarded to prepare_kwargs_loops.
    start_auto_determination = "date_observation_min" not in kwargs
    kwargs.setdefault("date_observation_min", "1960-01-01")
    kwargs.setdefault(
        "date_observation_max", date.today().strftime("%Y-%m-%d")
    )

    # Computed before prepare_kwargs_loops is called, as in the original
    # ordering (the helper receives the kwargs dict itself).
    desc = "querying year/year" + (
        " & dep/dep" if "code_departement" in kwargs else ""
    )

    # NOTE(review): presumably prepare_kwargs_loops removes the looped keys
    # from `kwargs` and returns one dict of extra arguments per request
    # (otherwise the double unpacking below would clash) - TODO confirm.
    kwargs_loop = prepare_kwargs_loops(
        "date_observation_min",
        "date_observation_max",
        kwargs,
        start_auto_determination,
    )

    with WatercoursesFlowSession() as session:

        results = [
            session.get_observations(
                format="geojson",
                **kwargs,
                **kw_loop,
            )
            for kw_loop in tqdm(
                kwargs_loop,
                desc=desc,
                leave=_config["TQDM_LEAVE"],
                position=tqdm._get_free_pos(),
            )
        ]

    # NOTE(review): while reviewing an earlier year-by-year looping strategy,
    # most duplicated rows were traced to overlapping date ranges, but a few
    # duplicates were still reported for 2012 and 2023. It is unverified
    # whether the current looping is affected - TODO confirm and
    # de-duplicate if needed.

    # Discard empty answers and strip all-NaN columns before concatenating.
    results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
    if not results:
        # pd.concat raises ValueError on an empty list: return an empty
        # frame instead of crashing when nothing matched the query.
        return gpd.GeoDataFrame()

    results = pd.concat(results, ignore_index=True)
    return results


def get_all_campagnes(**kwargs) -> gpd.GeoDataFrame:
    """
    Retrieve all campagnes from France.

    A single request is tried first; if it raises ValueError, the query is
    replayed department by department and the answers are concatenated.

    Parameters
    ----------
    **kwargs :
        kwargs passed to WatercoursesFlowSession.get_campagnes (hence mostly
        intended for hub'eau API's arguments). Do not use `code_departement`
        as it is set by the current function when falling back to
        department-by-department querying.

    Returns
    -------
    results : gpd.GeoDataFrame
        Campagnes, either as returned by the single request or as the
        concatenation of the per-department answers. In the fallback, an
        empty GeoDataFrame is returned when no department yields any
        campagne.
    """

    with WatercoursesFlowSession() as session:
        try:
            results = session.get_campagnes(**kwargs)
        except ValueError:
            # If request is too big: split it department by department
            deps = get_departements()
            results = [
                session.get_campagnes(code_departement=dep, **kwargs)
                for dep in tqdm(
                    deps,
                    desc="querying dep/dep",
                    leave=_config["TQDM_LEAVE"],
                    position=tqdm._get_free_pos(),
                )
            ]
            # Discard empty answers and strip all-NaN columns.
            results = [
                x.dropna(axis=1, how="all") for x in results if not x.empty
            ]
            if results:
                results = pd.concat(results, ignore_index=True)
            else:
                # pd.concat raises ValueError on an empty list: return an
                # empty frame instead of crashing.
                results = gpd.GeoDataFrame()
    return results


# if __name__ == "__main__":
# # print(get_all_stations())
# # print(get_all_observations())
# print(get_all_campagnes())
Loading