diff --git a/CHANGELOG.md b/CHANGELOG.md
index 307dc507c..b28dbe018 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [UNRELEASED] - YYYY-MM-DD
 
+### Changed
+
+- [#950](https://github.com/equinor/webviz-subsurface/pull/950) - Revised `EnsembleSummaryProvider` to support finer control of which range of dates gets returned when using lazy resampling and requesting data for multiple realizations. Note that this is a breaking change in the `EnsembleSummaryProvider` interface; see the PR for more details.
+
 ## [0.2.11] - 2022-03-14
 
 ### Added
diff --git a/tests/unit_tests/plugin_tests/test_simulation_time_series/mocks/ensemble_summary_provider_dummy.py b/tests/unit_tests/plugin_tests/test_simulation_time_series/mocks/ensemble_summary_provider_dummy.py
index 894fe0610..884bb9814 100644
--- a/tests/unit_tests/plugin_tests/test_simulation_time_series/mocks/ensemble_summary_provider_dummy.py
+++ b/tests/unit_tests/plugin_tests/test_simulation_time_series/mocks/ensemble_summary_provider_dummy.py
@@ -4,8 +4,10 @@
 import pandas as pd
 
 from webviz_subsurface._providers import (
+    DateSpan,
     EnsembleSummaryProvider,
     Frequency,
+    ResamplingOptions,
     VectorMetadata,
 )
 
@@ -43,6 +45,7 @@ def vector_names_filtered_by_value(
     def dates(
         self,
         resampling_frequency: Optional[Frequency],
+        date_span: DateSpan = DateSpan.UNION,
         realizations: Optional[Sequence[int]] = None,
     ) -> List[datetime.datetime]:
         raise NotImplementedError("Method not implemented for mock!")
 
@@ -53,7 +56,7 @@ def supports_resampling(self) -> bool:
     def get_vectors_df(
         self,
         vector_names: Sequence[str],
-        resampling_frequency: Optional[Frequency],
+        resampling_options: Optional[ResamplingOptions],
         realizations: Optional[Sequence[int]] = None,
     ) -> pd.DataFrame:
         raise NotImplementedError("Method not implemented for mock!")
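The definitions of `DateSpan` and `ResamplingOptions` themselves are not part of this diff (they live in `ensemble_summary_provider.py`), but the call sites throughout the PR imply roughly the following shapes. This is a sketch inferred from usage — the actual enum values and field details in the source may differ:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class DateSpan(Enum):
    # How dates from multiple realizations are combined. The member names are
    # taken from call sites in this PR; the concrete values are an assumption.
    UNION = "union"
    INTERSECTION = "intersection"


@dataclass(frozen=True)
class ResamplingOptions:
    # Inferred from calls such as ResamplingOptions(Frequency.MONTHLY, DateSpan.UNION)
    # and ResamplingOptions(frequency=Frequency.MONTHLY). Frequency is the
    # pre-existing enum (DAILY, WEEKLY, MONTHLY, QUARTERLY, YEARLY).
    frequency: "Frequency"
    common_date_span: Optional[DateSpan] = None
```

When `common_date_span` is None, each realization keeps its own normalized date range; passing `DateSpan.UNION` or `DateSpan.INTERSECTION` forces all realizations onto one shared set of sample dates, as the tests below exercise.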
diff --git a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_delta_ensemble_vectors_accessor_impl.py b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_delta_ensemble_vectors_accessor_impl.py
index 3db2cd210..b88072e8a 100644
--- a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_delta_ensemble_vectors_accessor_impl.py
+++ b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_delta_ensemble_vectors_accessor_impl.py
@@ -162,7 +162,7 @@
     ),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=[TEST_EXPRESSION],
-    resampling_frequency=None,
+    resampling_options=None,
 )
 
 TEST_AFTER_2262_ACCESSOR = DerivedDeltaEnsembleVectorsAccessorImpl(
@@ -173,7 +173,7 @@
     ),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=[TEST_EXPRESSION],
-    resampling_frequency=None,
+    resampling_options=None,
 )
 
 TEST_EMPTY_ACCESSOR = DerivedDeltaEnsembleVectorsAccessorImpl(
@@ -184,7 +184,7 @@
     ),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=None,
-    resampling_frequency=None,
+    resampling_options=None,
 )
 
 # *******************************************************************
diff --git a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_ensemble_vectors_accessor_impl.py b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_ensemble_vectors_accessor_impl.py
index 84cef6210..2cc7229fe 100644
--- a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_ensemble_vectors_accessor_impl.py
+++ b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_types/test_derived_ensemble_vectors_accessor_impl.py
@@ -119,7 +119,7 @@
     provider=EnsembleSummaryProviderMock(INPUT_DF),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=[TEST_EXPRESSION],
-    resampling_frequency=None,
+    resampling_options=None,
 )
 
 TEST_AFTER_2262_ACCESSOR = DerivedEnsembleVectorsAccessorImpl(
@@ -127,7 +127,7 @@
     provider=EnsembleSummaryProviderMock(INPUT_AFTER_2262_DF),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=[TEST_EXPRESSION],
-    resampling_frequency=None,
+    resampling_options=None,
 )
 
 TEST_EMPTY_ACCESSOR = DerivedEnsembleVectorsAccessorImpl(
@@ -135,7 +135,7 @@
     provider=EnsembleSummaryProviderMock(pd.DataFrame()),
     vectors=["A", "B", "PER_INTVL_B", "Sum A and B"],
     expressions=None,
-    resampling_frequency=None,
+    resampling_options=None,
 )
diff --git a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_utils/test_derived_ensemble_vectors_accessor_utils.py b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_utils/test_derived_ensemble_vectors_accessor_utils.py
index f5ca59caf..a2e6bd05c 100644
--- a/tests/unit_tests/plugin_tests/test_simulation_time_series/test_utils/test_derived_ensemble_vectors_accessor_utils.py
+++ b/tests/unit_tests/plugin_tests/test_simulation_time_series/test_utils/test_derived_ensemble_vectors_accessor_utils.py
@@ -53,7 +53,7 @@ def test_create_derived_vectors_accessor_dict() -> None:
         provider_set=provider_set,
         expressions=[],
         delta_ensembles=delta_ensembles,
-        resampling_frequency=None,
+        resampling_options=None,
         relative_date=None,
     )
 
diff --git a/tests/unit_tests/provider_tests/test_ensemble_summary_provider.py b/tests/unit_tests/provider_tests/test_ensemble_summary_provider.py
index 07b1e8e05..8aafd8526 100644
--- a/tests/unit_tests/provider_tests/test_ensemble_summary_provider.py
+++ b/tests/unit_tests/provider_tests/test_ensemble_summary_provider.py
@@ -9,6 +9,7 @@
 from webviz_subsurface._providers import (
     EnsembleSummaryProviderFactory,
     Frequency,
+    ResamplingOptions,
     VectorMetadata,
 )
 
@@ -80,7 +81,9 @@ def test_create_from_arrow_unsmry_lazy(testdata_folder: Path, tmp_path: Path) ->
     assert realizations[0] == 0
     assert realizations[-1] == 99
 
-    vecdf = provider.get_vectors_df(["FOPR"], Frequency.MONTHLY)
+    vecdf = provider.get_vectors_df(
+        ["FOPR"], ResamplingOptions(frequency=Frequency.MONTHLY)
+    )
     assert vecdf.shape == (3100, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "FOPR"]
     assert vecdf["DATE"].nunique() == 31
@@ -88,7 +91,9 @@ def test_create_from_arrow_unsmry_lazy(testdata_folder: Path, tmp_path: Path) ->
     sampleddate = vecdf["DATE"][0]
     assert isinstance(sampleddate, datetime.datetime)
 
-    vecdf = provider.get_vectors_df(["FOPR"], Frequency.MONTHLY, [5])
+    vecdf = provider.get_vectors_df(
+        ["FOPR"], ResamplingOptions(frequency=Frequency.MONTHLY), [5]
+    )
     assert vecdf.shape == (31, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "FOPR"]
     assert vecdf["DATE"].nunique() == 31
diff --git a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_lazy.py b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_lazy.py
index c06a70872..6d9e39900 100644
--- a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_lazy.py
+++ b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_lazy.py
@@ -7,11 +7,13 @@
 import pyarrow.compute as pc
 
 from webviz_subsurface._providers.ensemble_summary_provider._provider_impl_arrow_lazy import (
-    Frequency,
     ProviderImplArrowLazy,
 )
 from webviz_subsurface._providers.ensemble_summary_provider.ensemble_summary_provider import (
+    DateSpan,
     EnsembleSummaryProvider,
+    Frequency,
+    ResamplingOptions,
 )
 
@@ -133,31 +135,67 @@ def test_get_vector_names(tmp_path: Path) -> None:
     assert len(all_realizations) == 2
 
 
-def test_get_dates_without_resampling(tmp_path: Path) -> None:
+def test_get_dates_intersection_without_resampling(tmp_path: Path) -> None:
     # fmt:off
     input_data = [
         ["DATE", "REAL", "A"],
-        [np.datetime64("2023-12-20", "ms"), 0, 10.0],
-        [np.datetime64("2023-12-20", "ms"), 1, 12.0],
-        [np.datetime64("2023-12-21", "ms"), 1, 13.0],
+        [np.datetime64("2020-12-10", "ms"), 1, 1.0],
+        [np.datetime64("2020-12-20", "ms"), 1, 2.0],
+        [np.datetime64("2020-12-10", "ms"), 2, 3.0],
+        [np.datetime64("2020-12-15", "ms"), 2, 4.0],
+        [np.datetime64("2020-12-20", "ms"), 2, 5.0],
+        [np.datetime64("2099-12-01", "ms"), 9, 6.0],
     ]
     # fmt:on
     provider = _create_provider_obj_with_data(input_data, tmp_path)
 
     all_realizations = provider.realizations()
-    assert len(all_realizations) == 2
+    assert len(all_realizations) == 3
 
-    all_dates = provider.dates(resampling_frequency=None)
-    assert len(all_dates) == 1
-    assert isinstance(all_dates[0], datetime)
+    # Intersection across all realizations is empty
+    all_real_dates = provider.dates(None, DateSpan.INTERSECTION)
+    assert len(all_real_dates) == 0
 
-    r0_dates = provider.dates(resampling_frequency=None, realizations=[0])
-    r1_dates = provider.dates(resampling_frequency=None, realizations=[1])
-    assert len(r0_dates) == 1
+    r12_dates = provider.dates(None, DateSpan.INTERSECTION, [1, 2])
+    assert len(r12_dates) == 2
+    assert r12_dates == sorted(r12_dates)
+    assert isinstance(r12_dates[0], datetime)
+
+    r1_dates = provider.dates(None, DateSpan.INTERSECTION, [1])
+    r2_dates = provider.dates(None, DateSpan.INTERSECTION, [2])
     assert len(r1_dates) == 2
+    assert len(r2_dates) == 3
+    assert r1_dates == sorted(r1_dates)
+    assert r2_dates == sorted(r2_dates)
+
+
+def test_get_dates_union_without_resampling(tmp_path: Path) -> None:
+    # fmt:off
+    input_data = [
+        ["DATE", "REAL", "A"],
+        [np.datetime64("2020-12-15", "ms"), 1, 1.0],
+        [np.datetime64("2020-12-10", "ms"), 2, 2.0],
+        [np.datetime64("2020-12-20", "ms"), 2, 3.0],
+    ]
+    # fmt:on
+    provider = _create_provider_obj_with_data(input_data, tmp_path)
+
+    all_realizations = provider.realizations()
+    assert len(all_realizations) == 2
+
+    union_of_dates = provider.dates(None, DateSpan.UNION)
+    assert len(union_of_dates) == 3
+    assert union_of_dates == sorted(union_of_dates)
+
+    r1_dates = provider.dates(None, DateSpan.UNION, [1])
+    r2_dates = provider.dates(None, DateSpan.UNION, [2])
+    assert len(r1_dates) == 1
+    assert len(r2_dates) == 2
+    assert r1_dates == sorted(r1_dates)
+    assert r2_dates == sorted(r2_dates)
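For intuition on the two `DateSpan` modes, the results the two tests above assert can be reproduced with numpy's set operations; `_table_utils` (further down in this diff) builds the intersection with `np.intersect1d` and the union from the sorted unique dates. A small illustration using the dates from `test_get_dates_intersection_without_resampling` (using `np.union1d` here for brevity, which is not how the diff itself spells the union):

```python
import numpy as np

r1 = np.array(["2020-12-10", "2020-12-20"], dtype="datetime64[ms]")
r2 = np.array(["2020-12-10", "2020-12-15", "2020-12-20"], dtype="datetime64[ms]")
r9 = np.array(["2099-12-01"], dtype="datetime64[ms]")

# Intersection over realizations 1 and 2: the two shared dates, sorted
print(np.intersect1d(r1, r2))  # ['2020-12-10' '2020-12-20']

# The outlier realization 9 empties the intersection over all three
print(np.intersect1d(np.intersect1d(r1, r2), r9))  # []

# Union over all realizations: every date exactly once, sorted
print(np.union1d(np.union1d(r1, r2), r9))
```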
 
 
-def test_get_dates_with_daily_resampling(tmp_path: Path) -> None:
+def test_get_dates_union_with_daily_resampling(tmp_path: Path) -> None:
     # fmt:off
     input_data = [
         ["DATE", "REAL", "A",],
@@ -171,21 +209,51 @@ def test_get_dates_with_daily_resampling(tmp_path: Path) -> None:
     all_realizations = provider.realizations()
     assert len(all_realizations) == 2
 
-    all_dates = provider.dates(resampling_frequency=Frequency.DAILY)
-    assert len(all_dates) == 6
-    assert isinstance(all_dates[0], datetime)
-    assert all_dates[0] == datetime(2020, 1, 1)
-    assert all_dates[1] == datetime(2020, 1, 2)
-    assert all_dates[4] == datetime(2020, 1, 5)
-    assert all_dates[5] == datetime(2020, 1, 6)
-
-    r0_dates = provider.dates(resampling_frequency=Frequency.DAILY, realizations=[0])
+    union_dates = provider.dates(Frequency.DAILY, DateSpan.UNION)
+    assert len(union_dates) == 6
+    assert isinstance(union_dates[0], datetime)
+    assert union_dates[0] == datetime(2020, 1, 1)
+    assert union_dates[1] == datetime(2020, 1, 2)
+    assert union_dates[2] == datetime(2020, 1, 3)
+    assert union_dates[3] == datetime(2020, 1, 4)
+    assert union_dates[4] == datetime(2020, 1, 5)
+    assert union_dates[5] == datetime(2020, 1, 6)
+
+    r0_dates = provider.dates(Frequency.DAILY, DateSpan.UNION, realizations=[0])
     assert len(r0_dates) == 4
 
-    r1_dates = provider.dates(resampling_frequency=Frequency.DAILY, realizations=[1])
+    r1_dates = provider.dates(Frequency.DAILY, DateSpan.UNION, realizations=[1])
     assert len(r1_dates) == 1
 
 
+def test_get_dates_intersection_with_daily_resampling(tmp_path: Path) -> None:
+    # fmt:off
+    input_data = [
+        ["DATE", "REAL", "A",],
+        [np.datetime64("2020-01-01", "ms"), 0, 10.0],
+        [np.datetime64("2020-01-05", "ms"), 0, 20.0],
+        [np.datetime64("2020-01-04", "ms"), 1, 30.0],
+        [np.datetime64("2020-01-06", "ms"), 1, 40.0],
+    ]
+    # fmt:on
+    provider = _create_provider_obj_with_data(input_data, tmp_path)
+
+    all_realizations = provider.realizations()
+    assert len(all_realizations) == 2
+
+    isect_dates = provider.dates(Frequency.DAILY, DateSpan.INTERSECTION)
+    assert len(isect_dates) == 2
+    assert isinstance(isect_dates[0], datetime)
+    assert isect_dates[0] == datetime(2020, 1, 4)
+    assert isect_dates[1] == datetime(2020, 1, 5)
+
+    r0_dates = provider.dates(Frequency.DAILY, DateSpan.INTERSECTION, realizations=[0])
+    assert len(r0_dates) == 5
+
+    r1_dates = provider.dates(Frequency.DAILY, DateSpan.INTERSECTION, realizations=[1])
+    assert len(r1_dates) == 3
+
+
 def test_get_vector_metadata(tmp_path: Path) -> None:
     # fmt:off
     input_data = [
@@ -225,19 +293,19 @@ def test_get_vectors_without_resampling(tmp_path: Path) -> None:
     all_vecnames = provider.vector_names()
     assert len(all_vecnames) == 2
 
-    vecdf = provider.get_vectors_df(["A"], resampling_frequency=None)
+    vecdf = provider.get_vectors_df(["A"], resampling_options=None)
     assert vecdf.shape == (3, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "A"]
 
     sampleddate = vecdf["DATE"][0]
     assert isinstance(sampleddate, datetime)
 
-    vecdf = provider.get_vectors_df(["A"], resampling_frequency=None, realizations=[1])
+    vecdf = provider.get_vectors_df(["A"], resampling_options=None, realizations=[1])
     assert vecdf.shape == (2, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "A"]
 
     vecdf = provider.get_vectors_df(
-        ["B", "A"], resampling_frequency=None, realizations=[0]
+        ["B", "A"], resampling_options=None, realizations=[0]
     )
     assert vecdf.shape == (1, 4)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "B", "A"]
vecdf[(vecdf["REAL"] == 1)] + + r0_date_arr = r0_vecdf["DATE"].to_numpy() + assert len(r0_date_arr) == 6 + assert r0_date_arr[0] == np.datetime64("2020-01-01", "ms") + assert r0_date_arr[1] == np.datetime64("2020-01-02", "ms") + assert r0_date_arr[2] == np.datetime64("2020-01-03", "ms") + assert r0_date_arr[3] == np.datetime64("2020-01-04", "ms") + assert r0_date_arr[4] == np.datetime64("2020-01-05", "ms") + assert r0_date_arr[5] == np.datetime64("2020-01-06", "ms") + + r1_date_arr = r1_vecdf["DATE"].to_numpy() + assert len(r1_date_arr) == 1 + assert r1_date_arr[0] == np.datetime64("2020-01-05", "ms") + + # Check interpolation for the total column + r0_tot_arr = r0_vecdf["TOT_t"].to_numpy() + assert r0_tot_arr[0] == 10 + assert r0_tot_arr[1] == 20 + assert r0_tot_arr[2] == 30 + assert r0_tot_arr[3] == 40 + assert r0_tot_arr[4] == 50 + assert r0_tot_arr[5] == 60 + + r1_tot_arr = r1_vecdf["TOT_t"].to_numpy() + assert r1_tot_arr[0] == 99 + + # Check backfill for the rate column + r0_rate_arr = r0_vecdf["RATE_r"].to_numpy() + assert r0_rate_arr[0] == 1 + assert r0_rate_arr[1] == 4 + assert r0_rate_arr[2] == 4 + assert r0_rate_arr[3] == 4 + assert r0_rate_arr[4] == 6 + assert r0_rate_arr[5] == 6 + + r1_rate_arr = r1_vecdf["RATE_r"].to_numpy() + assert r1_rate_arr[0] == 9 + + +def test_get_vectors_with_monthly_resampling(tmp_path: Path) -> None: + # fmt:off + input_data = [ + ["DATE", "REAL", "TOT_t", "RATE_r"], + [np.datetime64("2020-01-27", "ms"), 0, 10.0, 1.0], + [np.datetime64("2020-02-06", "ms"), 0, 20.0, 2.0], + [np.datetime64("2020-03-01", "ms"), 0, 30.0, 3.0], + [np.datetime64("2020-03-15", "ms"), 0, 40.0, 4.0], ] # fmt:on provider = _create_provider_obj_with_data(input_data, tmp_path) vecdf = provider.get_vectors_df( - ["TOT_t", "RATE_r"], resampling_frequency=Frequency.DAILY + ["TOT_t", "RATE_r"], ResamplingOptions(frequency=Frequency.MONTHLY) ) date_arr = vecdf["DATE"].to_numpy() + assert len(date_arr) == 4 assert date_arr[0] == np.datetime64("2020-01-01", "ms") - assert date_arr[1] == np.datetime64("2020-01-02", "ms") - assert date_arr[2] == np.datetime64("2020-01-03", "ms") - assert date_arr[3] == np.datetime64("2020-01-04", "ms") - assert date_arr[4] == np.datetime64("2020-01-05", "ms") - assert date_arr[5] == np.datetime64("2020-01-06", "ms") + assert date_arr[1] == np.datetime64("2020-02-01", "ms") + assert date_arr[2] == np.datetime64("2020-03-01", "ms") + assert date_arr[3] == np.datetime64("2020-04-01", "ms") - # Check interpolation for the total column tot_arr = vecdf["TOT_t"].to_numpy() assert tot_arr[0] == 10 - assert tot_arr[1] == 20 + assert tot_arr[1] == 15 assert tot_arr[2] == 30 assert tot_arr[3] == 40 - assert tot_arr[4] == 50 - assert tot_arr[5] == 60 - # Check backfill for the rate column - tot_arr = vecdf["RATE_r"].to_numpy() - assert tot_arr[0] == 1 - assert tot_arr[1] == 4 - assert tot_arr[2] == 4 - assert tot_arr[3] == 4 - assert tot_arr[4] == 6 - assert tot_arr[5] == 6 + # Backfill for the rate column + rate_arr = vecdf["RATE_r"].to_numpy() + assert rate_arr[0] == 0 + assert rate_arr[1] == 2 + assert rate_arr[2] == 3 + assert rate_arr[3] == 0 + + +def test_get_vectors_with_monthly_resampling_union(tmp_path: Path) -> None: + # fmt:off + input_data = [ + ["DATE", "REAL", "TOT_t", "RATE_r"], + [np.datetime64("2020-01-27", "ms"), 0, 10, 1], + [np.datetime64("2020-02-06", "ms"), 0, 20, 2], + [np.datetime64("2020-03-01", "ms"), 0, 30, 3], + [np.datetime64("2020-03-15", "ms"), 0, 40, 4], + [np.datetime64("2020-02-01", "ms"), 1, 992, 92], + [np.datetime64("2020-03-01", 
"ms"), 1, 993, 93], + ] + # fmt:on + provider = _create_provider_obj_with_data(input_data, tmp_path) + + vecdf = provider.get_vectors_df( + ["TOT_t", "RATE_r"], ResamplingOptions(Frequency.MONTHLY, DateSpan.UNION) + ) + assert vecdf.shape == (8, 4) + + r0_vecdf = vecdf[(vecdf["REAL"] == 0)] + r1_vecdf = vecdf[(vecdf["REAL"] == 1)] + assert r0_vecdf.shape == (4, 4) + assert r1_vecdf.shape == (4, 4) + + r0_date_arr = r0_vecdf["DATE"].to_numpy() + r1_date_arr = r1_vecdf["DATE"].to_numpy() + assert r0_date_arr[0] == r1_date_arr[0] == np.datetime64("2020-01-01", "ms") + assert r0_date_arr[1] == r1_date_arr[1] == np.datetime64("2020-02-01", "ms") + assert r0_date_arr[2] == r1_date_arr[2] == np.datetime64("2020-03-01", "ms") + assert r0_date_arr[3] == r1_date_arr[3] == np.datetime64("2020-04-01", "ms") + + r0_tot_arr = r0_vecdf["TOT_t"].to_numpy() + assert r0_tot_arr[0] == 10 + assert r0_tot_arr[1] == 15 + assert r0_tot_arr[2] == 30 + assert r0_tot_arr[3] == 40 + + r1_tot_arr = r1_vecdf["TOT_t"].to_numpy() + assert r1_tot_arr[0] == 992 + assert r1_tot_arr[1] == 992 + assert r1_tot_arr[2] == 993 + assert r1_tot_arr[3] == 993 + + r0_rate_arr = r0_vecdf["RATE_r"].to_numpy() + assert r0_rate_arr[0] == 0 + assert r0_rate_arr[1] == 2 + assert r0_rate_arr[2] == 3 + assert r0_rate_arr[3] == 0 + + r1_rate_arr = r1_vecdf["RATE_r"].to_numpy() + assert r1_rate_arr[0] == 0 + assert r1_rate_arr[1] == 92 + assert r1_rate_arr[2] == 93 + assert r1_rate_arr[3] == 0 + + +def test_get_vectors_with_monthly_resampling_intersection(tmp_path: Path) -> None: + # fmt:off + input_data = [ + ["DATE", "REAL", "TOT_t", "RATE_r"], + [np.datetime64("2020-01-27", "ms"), 0, 10, 1], + [np.datetime64("2020-02-06", "ms"), 0, 20, 2], + [np.datetime64("2020-03-01", "ms"), 0, 30, 3], + [np.datetime64("2020-03-15", "ms"), 0, 40, 4], + [np.datetime64("2020-02-01", "ms"), 1, 992, 92], + [np.datetime64("2020-03-01", "ms"), 1, 993, 93], + ] + # fmt:on + provider = _create_provider_obj_with_data(input_data, tmp_path) + + vecdf = provider.get_vectors_df( + ["TOT_t", "RATE_r"], ResamplingOptions(Frequency.MONTHLY, DateSpan.INTERSECTION) + ) + assert vecdf.shape == (4, 4) + + r0_vecdf = vecdf[(vecdf["REAL"] == 0)] + r1_vecdf = vecdf[(vecdf["REAL"] == 1)] + assert r0_vecdf.shape == (2, 4) + assert r1_vecdf.shape == (2, 4) + + r0_date_arr = r0_vecdf["DATE"].to_numpy() + r1_date_arr = r1_vecdf["DATE"].to_numpy() + assert r0_date_arr[0] == r1_date_arr[0] == np.datetime64("2020-02-01", "ms") + assert r0_date_arr[1] == r1_date_arr[1] == np.datetime64("2020-03-01", "ms") + + r0_tot_arr = r0_vecdf["TOT_t"].to_numpy() + assert r0_tot_arr[0] == 15 + assert r0_tot_arr[1] == 30 + + r1_tot_arr = r1_vecdf["TOT_t"].to_numpy() + assert r1_tot_arr[0] == 992 + assert r1_tot_arr[1] == 993 + + r0_rate_arr = r0_vecdf["RATE_r"].to_numpy() + assert r0_rate_arr[0] == 2 + assert r0_rate_arr[1] == 3 + + r1_rate_arr = r1_vecdf["RATE_r"].to_numpy() + assert r1_rate_arr[0] == 92 + assert r1_rate_arr[1] == 93 def test_get_vectors_for_date_without_resampling(tmp_path: Path) -> None: @@ -296,22 +518,24 @@ def test_get_vectors_for_date_without_resampling(tmp_path: Path) -> None: # fmt:on provider = _create_provider_obj_with_data(input_data, tmp_path) - all_dates = provider.dates(resampling_frequency=None) - assert len(all_dates) == 1 + common_dates = provider.dates( + resampling_frequency=None, date_span=DateSpan.INTERSECTION + ) + assert len(common_dates) == 1 - date_to_get = all_dates[0] + date_to_get = common_dates[0] assert isinstance(date_to_get, datetime) vecdf = 
provider.get_vectors_for_date_df(date_to_get, ["A"]) assert vecdf.shape == (2, 2) assert vecdf.columns.tolist() == ["REAL", "A"] - date_to_get = all_dates[0] + date_to_get = common_dates[0] vecdf = provider.get_vectors_for_date_df(date_to_get, ["A", "B"], [0]) assert vecdf.shape == (1, 3) assert vecdf.columns.tolist() == ["REAL", "A", "B"] - date_to_get = all_dates[0] + date_to_get = common_dates[0] vecdf = provider.get_vectors_for_date_df(date_to_get, ["A", "C"], [0]) assert vecdf.shape == (1, 3) assert vecdf.columns.tolist() == ["REAL", "A", "C"] diff --git a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_presampled.py b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_presampled.py index e91022fba..9048afe67 100644 --- a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_presampled.py +++ b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_impl_arrow_presampled.py @@ -9,6 +9,7 @@ ProviderImplArrowPresampled, ) from webviz_subsurface._providers.ensemble_summary_provider.ensemble_summary_provider import ( + DateSpan, EnsembleSummaryProvider, ) @@ -97,14 +98,26 @@ def test_get_realizations(provider: EnsembleSummaryProvider) -> None: assert len(all_realizations) == 2 -def test_get_dates(provider: EnsembleSummaryProvider) -> None: +def test_get_dates_intersection(provider: EnsembleSummaryProvider) -> None: - intersection_of_dates = provider.dates(resampling_frequency=None) + intersection_of_dates = provider.dates(None, DateSpan.INTERSECTION) assert len(intersection_of_dates) == 1 assert isinstance(intersection_of_dates[0], datetime) - r0_dates = provider.dates(resampling_frequency=None, realizations=[0]) - r1_dates = provider.dates(resampling_frequency=None, realizations=[1]) + r0_dates = provider.dates(None, DateSpan.INTERSECTION, realizations=[0]) + r1_dates = provider.dates(None, DateSpan.INTERSECTION, realizations=[1]) + assert len(r0_dates) == 1 + assert len(r1_dates) == 2 + + +def test_get_dates_union(provider: EnsembleSummaryProvider) -> None: + + union_of_dates = provider.dates(None, DateSpan.UNION) + assert len(union_of_dates) == 2 + assert isinstance(union_of_dates[0], datetime) + + r0_dates = provider.dates(None, DateSpan.UNION, realizations=[0]) + r1_dates = provider.dates(None, DateSpan.UNION, realizations=[1]) assert len(r0_dates) == 1 assert len(r1_dates) == 2 @@ -114,19 +127,19 @@ def test_get_vectors(provider: EnsembleSummaryProvider) -> None: all_vecnames = provider.vector_names() assert len(all_vecnames) == 3 - vecdf = provider.get_vectors_df(["A"], resampling_frequency=None) + vecdf = provider.get_vectors_df(["A"], resampling_options=None) assert vecdf.shape == (3, 3) assert vecdf.columns.tolist() == ["DATE", "REAL", "A"] sampleddate = vecdf["DATE"][0] assert isinstance(sampleddate, datetime) - vecdf = provider.get_vectors_df(["A"], resampling_frequency=None, realizations=[1]) + vecdf = provider.get_vectors_df(["A"], resampling_options=None, realizations=[1]) assert vecdf.shape == (2, 3) assert vecdf.columns.tolist() == ["DATE", "REAL", "A"] vecdf = provider.get_vectors_df( - ["C", "A"], resampling_frequency=None, realizations=[0] + ["C", "A"], resampling_options=None, realizations=[0] ) assert vecdf.shape == (1, 4) assert vecdf.columns.tolist() == ["DATE", "REAL", "C", "A"] @@ -134,7 +147,7 @@ def test_get_vectors(provider: EnsembleSummaryProvider) -> None: def test_get_vectors_for_date(provider: EnsembleSummaryProvider) -> None: - intersection_of_dates = 
 
 
@@ -114,19 +127,19 @@ def test_get_vectors(provider: EnsembleSummaryProvider) -> None:
     all_vecnames = provider.vector_names()
     assert len(all_vecnames) == 3
 
-    vecdf = provider.get_vectors_df(["A"], resampling_frequency=None)
+    vecdf = provider.get_vectors_df(["A"], resampling_options=None)
     assert vecdf.shape == (3, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "A"]
 
     sampleddate = vecdf["DATE"][0]
     assert isinstance(sampleddate, datetime)
 
-    vecdf = provider.get_vectors_df(["A"], resampling_frequency=None, realizations=[1])
+    vecdf = provider.get_vectors_df(["A"], resampling_options=None, realizations=[1])
     assert vecdf.shape == (2, 3)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "A"]
 
     vecdf = provider.get_vectors_df(
-        ["C", "A"], resampling_frequency=None, realizations=[0]
+        ["C", "A"], resampling_options=None, realizations=[0]
     )
     assert vecdf.shape == (1, 4)
     assert vecdf.columns.tolist() == ["DATE", "REAL", "C", "A"]
 
@@ -134,7 +147,7 @@
 
 def test_get_vectors_for_date(provider: EnsembleSummaryProvider) -> None:
 
-    intersection_of_dates = provider.dates(resampling_frequency=None)
+    intersection_of_dates = provider.dates(None, DateSpan.INTERSECTION)
     assert len(intersection_of_dates) == 1
 
     date_to_get = intersection_of_dates[0]
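Note the asymmetry between the two provider flavours: only the lazy provider resamples, and `ProviderImplArrowPresampled.get_vectors_df` (further down in this diff) now raises a `ValueError` for any non-None `resampling_options`. Generic calling code therefore needs the guard that the dev-comparison script at the end of this diff uses; sketched here with a hypothetical `fetch_vectors` helper:

```python
from typing import Optional, Sequence

from webviz_subsurface._providers import DateSpan, ResamplingOptions

def fetch_vectors(provider, vector_names: Sequence[str], frequency):
    # Only pass ResamplingOptions to providers that can actually resample;
    # presampled providers must receive None.
    resampling_options: Optional[ResamplingOptions] = None
    if provider.supports_resampling():
        resampling_options = ResamplingOptions(
            frequency=frequency, common_date_span=DateSpan.UNION
        )
    return provider.get_vectors_df(vector_names, resampling_options)
```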
np.datetime64("2020-12-30"), np.datetime64("2021-01-05"), Frequency.DAILY + ) + assert min_date == np.datetime64("2020-12-30") + assert max_date == np.datetime64("2021-01-05") + + min_date, max_date = get_normalized_min_max_sample_date( + np.datetime64("2020-12-30T01:30"), + np.datetime64("2021-01-05T02:30"), + Frequency.DAILY, + ) + assert min_date == np.datetime64("2020-12-30") + assert max_date == np.datetime64("2021-01-06") + + # Weekly + min_date, max_date = get_normalized_min_max_sample_date( + np.datetime64("2020-12-20"), + np.datetime64("2021-01-21"), + Frequency.WEEKLY, + ) + assert min_date == np.datetime64("2020-12-14") + assert max_date == np.datetime64("2021-01-25") + + min_date, max_date = get_normalized_min_max_sample_date( + np.datetime64("2021-01-25"), + np.datetime64("2021-01-26"), + Frequency.WEEKLY, + ) + assert min_date == np.datetime64("2021-01-25") + assert max_date == np.datetime64("2021-02-01") + + # Monthly + min_date, max_date = get_normalized_min_max_sample_date( + np.datetime64("2021-02-01"), + np.datetime64("2021-02-02"), + Frequency.MONTHLY, + ) + assert min_date == np.datetime64("2021-02-01") + assert max_date == np.datetime64("2021-03-01") + + # TODO: Test for the rest of the frequencies + # : + + +def test_find_normalized_intersection_of_date_intervals() -> None: + intervals = [ + (np.datetime64("2020-01-20"), np.datetime64("2020-01-22")), + (np.datetime64("2020-01-22"), np.datetime64("2020-01-29")), + ] + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.DAILY) + assert isect is not None + assert isect[0] == np.datetime64("2020-01-22") + assert isect[1] == np.datetime64("2020-01-22") + + intervals = [ + (np.datetime64("2020-01-20"), np.datetime64("2020-01-22")), + (np.datetime64("2029-01-31"), np.datetime64("2029-01-31")), + ] + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.DAILY) + assert isect is None + + intervals = [ + (np.datetime64("2020-12-20"), np.datetime64("2021-01-21")), + (np.datetime64("2021-02-01"), np.datetime64("2021-02-02")), + ] + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.DAILY) + assert isect is None + + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.WEEKLY) + assert isect is None + + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.MONTHLY) + assert isect is not None + assert isect[0] == np.datetime64("2021-02-01") + assert isect[1] == np.datetime64("2021-02-01") + + isect = calc_intersection_of_normalized_date_intervals(intervals, Frequency.YEARLY) + assert isect is not None + assert isect[0] == np.datetime64("2021-01-01") + assert isect[1] == np.datetime64("2022-01-01") + def test_interpolate_backfill() -> None: diff --git a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py new file mode 100644 index 000000000..28b29ebe4 --- /dev/null +++ b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py @@ -0,0 +1,115 @@ +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from webviz_subsurface._providers.ensemble_summary_provider._table_utils import ( + find_intersection_of_realization_dates, + find_min_max_date_per_realization, + find_union_of_realization_dates, +) + +# Since PyArrow's actual compute functions are not seen by pylint +# pylint: disable=no-member + + +def _create_table_from_row_data(per_row_input_data: list) -> pa.Table: + # Turn 
diff --git a/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py
new file mode 100644
index 000000000..28b29ebe4
--- /dev/null
+++ b/tests/unit_tests/provider_tests/test_ensemble_summary_provider_table_utils.py
@@ -0,0 +1,115 @@
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+
+from webviz_subsurface._providers.ensemble_summary_provider._table_utils import (
+    find_intersection_of_realization_dates,
+    find_min_max_date_per_realization,
+    find_union_of_realization_dates,
+)
+
+# Since PyArrow's actual compute functions are not seen by pylint
+# pylint: disable=no-member
+
+
+def _create_table_from_row_data(per_row_input_data: list) -> pa.Table:
+    # Turn rows into columns
+    columns_with_header = list(zip(*per_row_input_data))
+
+    input_dict = {}
+    field_list = []
+    for col in columns_with_header:
+        colname = col[0]
+        coldata = col[1:]
+        input_dict[colname] = coldata
+
+        if colname == "DATE":
+            field_list.append(pa.field("DATE", pa.timestamp("ms")))
+        elif colname == "REAL":
+            field_list.append(pa.field("REAL", pa.int64()))
+        else:
+            field_list.append(pa.field(colname, pa.float32()))
+
+    table = pa.Table.from_pydict(input_dict, schema=pa.schema(field_list))
+
+    return table
+
+
+def test_intersection_of_dates() -> None:
+    # fmt:off
+    input_data = [
+        ["DATE", "REAL", "V"],
+        [np.datetime64("2020-01-03", "ms"), 1, 30.0],
+        [np.datetime64("2020-01-02", "ms"), 1, 20.0],
+        [np.datetime64("2020-01-01", "ms"), 1, 10.0],
+        [np.datetime64("2020-01-01", "ms"), 2, 10.0],
+        [np.datetime64("2020-01-02", "ms"), 2, 20.0],
+        [np.datetime64("2099-01-02", "ms"), 3, 20.0],
+    ]
+    # fmt:on
+    full_table = _create_table_from_row_data(input_data)
+
+    # Intersection should end up empty due to outlier in real 3
+    date_list_full = find_intersection_of_realization_dates(full_table).tolist()
+    assert len(date_list_full) == 0
+
+    r12_table = full_table.filter(
+        pc.is_in(full_table["REAL"], value_set=pa.array([1, 2]))
+    )
+    date_list_r12 = find_intersection_of_realization_dates(r12_table).tolist()
+    assert date_list_r12 == sorted(date_list_r12)
+    assert len(date_list_r12) == 2
+
+    r1_table = full_table.filter(pc.equal(full_table["REAL"], 1))
+    date_list_r1 = find_intersection_of_realization_dates(r1_table).tolist()
+    assert date_list_r1 == sorted(date_list_r1)
+    assert len(date_list_r1) == 3
+
+    empty_table = full_table.filter(pc.equal(full_table["REAL"], 99))
+    date_list_empty = find_intersection_of_realization_dates(empty_table).tolist()
+    assert len(date_list_empty) == 0
+
+
+def test_union_of_dates() -> None:
+    # fmt:off
+    input_data = [
+        ["DATE", "REAL", "V"],
+        [np.datetime64("2020-01-03", "ms"), 1, 20.0],
+        [np.datetime64("2020-01-02", "ms"), 1, 10.0],
+        [np.datetime64("2020-01-01", "ms"), 2, 10.0],
+    ]
+    # fmt:on
+    table = _create_table_from_row_data(input_data)
+
+    date_list = find_union_of_realization_dates(table).tolist()
+    assert date_list == sorted(date_list)
+    assert len(date_list) == 3
+
+    empty_table = table.filter(pc.equal(table["REAL"], 99))
+    date_list_empty = find_union_of_realization_dates(empty_table).tolist()
+    assert len(date_list_empty) == 0
+
+
+def test_find_min_max_date_per_realization() -> None:
+    # fmt:off
+    input_data = [
+        ["DATE", "REAL", "V"],
+        [np.datetime64("2020-01-03", "ms"), 1, 20.0],
+        [np.datetime64("2020-01-02", "ms"), 1, 10.0],
+        [np.datetime64("2020-01-01", "ms"), 2, 10.0],
+    ]
+    # fmt:on
+    table = _create_table_from_row_data(input_data)
+
+    minmaxlist = find_min_max_date_per_realization(table)
+    assert len(minmaxlist) == 2
+
+    assert isinstance(minmaxlist[0][0], np.datetime64)
+
+    r1_min_max = minmaxlist[0]
+    assert r1_min_max[0] == np.datetime64("2020-01-02", "ms")
+    assert r1_min_max[1] == np.datetime64("2020-01-03", "ms")
+
+    r2_min_max = minmaxlist[1]
+    assert r2_min_max[0] == np.datetime64("2020-01-01", "ms")
+    assert r2_min_max[1] == np.datetime64("2020-01-01", "ms")
diff --git a/webviz_subsurface/_providers/__init__.py b/webviz_subsurface/_providers/__init__.py
index a5ae86359..10cd2a959 100644
--- a/webviz_subsurface/_providers/__init__.py
+++ b/webviz_subsurface/_providers/__init__.py
@@ -6,8 +6,10 @@
     SimulatedFaultPolygonsAddress,
 )
 from .ensemble_summary_provider.ensemble_summary_provider import (
+    DateSpan,
     EnsembleSummaryProvider,
     Frequency,
+    ResamplingOptions,
     VectorMetadata,
 )
 from .ensemble_summary_provider.ensemble_summary_provider_factory import (
diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_lazy.py b/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_lazy.py
index 869f3d6b7..0131afb75 100644
--- a/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_lazy.py
+++ b/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_lazy.py
@@ -13,19 +13,23 @@
 from ._field_metadata import create_vector_metadata_from_field_meta
 from ._resampling import (
-    generate_normalized_sample_dates,
+    find_intersection_of_normalized_dates,
+    find_union_of_normalized_dates,
     resample_segmented_multi_real_table,
     sample_segmented_multi_real_table_at_date,
 )
 from ._table_utils import (
     add_per_vector_min_max_to_table_schema_metadata,
-    find_intersected_dates_between_realizations,
+    find_intersection_of_realization_dates,
     find_min_max_for_numeric_table_columns,
+    find_union_of_realization_dates,
     get_per_vector_min_max_from_schema_metadata,
 )
 from .ensemble_summary_provider import (
+    DateSpan,
     EnsembleSummaryProvider,
     Frequency,
+    ResamplingOptions,
     VectorMetadata,
 )
 
@@ -264,6 +268,7 @@ def supports_resampling(self) -> bool:
     def dates(
         self,
         resampling_frequency: Optional[Frequency],
+        date_span: DateSpan = DateSpan.UNION,
         realizations: Optional[Sequence[int]] = None,
     ) -> List[datetime.datetime]:
 
@@ -278,14 +283,17 @@ def dates(
         et_filter_ms = timer.lap_ms()
 
         if resampling_frequency is not None:
-            unique_dates_np = table.column("DATE").unique().to_numpy()
-            min_raw_date = np.min(unique_dates_np)
-            max_raw_date = np.max(unique_dates_np)
-            intersected_dates = generate_normalized_sample_dates(
-                min_raw_date, max_raw_date, resampling_frequency
-            )
+            if date_span == DateSpan.INTERSECTION:
+                date_list = find_intersection_of_normalized_dates(
+                    table, resampling_frequency
+                )
+            else:
+                date_list = find_union_of_normalized_dates(table, resampling_frequency)
         else:
-            intersected_dates = find_intersected_dates_between_realizations(table)
+            if date_span == DateSpan.INTERSECTION:
+                date_list = find_intersection_of_realization_dates(table)
+            else:
+                date_list = find_union_of_realization_dates(table)
 
         et_find_unique_ms = timer.lap_ms()
 
@@ -296,12 +304,12 @@ def dates(
             f"find_unique={et_find_unique_ms}ms)"
         )
 
-        return intersected_dates.astype(datetime.datetime).tolist()
+        return date_list.astype(datetime.datetime).tolist()
 
     def get_vectors_df(
         self,
         vector_names: Sequence[str],
-        resampling_frequency: Optional[Frequency],
+        resampling_options: Optional[ResamplingOptions],
         realizations: Optional[Sequence[int]] = None,
     ) -> pd.DataFrame:
 
@@ -320,15 +328,22 @@ def get_vectors_df(
             table = table.filter(mask)
         et_filter_ms = timer.lap_ms()
 
-        if resampling_frequency is not None:
-            table = resample_segmented_multi_real_table(table, resampling_frequency)
+        if resampling_options is not None:
+            table = resample_segmented_multi_real_table(
+                table, resampling_options.frequency, resampling_options.common_date_span
+            )
+
         et_resample_ms = timer.lap_ms()
 
         df = table.to_pandas(timestamp_as_object=True)
         et_to_pandas_ms = timer.lap_ms()
 
+        actual_resampling_freq = (
+            resampling_options.frequency if resampling_options else None
+        )
+
         LOGGER.debug(
-            f"get_vectors_df({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
+            f"get_vectors_df({actual_resampling_freq}) took: {timer.elapsed_ms()}ms ("
             f"read={et_read_ms}ms, "
f"filter={et_filter_ms}ms, " f"resample={et_resample_ms}ms, " diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_presampled.py b/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_presampled.py index 83a6837df..b65800dd9 100644 --- a/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_presampled.py +++ b/webviz_subsurface/_providers/ensemble_summary_provider/_provider_impl_arrow_presampled.py @@ -16,13 +16,16 @@ from ._field_metadata import create_vector_metadata_from_field_meta from ._table_utils import ( add_per_vector_min_max_to_table_schema_metadata, - find_intersected_dates_between_realizations, + find_intersection_of_realization_dates, find_min_max_for_numeric_table_columns, + find_union_of_realization_dates, get_per_vector_min_max_from_schema_metadata, ) from .ensemble_summary_provider import ( + DateSpan, EnsembleSummaryProvider, Frequency, + ResamplingOptions, VectorMetadata, ) @@ -351,6 +354,7 @@ def supports_resampling(self) -> bool: def dates( self, resampling_frequency: Optional[Frequency], + date_span: DateSpan = DateSpan.UNION, realizations: Optional[Sequence[int]] = None, ) -> List[datetime.datetime]: @@ -367,26 +371,30 @@ def dates( table = table.filter(mask) et_filter_ms = timer.lap_ms() - intersected_dates = find_intersected_dates_between_realizations(table) - et_find_unique_ms = timer.lap_ms() + if date_span == DateSpan.INTERSECTION: + found_dates = find_intersection_of_realization_dates(table) + else: + found_dates = find_union_of_realization_dates(table) + + et_find_dates_ms = timer.lap_ms() LOGGER.debug( f"dates() took: {timer.elapsed_ms()}ms (" f"read={et_read_ms}ms, " f"filter={et_filter_ms}ms, " - f"find_unique={et_find_unique_ms}ms)" + f"find_dates={et_find_dates_ms}ms)" ) - return intersected_dates.astype(datetime.datetime).tolist() + return found_dates.astype(datetime.datetime).tolist() def get_vectors_df( self, vector_names: Sequence[str], - resampling_frequency: Optional[Frequency], + resampling_options: Optional[ResamplingOptions], realizations: Optional[Sequence[int]] = None, ) -> pd.DataFrame: - if resampling_frequency is not None: + if resampling_options is not None: raise ValueError("Resampling is not supported by this provider") timer = PerfTimer() diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/_resampling.py b/webviz_subsurface/_providers/ensemble_summary_provider/_resampling.py index b0d56bc87..b9aadc9a4 100644 --- a/webviz_subsurface/_providers/ensemble_summary_provider/_resampling.py +++ b/webviz_subsurface/_providers/ensemble_summary_provider/_resampling.py @@ -1,11 +1,12 @@ from dataclasses import dataclass -from typing import Dict +from typing import Dict, List, Optional, Tuple import numpy as np import pyarrow as pa from ._field_metadata import is_rate_from_field_meta -from .ensemble_summary_provider import Frequency +from ._table_utils import find_min_max_date_per_realization +from .ensemble_summary_provider import DateSpan, Frequency def _truncate_day_to_monday(datetime_day: np.datetime64) -> np.datetime64: @@ -21,54 +22,169 @@ def _quarter_start_month(datetime_day: np.datetime64) -> np.datetime64: return datetime_month - (datetime_month.astype(int) % 3) -def generate_normalized_sample_dates( - min_date: np.datetime64, max_date: np.datetime64, freq: Frequency +def _generate_normalized_sample_date_range_or_minmax( + min_date: np.datetime64, + max_date: np.datetime64, + freq: Frequency, + generate_full_range: bool, ) -> np.ndarray: - 
"""Returns array of normalized sample dates to cover the min_date to max_date - range with the specified frequency. + """Worker function to determine the normalized sample dates that will cover the + min_date to max_date interval with the specified frequency. + If generate_full_range is True, an array containing the full range of sample dates + will be returned. If False, only the min and max sample dates will be returned. The return numpy array will have sample dates with dtype datetime64[ms] """ if freq == Frequency.DAILY: start = np.datetime64(min_date, "D") stop = np.datetime64(max_date, "D") + step = 1 if stop < max_date: stop += 1 - sampledates = np.arange(start, stop + 1) + elif freq == Frequency.WEEKLY: start = _truncate_day_to_monday(np.datetime64(min_date, "D")) stop = _truncate_day_to_monday(np.datetime64(max_date, "D")) + step = 7 if start > min_date: start -= 7 if stop < max_date: stop += 7 - sampledates = np.arange(start, stop + 1, 7) + elif freq == Frequency.MONTHLY: start = np.datetime64(min_date, "M") stop = np.datetime64(max_date, "M") + step = 1 if stop < max_date: stop += 1 - sampledates = np.arange(start, stop + 1) + elif freq == Frequency.QUARTERLY: start = _quarter_start_month(min_date) stop = _quarter_start_month(max_date) + step = 3 if stop < max_date: stop += 3 - sampledates = np.arange(start, stop + 1, 3) + elif freq == Frequency.YEARLY: start = np.datetime64(min_date, "Y") stop = np.datetime64(max_date, "Y") + step = 1 if stop < max_date: stop += 1 - sampledates = np.arange(start, stop + 1) + else: raise NotImplementedError( f"Currently not supporting resampling to frequency {freq}." ) - sampledates = sampledates.astype("datetime64[ms]") + if generate_full_range: + sampledates: np.ndarray = np.arange(start, stop + 1, step) + else: + sampledates = np.array([start, stop]) + + return sampledates.astype("datetime64[ms]") + + +def generate_normalized_sample_dates( + min_raw_date: np.datetime64, max_raw_date: np.datetime64, freq: Frequency +) -> np.ndarray: + """Returns array of normalized sample dates to cover the min_raw_date to + max_raw_date interval with the specified frequency. + The return numpy array will have sample dates with dtype datetime64[ms] + """ + return _generate_normalized_sample_date_range_or_minmax( + min_date=min_raw_date, + max_date=max_raw_date, + freq=freq, + generate_full_range=True, + ) + + +def get_normalized_min_max_sample_date( + min_raw_date: np.datetime64, max_raw_date: np.datetime64, freq: Frequency +) -> Tuple[np.datetime64, np.datetime64]: + """Returns min and max normalized sample dates to cover the min_raw_date to + max_raw_date range with the specified frequency. + The return tuple will have min and max dates with dtype datetime64[ms] + """ + minmax_arr = _generate_normalized_sample_date_range_or_minmax( + min_date=min_raw_date, + max_date=max_raw_date, + freq=freq, + generate_full_range=False, + ) + + if len(minmax_arr) != 2: + raise ValueError("Wrong number of array elements in minmax_arr") + + return (minmax_arr[0], minmax_arr[1]) + + +def calc_intersection_of_normalized_date_intervals( + raw_date_intervals: List[Tuple[np.datetime64, np.datetime64]], freq: Frequency +) -> Optional[Tuple[np.datetime64, np.datetime64]]: + """Returns the intersection of the normalized version of all the intervals specified + in raw_date_intervals. + Note that each interval in raw_date_intervals will be normalized using the specified + frequency before being used to calculate the intersection. 
+ """ + + if not raw_date_intervals: + return None + + first_raw_interval = raw_date_intervals[0] + res_start, res_end = get_normalized_min_max_sample_date( + first_raw_interval[0], first_raw_interval[1], freq + ) + + for raw_interval in raw_date_intervals[1:]: + start, end = get_normalized_min_max_sample_date( + raw_interval[0], raw_interval[1], freq + ) + + if start > res_end or end < res_start: + return None + + res_start = max(res_start, start) + res_end = min(res_end, end) + + return (res_start, res_end) - return sampledates + +def find_union_of_normalized_dates(table: pa.Table, frequency: Frequency) -> np.ndarray: + """Generates list of normalized sample dates, with the specified frequency, that + covers the union of all dates in the table. + """ + unique_dates_np = table.column("DATE").unique().to_numpy() + if len(unique_dates_np) == 0: + return np.empty(0, dtype=np.datetime64) + + min_raw_date = np.min(unique_dates_np) + max_raw_date = np.max(unique_dates_np) + return generate_normalized_sample_dates(min_raw_date, max_raw_date, frequency) + + +def find_intersection_of_normalized_dates( + table: pa.Table, frequency: Frequency +) -> np.ndarray: + """Generates list of normalized sample dates, with the specified frequency, that + is the intersection of all the the normalized per-realization sample intervals. + """ + # First find the raw date intervals for each realization. + per_real_raw_intervals = find_min_max_date_per_realization(table) + + # Then calculate the intersection between the normalized versions of these intervals + intersection_interval = calc_intersection_of_normalized_date_intervals( + per_real_raw_intervals, frequency + ) + if not intersection_interval: + return np.empty(0, dtype=np.datetime64) + + return generate_normalized_sample_dates( + intersection_interval[0], + intersection_interval[1], + frequency, + ) def interpolate_backfill( @@ -144,52 +260,118 @@ def resample_single_real_table(table: pa.Table, freq: Frequency) -> pa.Table: return ret_table -@dataclass -class RealInterpolationInfo: - raw_dates_np: np.ndarray - raw_dates_np_as_uint: np.ndarray - sample_dates_np: np.ndarray - sample_dates_np_as_uint: np.ndarray +class InterpolationHelper: + """Helper class for tracking and caching of intermediate data needed when doing + resampling of multi-realization table data. + Assumes that table contains both a REAL and a DATE column. + Also assumes that the table is segmented on REAL (so that all rows from a single + realization are contiguous) and within each REAL segment, it must be sorted on DATE. 
+ """ + @dataclass(frozen=True) + class RowSegment: + start_row: int + row_count: int + + @dataclass(frozen=True) + class DateInfo: + raw_dates_np_as_uint: np.ndarray + sample_dates_np: np.ndarray + sample_dates_np_as_uint: np.ndarray + + def __init__( + self, table: pa.Table, freq: Frequency, common_date_span: Optional[DateSpan] + ) -> None: + real_arr_np = table.column("REAL").to_numpy() + unique_reals, first_occurrence_idx, real_counts = np.unique( + real_arr_np, return_index=True, return_counts=True + ) -def _extract_real_interpolation_info( - table: pa.Table, start_row_idx: int, row_count: int, freq: Frequency -) -> RealInterpolationInfo: + self._table = table + self._frequency = freq - real_dates = table["DATE"].slice(start_row_idx, row_count).to_numpy() + self._real_row_segment_dict: Dict[int, InterpolationHelper.RowSegment] = { + real: InterpolationHelper.RowSegment( + start_row=first_occurrence_idx[idx], row_count=real_counts[idx] + ) + for idx, real in enumerate(unique_reals) + } - min_raw_date = np.min(real_dates) - max_raw_date = np.max(real_dates) - sample_dates = generate_normalized_sample_dates(min_raw_date, max_raw_date, freq) + self._real_date_info_dict: Dict[int, InterpolationHelper.DateInfo] = {} - return RealInterpolationInfo( - raw_dates_np=real_dates, - raw_dates_np_as_uint=real_dates.astype(np.uint64), - sample_dates_np=sample_dates, - sample_dates_np_as_uint=sample_dates.astype(np.uint64), - ) + self.shared_sample_dates_np: Optional[np.ndarray] = None + self.shared_sample_dates_np_as_uint: Optional[np.ndarray] = None + if common_date_span is not None: + if common_date_span == DateSpan.INTERSECTION: + shared_dates = find_intersection_of_normalized_dates(table, freq) + else: + shared_dates = find_union_of_normalized_dates(table, freq) + + self.shared_sample_dates_np = shared_dates + self.shared_sample_dates_np_as_uint = shared_dates.astype(np.uint64) + + # Try and prime the cache up front + # for real in unique_reals: + # self.real_date_arrays(real) + + def unique_reals(self) -> List[int]: + return list(self._real_row_segment_dict) + + def date_info(self, real: int) -> DateInfo: + dateinfo = self._real_date_info_dict.get(real) + if not dateinfo: + seg = self._real_row_segment_dict[real] + dates = self._table["DATE"].slice(seg.start_row, seg.row_count).to_numpy() + + if ( + self.shared_sample_dates_np is not None + and self.shared_sample_dates_np_as_uint is not None + ): + dateinfo = InterpolationHelper.DateInfo( + raw_dates_np_as_uint=dates.astype(np.uint64), + sample_dates_np=self.shared_sample_dates_np, + sample_dates_np_as_uint=self.shared_sample_dates_np_as_uint, + ) + else: + min_raw_date = np.min(dates) + max_raw_date = np.max(dates) + sample_dates = generate_normalized_sample_dates( + min_raw_date, max_raw_date, self._frequency + ) + + dateinfo = InterpolationHelper.DateInfo( + raw_dates_np_as_uint=dates.astype(np.uint64), + sample_dates_np=sample_dates, + sample_dates_np_as_uint=sample_dates.astype(np.uint64), + ) + + self._real_date_info_dict[real] = dateinfo + + return dateinfo + + def row_segment(self, real: int) -> Tuple[int, int]: + segment = self._real_row_segment_dict[real] + return (segment.start_row, segment.row_count) -def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table: +def resample_segmented_multi_real_table( + table: pa.Table, freq: Frequency, common_date_span: Optional[DateSpan] +) -> pa.Table: """Resample table containing multiple realizations. The table must contain both a REAL and a DATE column. 
 
 
-def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
+def resample_segmented_multi_real_table(
+    table: pa.Table, freq: Frequency, common_date_span: Optional[DateSpan]
+) -> pa.Table:
     """Resample table containing multiple realizations.
     The table must contain both a REAL and a DATE column.
-    The table must be segmented on REAL (so that all rows from a single
-    realization are contiguous) and within each REAL segment, it must be
-    sorted on DATE.
+    The table must be segmented on REAL (so that all rows from a single realization are
+    contiguous) and within each REAL segment, it must be sorted on DATE.
     The segmentation is needed since interpolations must be done per realization
     and we utilize slicing on rows for speed.
     """
+    # pylint: disable=too-many-locals
 
-    real_arr_np = table.column("REAL").to_numpy()
-    unique_reals, first_occurrence_idx, real_counts = np.unique(
-        real_arr_np, return_index=True, return_counts=True
-    )
+    helper = InterpolationHelper(table, freq, common_date_span)
+    unique_reals = helper.unique_reals()
 
     output_columns_dict: Dict[str, pa.ChunkedArray] = {}
 
-    real_interpolation_info_dict: Dict[int, RealInterpolationInfo] = {}
-
     for colname in table.schema.names:
         if colname in ["DATE", "REAL"]:
             continue
@@ -198,16 +380,9 @@ def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
         raw_whole_numpy_arr = table.column(colname).to_numpy()
 
         vec_arr_list = []
-        for i, real in enumerate(unique_reals):
-            start_row_idx = first_occurrence_idx[i]
-            row_count = real_counts[i]
-
-            rii = real_interpolation_info_dict.get(real)
-            if not rii:
-                rii = _extract_real_interpolation_info(
-                    table, start_row_idx, row_count, freq
-                )
-                real_interpolation_info_dict[real] = rii
+        for real in unique_reals:
+            start_row_idx, row_count = helper.row_segment(real)
+            dateinfo = helper.date_info(real)
 
             raw_numpy_arr = raw_whole_numpy_arr[
                 start_row_idx : start_row_idx + row_count
             ]
 
             if is_rate:
                 inter = interpolate_backfill(
-                    rii.sample_dates_np_as_uint,
-                    rii.raw_dates_np_as_uint,
+                    dateinfo.sample_dates_np_as_uint,
+                    dateinfo.raw_dates_np_as_uint,
                     raw_numpy_arr,
                     0,
                     0,
                 )
             else:
                 inter = np.interp(
-                    rii.sample_dates_np_as_uint,
-                    rii.raw_dates_np_as_uint,
+                    dateinfo.sample_dates_np_as_uint,
+                    dateinfo.raw_dates_np_as_uint,
                     raw_numpy_arr,
                 )
 
-            arr_length = len(rii.sample_dates_np_as_uint)
+            arr_length = len(dateinfo.sample_dates_np_as_uint)
             assert arr_length == len(inter)
 
             vec_arr_list.append(inter)
@@ -238,9 +413,9 @@ def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
     date_arr_list = []
     real_arr_list = []
     for real in unique_reals:
-        rii = real_interpolation_info_dict[real]
-        arr_length = len(rii.sample_dates_np)
-        date_arr_list.append(rii.sample_dates_np)
+        dateinfo = helper.date_info(real)
+        arr_length = len(dateinfo.sample_dates_np)
+        date_arr_list.append(dateinfo.sample_dates_np)
         real_arr_list.append(np.full(arr_length, real))
 
     output_columns_dict["DATE"] = pa.chunked_array(date_arr_list)
diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/_table_utils.py b/webviz_subsurface/_providers/ensemble_summary_provider/_table_utils.py
index 4fd834f2c..4b02e473a 100644
--- a/webviz_subsurface/_providers/ensemble_summary_provider/_table_utils.py
+++ b/webviz_subsurface/_providers/ensemble_summary_provider/_table_utils.py
@@ -1,5 +1,5 @@
 import json
-from typing import Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import pyarrow as pa
@@ -47,10 +47,12 @@ def get_per_vector_min_max_from_schema_metadata(schema: pa.Schema) -> Dict[str,
     return webviz_meta[_PER_VECTOR_MIN_MAX_KEY]
 
 
-def find_intersected_dates_between_realizations(table: pa.Table) -> np.ndarray:
-    """Find the intersection of dates present in all the realizations
+def find_intersection_of_realization_dates(table: pa.Table) -> np.ndarray:
+    """Find the intersection of dates present in all the realizations.
     The input table must contain both REAL and DATE columns, but this function makes
-    no assumptions about sorting of either column"""
+    no assumptions about sorting of either column.
+    The returned array of dates will always be sorted.
+    """
 
     unique_reals = table.column("REAL").unique().to_numpy()
 
@@ -60,8 +62,9 @@
         real_mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
         dates_in_real = table.filter(real_mask).column("DATE").unique().to_numpy()
         if date_intersection is None:
-            date_intersection = dates_in_real
+            date_intersection = np.sort(dates_in_real)
         else:
+            # The intersection returned by intersect1d() is sorted
             date_intersection = np.intersect1d(
                 date_intersection, dates_in_real, assume_unique=True
             )
@@ -70,3 +73,39 @@
         return date_intersection
 
     return np.empty(0, dtype=np.datetime64)
+ """ + + unique_dates = table.column("DATE").unique().to_numpy() + return np.sort(unique_dates) + + +def find_min_max_date_per_realization( + table: pa.Table, +) -> List[Tuple[np.datetime64, np.datetime64]]: + """Find the min and max dates within each realization in the table.""" + + unique_reals = table.column("REAL").unique().to_numpy() + + minmax_np_list: List[Tuple[np.datetime64, np.datetime64]] = [] + + for real in unique_reals: + # pylint: disable=no-member + real_table = table.filter(pc.equal(table["REAL"], real)) + minmax = pc.min_max(real_table["DATE"]) + + # Make sure we have pa.timestamp("ms") as datatype for the actual scalar values + # in the returned StructScalar before we convert to numpy + minmax_np_list.append( + ( + np.datetime64(minmax.get("min").cast(pa.timestamp("ms")).value, "ms"), + np.datetime64(minmax.get("max").cast(pa.timestamp("ms")).value, "ms"), + ), + ) + + return minmax_np_list diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/dev_compare_fmu_to_lazy_provider.py b/webviz_subsurface/_providers/ensemble_summary_provider/dev_compare_fmu_to_lazy_provider.py index 48fe5a0b5..c5b75541e 100644 --- a/webviz_subsurface/_providers/ensemble_summary_provider/dev_compare_fmu_to_lazy_provider.py +++ b/webviz_subsurface/_providers/ensemble_summary_provider/dev_compare_fmu_to_lazy_provider.py @@ -13,7 +13,7 @@ import pandas as pd from fmu.ensemble import ScratchEnsemble -from .ensemble_summary_provider import Frequency +from .ensemble_summary_provider import DateSpan, Frequency, ResamplingOptions from .ensemble_summary_provider_factory import ( EnsembleSummaryProvider, EnsembleSummaryProviderFactory, @@ -150,14 +150,14 @@ def _load_smry_dataframe_using_ecl2df( def _compare_reference_df_to_provider_get_vectors_df( reference_df: pd.DataFrame, provider: EnsembleSummaryProvider, - frequency: Optional[Frequency], + resampling_options: Optional[ResamplingOptions], ) -> None: reference_df = reference_df.reset_index(drop=True) # print(ref_df) print("## Getting data for all vectors from provider...") - provider_df = provider.get_vectors_df(provider.vector_names(), frequency) + provider_df = provider.get_vectors_df(provider.vector_names(), resampling_options) provider_df.sort_values(by=["REAL", "DATE"], inplace=True) provider_df.reset_index(drop=True, inplace=True) # print(provider_df) @@ -252,6 +252,7 @@ def main() -> None: root_storage_dir = Path("/home/sigurdp/buf/webviz_storage_dir") + rel_file_pattern = "share/results/unsmry/*.arrow" ensemble_path = "../webviz-subsurface-testdata/01_drogon_ahm/realization-*/iter-0" # ensemble_path = ( # "../webviz-subsurface-testdata/01_drogon_design/realization-*/iter-0" @@ -274,10 +275,10 @@ def main() -> None: factory = EnsembleSummaryProviderFactory( root_storage_dir, allow_storage_writes=True ) - provider = factory.create_from_arrow_unsmry_lazy( - ens_path=ensemble_path, rel_file_pattern="share/results/unsmry/*.arrow" - ) - # provider = factory.create_from_arrow_unsmry_presampled(ensemble_path, frequency) + provider = factory.create_from_arrow_unsmry_lazy(ensemble_path, rel_file_pattern) + # provider = factory.create_from_arrow_unsmry_presampled( + # ensemble_path, rel_file_pattern, frequency + # ) print("## Loading data into reference DataFrame...") # Note that for version 2.13.0 and earlier of ecl, loading via FMU will not give the @@ -286,9 +287,15 @@ def main() -> None: reference_df = _load_smry_dataframe_using_fmu(ensemble_path, frequency) print("## Comparing get_vectors()...") - resampling_frequency = frequency if 
provider.supports_resampling() else None + + resampling_options: Optional[ResamplingOptions] = None + if provider.supports_resampling(): + resampling_options = ResamplingOptions( + frequency=frequency, common_date_span=DateSpan.UNION + ) + _compare_reference_df_to_provider_get_vectors_df( - reference_df, provider, resampling_frequency + reference_df, provider, resampling_options ) print("## Comparing get_vectors_for date()...") diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/dev_provider_perf_testing.py b/webviz_subsurface/_providers/ensemble_summary_provider/dev_provider_perf_testing.py index 889e53a88..bdfe8e5ff 100644 --- a/webviz_subsurface/_providers/ensemble_summary_provider/dev_provider_perf_testing.py +++ b/webviz_subsurface/_providers/ensemble_summary_provider/dev_provider_perf_testing.py @@ -4,13 +4,17 @@ from pathlib import Path from typing import Optional -from .ensemble_summary_provider import EnsembleSummaryProvider, Frequency +from .ensemble_summary_provider import ( + EnsembleSummaryProvider, + Frequency, + ResamplingOptions, +) from .ensemble_summary_provider_factory import EnsembleSummaryProviderFactory def _get_n_vectors_one_by_one_all_realizations( provider: EnsembleSummaryProvider, - resampling_frequency: Optional[Frequency], + resampling_options: Optional[ResamplingOptions], num_vectors: int, ) -> None: @@ -18,10 +22,12 @@ def _get_n_vectors_one_by_one_all_realizations( num_vectors = min(num_vectors, len(all_vectors)) vectors_to_get = all_vectors[0:num_vectors] + freq_in_use = resampling_options.frequency if resampling_options else None + print("## ------------------") print( f"## entering _get_n_vectors_one_by_one_all_realizations(" - f"{resampling_frequency}, {num_vectors}) ..." + f"{freq_in_use}, {num_vectors}) ..." ) start_tim = time.perf_counter() @@ -29,7 +35,7 @@ def _get_n_vectors_one_by_one_all_realizations( accum_rows = 0 accum_cols = 0 for vec_name in vectors_to_get: - df = provider.get_vectors_df([vec_name], resampling_frequency) + df = provider.get_vectors_df([vec_name], resampling_options) accum_rows += df.shape[0] accum_cols += df.shape[1] @@ -52,7 +58,7 @@ def _get_n_vectors_one_by_one_all_realizations( def _get_n_vectors_in_batch_all_realizations( provider: EnsembleSummaryProvider, - resampling_frequency: Optional[Frequency], + resampling_options: Optional[ResamplingOptions], num_vectors: int, ) -> None: @@ -60,15 +66,17 @@ def _get_n_vectors_in_batch_all_realizations( num_vectors = min(num_vectors, len(all_vectors)) vectors_to_get = all_vectors[0:num_vectors] + freq_in_use = resampling_options.frequency if resampling_options else None + print("## ------------------") print( f"## entering _get_n_vectors_in_batch_all_realizations(" - f"{resampling_frequency}, {num_vectors}) ..." + f"{freq_in_use}, {num_vectors}) ..." 
     )

     start_tim = time.perf_counter()

-    df = provider.get_vectors_df(vectors_to_get, resampling_frequency)
+    df = provider.get_vectors_df(vectors_to_get, resampling_options)

     elapsed_time_ms = 1000 * (time.perf_counter() - start_tim)

@@ -176,15 +184,21 @@ def _run_perf_tests(

     print()

+    resampling_options: Optional[ResamplingOptions] = None
+    if resampling_frequency:
+        resampling_options = ResamplingOptions(
+            frequency=resampling_frequency, common_date_span=None
+        )
+
     _get_meta_for_n_vectors_one_by_one(provider, 10)
     _get_vector_names_filtered_by_value(provider)

-    _get_n_vectors_one_by_one_all_realizations(provider, resampling_frequency, 1)
-    _get_n_vectors_one_by_one_all_realizations(provider, resampling_frequency, 50)
+    _get_n_vectors_one_by_one_all_realizations(provider, resampling_options, 1)
+    _get_n_vectors_one_by_one_all_realizations(provider, resampling_options, 50)

-    _get_n_vectors_in_batch_all_realizations(provider, resampling_frequency, 50)
-    _get_n_vectors_in_batch_all_realizations(provider, resampling_frequency, 99999)
+    _get_n_vectors_in_batch_all_realizations(provider, resampling_options, 50)
+    _get_n_vectors_in_batch_all_realizations(provider, resampling_options, 99999)

     num_vecs = 100
     _get_n_vectors_for_date_all_realizations(
diff --git a/webviz_subsurface/_providers/ensemble_summary_provider/ensemble_summary_provider.py b/webviz_subsurface/_providers/ensemble_summary_provider/ensemble_summary_provider.py
index 493b025f9..7283a376b 100644
--- a/webviz_subsurface/_providers/ensemble_summary_provider/ensemble_summary_provider.py
+++ b/webviz_subsurface/_providers/ensemble_summary_provider/ensemble_summary_provider.py
@@ -22,6 +22,32 @@ def from_string_value(cls, value: str) -> Optional["Frequency"]:
         return None


+class DateSpan(Enum):
+    INTERSECTION = "intersection"
+    UNION = "union"
+
+
+@dataclass(frozen=True)
+class ResamplingOptions:
+    """Specifies resampling options, most notably the resampling frequency.
+    Can also specify a `common_date_span` that will influence which dates get included
+    in the returned time series data.
+    * DateSpan.INTERSECTION - truncates the returned range of dates so that all returned
+    realizations have the same date range. The returned range of dates will be
+    the intersection of the date ranges available per requested realization.
+    * DateSpan.UNION - extends the returned range of dates so that all realizations
+    have the same date range and the same dates. The returned range of dates will be the
+    union of the date ranges available per requested realization. Vector values will
+    be extrapolated.
+    * None - each returned realization will contain dates according to the requested
+    frequency, but no effort will be made to truncate or expand the total date range
+    in order to align start and end dates between realizations.
+    """
+
+    frequency: Frequency
+    common_date_span: Optional[DateSpan] = None
+
+
 @dataclass(frozen=True)
 class VectorMetadata:
     unit: str
@@ -67,7 +93,8 @@ def vector_metadata(self, vector_name: str) -> Optional[VectorMetadata]:
     def supports_resampling(self) -> bool:
         """Returns True if this provider supports resampling, otherwise False.
         A provider that doesn't support resampling will only accept None as value for
-        the resampling_frequency parameter in `dates()` and `get_vectors_df()`.
+        the resampling_frequency parameter in `dates()` and the `resampling_options`
+        parameter in `get_vectors_df()`.
         """
         ...
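To make the `ResamplingOptions`/`DateSpan` interface above concrete, a minimal usage sketch (not part of the diff; `root_storage_dir` and `ens_path` are placeholder values, `rel_file_pattern` mirrors the dev script above, and `FOPR` is just an example vector name):

    from pathlib import Path

    from webviz_subsurface._providers import (
        DateSpan,
        EnsembleSummaryProviderFactory,
        Frequency,
        ResamplingOptions,
    )

    root_storage_dir = Path("/tmp/webviz_storage_dir")  # placeholder
    ens_path = "../mydata/realization-*/iter-0"  # placeholder
    rel_file_pattern = "share/results/unsmry/*.arrow"

    factory = EnsembleSummaryProviderFactory(root_storage_dir, allow_storage_writes=True)
    provider = factory.create_from_arrow_unsmry_lazy(ens_path, rel_file_pattern)

    # Lazy monthly resampling; DateSpan.UNION extends every requested realization
    # to the union of their date ranges so all realizations share the same dates
    options = ResamplingOptions(frequency=Frequency.MONTHLY, common_date_span=DateSpan.UNION)
    vectors_df = provider.get_vectors_df(["FOPR"], options, realizations=[0, 1, 2])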
@@ -75,14 +102,11 @@ def supports_resampling(self) -> bool:
     def dates(
         self,
         resampling_frequency: Optional[Frequency],
+        date_span: DateSpan = DateSpan.UNION,
         realizations: Optional[Sequence[int]] = None,
     ) -> List[datetime.datetime]:
-        """Returns the intersection of available dates.
-        Note that when resampling_frequency is None, the pure intersection of the
-        stored raw dates will be returned. Thus the returned list of dates will not include
-        dates from long running realizations.
-        For other resampling frequencies, the date range will be expanded to cover the entire
-        time range of all the requested realizations before computing the resampled dates.
+        """Returns the intersection or union of available dates for the specified
+        realizations, depending on the specified `date_span` value.
         """
         ...

@@ -90,15 +114,16 @@ def dates(
     def get_vectors_df(
         self,
         vector_names: Sequence[str],
-        resampling_frequency: Optional[Frequency],
+        resampling_options: Optional[ResamplingOptions],
         realizations: Optional[Sequence[int]] = None,
     ) -> pd.DataFrame:
         """Returns a Pandas DataFrame with data for the vectors specified in `vector_names.`
-        For a provider that supports resampling, the `resampling_frequency` parameter
-        controls the sampling frequency of the returned data. If `resampling_frequency` is
-        None, the data will be returned with full/raw resolution.
-        For a provider that does not support resampling, the `resampling_frequency` parameter
+        For a provider that supports resampling, the `resampling_options` parameter
+        object controls the sampling frequency of the returned data and the number of
+        dates to return for each realization.
+        If `resampling_options` is None, the data will be returned with full/raw resolution.
+        For a provider that does not support resampling, the `resampling_options` parameter
         must always be None, otherwise an exception will be raised.
The returned DataFrame will always contain a 'DATE' and 'REAL' column in addition diff --git a/webviz_subsurface/_utils/vector_calculator.py b/webviz_subsurface/_utils/vector_calculator.py index 1f728309d..6b7790a09 100644 --- a/webviz_subsurface/_utils/vector_calculator.py +++ b/webviz_subsurface/_utils/vector_calculator.py @@ -14,7 +14,7 @@ VectorDefinition, ) -from webviz_subsurface._providers import EnsembleSummaryProvider, Frequency +from webviz_subsurface._providers import EnsembleSummaryProvider, ResamplingOptions from .vector_selector import ( add_vector_to_vector_selector_data, @@ -187,7 +187,7 @@ def expressions_from_config( "variableVectorMap": variable_vector_map_from_dict( expressions[expression]["variableVectorMap"] ), - "isValid": False, # Set False and validate in seperate operation + "isValid": False, # Set False and validate in separate operation "isDeletable": False, } if "description" in expressions[expression]: @@ -284,7 +284,7 @@ def create_calculated_vector_df( expression: ExpressionInfo, provider: EnsembleSummaryProvider, realizations: Optional[Sequence[int]], - resampling_frequency: Optional[Frequency], + resampling_options: Optional[ResamplingOptions], ) -> pd.DataFrame: """Create dataframe with calculated vector from expression @@ -304,9 +304,7 @@ def create_calculated_vector_df( vector_names = list(variable_vector_dict.values()) # Retrieve data for vectors in expression - vectors_df = provider.get_vectors_df( - vector_names, resampling_frequency, realizations - ) + vectors_df = provider.get_vectors_df(vector_names, resampling_options, realizations) values: Dict[str, np.ndarray] = {} for variable, vector in variable_vector_dict.items(): diff --git a/webviz_subsurface/plugins/_simulation_time_series/_callbacks.py b/webviz_subsurface/plugins/_simulation_time_series/_callbacks.py index cdf51f905..b4715a294 100644 --- a/webviz_subsurface/plugins/_simulation_time_series/_callbacks.py +++ b/webviz_subsurface/plugins/_simulation_time_series/_callbacks.py @@ -16,7 +16,7 @@ VectorDefinition, ) -from webviz_subsurface._providers import Frequency +from webviz_subsurface._providers import DateSpan, Frequency, ResamplingOptions from webviz_subsurface._utils.formatting import printable_int_list from webviz_subsurface._utils.unique_theming import unique_colors from webviz_subsurface._utils.vector_calculator import ( @@ -216,6 +216,24 @@ def _update_graph( ): raise PreventUpdate + # Create resampling options + date_span = ( + DateSpan.UNION + if visualization + in [ + VisualizationOptions.FANCHART, + VisualizationOptions.STATISTICS, + VisualizationOptions.STATISTICS_AND_REALIZATIONS, + ] + or relative_date + else None + ) + resampling_options = ( + ResamplingOptions(resampling_frequency, date_span) + if resampling_frequency + else None + ) + # Create dict of derived vectors accessors for selected ensembles derived_vectors_accessors: Dict[ str, DerivedVectorsAccessor @@ -225,7 +243,7 @@ def _update_graph( provider_set=input_provider_set, expressions=selected_expressions, delta_ensembles=delta_ensembles, - resampling_frequency=resampling_frequency, + resampling_options=resampling_options, relative_date=relative_date, ) @@ -528,6 +546,24 @@ def _user_download_data( else datetime_utils.from_str(relative_date_value) ) + # Create resampling options + date_span = ( + DateSpan.UNION + if visualization + in [ + VisualizationOptions.FANCHART, + VisualizationOptions.STATISTICS, + VisualizationOptions.STATISTICS_AND_REALIZATIONS, + ] + or relative_date + else None + ) + resampling_options = 
( + ResamplingOptions(resampling_frequency, date_span) + if resampling_frequency + else None + ) + # Create dict of derived vectors accessors for selected ensembles derived_vectors_accessors: Dict[ str, DerivedVectorsAccessor @@ -537,7 +573,7 @@ def _user_download_data( provider_set=input_provider_set, expressions=selected_expressions, delta_ensembles=delta_ensembles, - resampling_frequency=resampling_frequency, + resampling_options=resampling_options, relative_date=relative_date, ) diff --git a/webviz_subsurface/plugins/_simulation_time_series/types/derived_delta_ensemble_vectors_accessor_impl.py b/webviz_subsurface/plugins/_simulation_time_series/types/derived_delta_ensemble_vectors_accessor_impl.py index a9572ac87..8e6e6e439 100644 --- a/webviz_subsurface/plugins/_simulation_time_series/types/derived_delta_ensemble_vectors_accessor_impl.py +++ b/webviz_subsurface/plugins/_simulation_time_series/types/derived_delta_ensemble_vectors_accessor_impl.py @@ -4,7 +4,11 @@ import pandas as pd from webviz_subsurface_components import ExpressionInfo -from webviz_subsurface._providers import EnsembleSummaryProvider, Frequency +from webviz_subsurface._providers import ( + DateSpan, + EnsembleSummaryProvider, + ResamplingOptions, +) from webviz_subsurface._utils.dataframe_utils import make_date_column_datetime_object from webviz_subsurface._utils.vector_calculator import ( create_calculated_vector_df, @@ -42,7 +46,7 @@ def __init__( provider_pair: Tuple[EnsembleSummaryProvider, EnsembleSummaryProvider], vectors: List[str], expressions: Optional[List[ExpressionInfo]] = None, - resampling_frequency: Optional[Frequency] = None, + resampling_options: Optional[ResamplingOptions] = None, relative_date: Optional[datetime.datetime] = None, ) -> None: if len(provider_pair) != 2: @@ -95,20 +99,25 @@ def __init__( else [] ) - # Set resampling frequency - self._resampling_frequency = ( - resampling_frequency + self._relative_date = relative_date + + # Set resampling options + self._resampling_options = ( + resampling_options if self._provider_a.supports_resampling() and self._provider_b.supports_resampling() else None ) - self._relative_date = relative_date + # Make date span union if calculating data relative to date + if self._relative_date and self._resampling_options: + self._resampling_options = ResamplingOptions( + self._resampling_options.frequency, common_date_span=DateSpan.UNION + ) def __create_delta_ensemble_vectors_df( self, vector_names: List[str], - resampling_frequency: Optional[Frequency], realizations: Optional[Sequence[int]] = None, ) -> pd.DataFrame: """ @@ -122,7 +131,6 @@ def __create_delta_ensemble_vectors_df( `Input:` * vector_names: List[str] - List of vector names to get data for - * resampling_frequency: Optional[Frequency] - Optional resampling frequency * realizations: Optional[Sequence[int]] - Optional sequence of realization numbers for vectors @@ -138,10 +146,10 @@ def __create_delta_ensemble_vectors_df( # NOTE: index order ["DATE","REAL"] to obtain column order when # performing reset_index() later ensemble_a_vectors_df = self._provider_a.get_vectors_df( - vector_names, resampling_frequency, realizations + vector_names, self._resampling_options, realizations ).set_index(["DATE", "REAL"]) ensemble_b_vectors_df = self._provider_b.get_vectors_df( - vector_names, resampling_frequency, realizations + vector_names, self._resampling_options, realizations ).set_index(["DATE", "REAL"]) # Reset index, sort values by "REAL" and thereafter by "DATE" to @@ -187,12 +195,12 @@ def 
get_provider_vectors_df( if self._relative_date: return dataframe_utils.create_relative_to_date_df( self.__create_delta_ensemble_vectors_df( - self._provider_vectors, self._resampling_frequency, realizations + self._provider_vectors, realizations ), self._relative_date, ) return self.__create_delta_ensemble_vectors_df( - self._provider_vectors, self._resampling_frequency, realizations + self._provider_vectors, realizations ) def create_per_interval_and_per_day_vectors_df( @@ -234,7 +242,7 @@ def create_per_interval_and_per_day_vectors_df( cumulative_vector_names = list(sorted(set(cumulative_vector_names))) vectors_df = self.__create_delta_ensemble_vectors_df( - cumulative_vector_names, self._resampling_frequency, realizations + cumulative_vector_names, realizations ) per_interval_and_per_day_vectors_df = pd.DataFrame() @@ -296,10 +304,10 @@ def create_calculated_vectors_df( provider_b_calculated_vectors_df = pd.DataFrame() for expression in self._vector_calculator_expressions: provider_a_calculated_vector_df = create_calculated_vector_df( - expression, self._provider_a, realizations, self._resampling_frequency + expression, self._provider_a, realizations, self._resampling_options ) provider_b_calculated_vector_df = create_calculated_vector_df( - expression, self._provider_b, realizations, self._resampling_frequency + expression, self._provider_b, realizations, self._resampling_options ) if ( diff --git a/webviz_subsurface/plugins/_simulation_time_series/types/derived_ensemble_vectors_accessor_impl.py b/webviz_subsurface/plugins/_simulation_time_series/types/derived_ensemble_vectors_accessor_impl.py index 67e9c6a1a..c51fcf497 100644 --- a/webviz_subsurface/plugins/_simulation_time_series/types/derived_ensemble_vectors_accessor_impl.py +++ b/webviz_subsurface/plugins/_simulation_time_series/types/derived_ensemble_vectors_accessor_impl.py @@ -4,7 +4,11 @@ import pandas as pd from webviz_subsurface_components import ExpressionInfo -from webviz_subsurface._providers import EnsembleSummaryProvider, Frequency +from webviz_subsurface._providers import ( + DateSpan, + EnsembleSummaryProvider, + ResamplingOptions, +) from webviz_subsurface._utils.vector_calculator import ( create_calculated_vector_df, get_selected_expressions, @@ -41,7 +45,7 @@ def __init__( provider: EnsembleSummaryProvider, vectors: List[str], expressions: Optional[List[ExpressionInfo]] = None, - resampling_frequency: Optional[Frequency] = None, + resampling_options: Optional[ResamplingOptions] = None, relative_date: Optional[datetime.datetime] = None, ) -> None: # Initialize base class @@ -63,10 +67,17 @@ def __init__( if expressions is not None else [] ) - self._resampling_frequency = ( - resampling_frequency if self._provider.supports_resampling() else None - ) + self._relative_date = relative_date + self._resampling_options = ( + resampling_options if self._provider.supports_resampling() else None + ) + + # Make date span union if calculating data relative to date + if self._relative_date and self._resampling_options: + self._resampling_options = ResamplingOptions( + self._resampling_options.frequency, common_date_span=DateSpan.UNION + ) def has_provider_vectors(self) -> bool: return len(self._provider_vectors) > 0 @@ -89,12 +100,12 @@ def get_provider_vectors_df( if self._relative_date: return dataframe_utils.create_relative_to_date_df( self._provider.get_vectors_df( - self._provider_vectors, self._resampling_frequency, realizations + self._provider_vectors, self._resampling_options, realizations ), self._relative_date, ) 
        return self._provider.get_vectors_df(
-            self._provider_vectors, self._resampling_frequency, realizations
+            self._provider_vectors, self._resampling_options, realizations
         )

     def create_per_interval_and_per_day_vectors_df(
@@ -136,7 +147,7 @@ def create_per_interval_and_per_day_vectors_df(
         cumulative_vector_names = list(sorted(set(cumulative_vector_names)))

         vectors_df = self._provider.get_vectors_df(
-            cumulative_vector_names, self._resampling_frequency, realizations
+            cumulative_vector_names, self._resampling_options, realizations
         )

         per_interval_and_per_day_vectors_df = pd.DataFrame()
@@ -191,7 +202,7 @@ def create_calculated_vectors_df(
         calculated_vectors_df = pd.DataFrame()
         for expression in self._vector_calculator_expressions:
             calculated_vector_df = create_calculated_vector_df(
-                expression, self._provider, realizations, self._resampling_frequency
+                expression, self._provider, realizations, self._resampling_options
             )
             if calculated_vectors_df.empty:
                 calculated_vectors_df = calculated_vector_df
diff --git a/webviz_subsurface/plugins/_simulation_time_series/types/provider_set.py b/webviz_subsurface/plugins/_simulation_time_series/types/provider_set.py
index 272e76d0e..a48295f31 100644
--- a/webviz_subsurface/plugins/_simulation_time_series/types/provider_set.py
+++ b/webviz_subsurface/plugins/_simulation_time_series/types/provider_set.py
@@ -3,6 +3,7 @@ from typing import Dict, ItemsView, List, Optional, Sequence, Set

 from webviz_subsurface._providers import (
+    DateSpan,
     EnsembleSummaryProvider,
     EnsembleSummaryProviderFactory,
     Frequency,
@@ -110,7 +111,7 @@ def all_dates(
         # TODO: Adjust when providers are updated!
         dates_union: Set[datetime.datetime] = set()
         for provider in self.all_providers():
-            _dates = set(provider.dates(resampling_frequency, None))
+            _dates = set(provider.dates(resampling_frequency, DateSpan.UNION, None))
             dates_union.update(_dates)
         return list(sorted(dates_union))
diff --git a/webviz_subsurface/plugins/_simulation_time_series/utils/derived_ensemble_vectors_accessor_utils.py b/webviz_subsurface/plugins/_simulation_time_series/utils/derived_ensemble_vectors_accessor_utils.py
index 7a74ffbfc..69edf09f4 100644
--- a/webviz_subsurface/plugins/_simulation_time_series/utils/derived_ensemble_vectors_accessor_utils.py
+++ b/webviz_subsurface/plugins/_simulation_time_series/utils/derived_ensemble_vectors_accessor_utils.py
@@ -3,7 +3,7 @@

 from webviz_subsurface_components import ExpressionInfo

-from webviz_subsurface._providers import Frequency
+from webviz_subsurface._providers import ResamplingOptions

 from ..types import (
     DeltaEnsemble,
@@ -25,7 +25,7 @@ def create_derived_vectors_accessor_dict(
     provider_set: ProviderSet,
     expressions: List[ExpressionInfo],
     delta_ensembles: List[DeltaEnsemble],
-    resampling_frequency: Optional[Frequency],
+    resampling_options: Optional[ResamplingOptions],
     relative_date: Optional[datetime.datetime],
 ) -> Dict[str, DerivedVectorsAccessor]:
     """Create dictionary with ensemble name as key and derived vectors accessor
@@ -44,8 +44,10 @@ def create_derived_vectors_accessor_dict(
     * provider_set: ProviderSet - set of EnsembleSummaryProviders to obtain vector data
     * expressions: List[ExpressionInfo] - list of expressions for calculating vectors
     * delta_ensembles: List[DeltaEnsemble] - list of created delta ensembles
-    * resampling_frequency: Optional[Frequency] - Resampling frequency setting for
+    * resampling_options: Optional[ResamplingOptions] - Resampling options setting for
       EnsembleSummaryProviders
+    * relative_date: Optional[datetime.datetime] - Relative date to be subtracted from
+      the respective vector data.

     `Return:`
     * Dict[str, DerivedVectorsAccessor] - dictionary with ensemble name as key and
@@ -64,7 +66,7 @@ def create_derived_vectors_accessor_dict(
             provider=provider_set.provider(ensemble),
             vectors=vectors,
             expressions=expressions,
-            resampling_frequency=resampling_frequency,
+            resampling_options=resampling_options,
             relative_date=relative_date,
         )
     elif (
@@ -83,7 +85,7 @@ def create_derived_vectors_accessor_dict(
             provider_pair=provider_pair,
             vectors=vectors,
             expressions=expressions,
-            resampling_frequency=resampling_frequency,
+            resampling_options=resampling_options,
             relative_date=relative_date,
         )
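The conversion from an optional `Frequency` into an `Optional[ResamplingOptions]` recurs in `_callbacks.py` and in both accessor implementations above. A condensed sketch of that shared logic (the helper name `_make_resampling_options` is hypothetical, not part of the diff):

    import datetime
    from typing import Optional

    from webviz_subsurface._providers import DateSpan, Frequency, ResamplingOptions

    def _make_resampling_options(
        resampling_frequency: Optional[Frequency],
        needs_common_dates: bool,
        relative_date: Optional[datetime.datetime],
    ) -> Optional[ResamplingOptions]:
        # Without a frequency the provider returns raw resolution, and the
        # options argument passed to get_vectors_df() must then be None
        if resampling_frequency is None:
            return None
        # Statistics/fanchart visualizations and relative-to-date calculations
        # require all realizations to share the same dates, hence DateSpan.UNION
        date_span = (
            DateSpan.UNION if needs_common_dates or relative_date is not None else None
        )
        return ResamplingOptions(frequency=resampling_frequency, common_date_span=date_span)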
diff --git a/webviz_subsurface/plugins/_simulation_time_series/utils/history_vectors.py b/webviz_subsurface/plugins/_simulation_time_series/utils/history_vectors.py
index 772d4747f..408718b4a 100644
--- a/webviz_subsurface/plugins/_simulation_time_series/utils/history_vectors.py
+++ b/webviz_subsurface/plugins/_simulation_time_series/utils/history_vectors.py
@@ -3,7 +3,11 @@
 import pandas as pd

 from webviz_subsurface._abbreviations.reservoir_simulation import historical_vector
-from webviz_subsurface._providers import EnsembleSummaryProvider, Frequency
+from webviz_subsurface._providers import (
+    EnsembleSummaryProvider,
+    Frequency,
+    ResamplingOptions,
+)


 def create_history_vectors_df(
@@ -60,7 +64,14 @@ def create_history_vectors_df(
         return pd.DataFrame()

     historical_vector_names = list(historical_vector_and_vector_name_dict.keys())
+
+    # NOTE: common_date_span=None since only a single realization is requested
+    resampling_options = (
+        ResamplingOptions(resampling_frequency, common_date_span=None)
+        if resampling_frequency
+        else None
+    )
     historical_vectors_df = provider.get_vectors_df(
-        historical_vector_names, resampling_frequency, realizations=[realization]
+        historical_vector_names, resampling_options, realizations=[realization]
     )
     return historical_vectors_df.rename(columns=historical_vector_and_vector_name_dict)
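To illustrate the per-realization resampling done by `resample_segmented_multi_real_table()` earlier in this diff: non-rate vectors are interpolated linearly onto the sample dates via `np.interp`, while rate vectors go through `interpolate_backfill()` (a stepwise backfill, where the two `0` arguments presumably fill values outside the raw date range). A self-contained NumPy sketch of the non-rate path, assuming dates are handled as uint64 views of millisecond timestamps as in the `*_np_as_uint` arrays:

    import numpy as np

    # One realization's raw dates and values for a non-rate vector
    raw_dates = np.array(["2020-01-01", "2020-01-10", "2020-02-01"], dtype="datetime64[ms]")
    raw_values = np.array([0.0, 9.0, 31.0])

    # Sample dates produced by the requested resampling frequency
    sample_dates = np.array(["2020-01-01", "2020-01-15", "2020-02-01"], dtype="datetime64[ms]")

    # np.interp() operates on plain numbers, hence the uint64 view of the timestamps
    interpolated = np.interp(
        sample_dates.astype(np.uint64),
        raw_dates.astype(np.uint64),
        raw_values,
    )
    print(interpolated)  # [ 0. 14. 31.] -- linear in time between the raw samples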