Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix precision loss issue in calculation of standard deviation. #9625

Merged
merged 2 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ert/analysis/_es_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def _load_observations_and_responses(

# Identifies non-outlier observations based on responses.
ens_mean = S.mean(axis=1)
ens_std = S.std(ddof=0, axis=1)
ens_std = S.std(ddof=0, axis=1, dtype=np.float64).astype(np.float32)
ens_std_mask = ens_std > std_cutoff
ens_mean_mask = abs(observations - ens_mean) <= alpha * (ens_std + scaled_errors)
obs_mask = np.logical_and(ens_mean_mask, ens_std_mask)
Expand Down
241 changes: 241 additions & 0 deletions tests/ert/ui_tests/cli/test_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
from __future__ import annotations

import os
import stat
import warnings
from pathlib import Path

import hypothesis.strategies as st
import numpy as np
from hypothesis import given, note, settings
from pytest import MonkeyPatch, TempPathFactory

from ert.cli.main import ErtCliError
from ert.config.gen_kw_config import DISTRIBUTION_PARAMETERS
from ert.mode_definitions import ENSEMBLE_SMOOTHER_MODE
from ert.storage import open_storage

from .run_cli import run_cli_with_pm

# Strategy for short, identifier-like names built from printable ASCII,
# excluding characters that carry special meaning in ert config files.
_name_alphabet = st.characters(
    min_codepoint=ord("!"),
    max_codepoint=ord("~"),
    exclude_characters="\"'$,:%",  # These have specific meaning in configs
)
names = st.text(min_size=1, max_size=8, alphabet=_name_alphabet)


@st.composite
def distribution_values(draw, k, vs):
    """Draw a plausible parameter list for the distribution named *k*.

    *vs* is the ordered list of parameter names the distribution expects
    (e.g. ["MIN", "MAX"]); the return value is a list of drawn values in
    that same order. Interdependent bounds (MIN < MAX, MIN < MEAN < MAX,
    MIN < MODE < MAX) are kept consistent by separating them with a small
    epsilon margin so downstream config validation does not reject them.
    """
    d = {}
    # LOG-type distributions work on a much smaller scale to avoid
    # overflow/underflow when exponentiating.
    biggest = 100.0
    if "LOG" in k:
        biggest = 10.0
    epsilon = biggest / 1000.0
    if "MIN" in vs:
        d["MIN"] = draw(st.floats(min_value=epsilon, max_value=biggest / 10.0))
    if "MAX" in vs:
        # Fall back to `epsilon` when the distribution declares MAX without
        # MIN; indexing d["MIN"] directly would raise KeyError in that case
        # (the MEAN/MODE branches below already use d.get for this reason).
        d["MAX"] = draw(
            st.floats(min_value=d.get("MIN", epsilon) + 5 * epsilon, max_value=biggest)
        )
    if "MEAN" in vs:
        d["MEAN"] = draw(
            st.floats(
                min_value=d.get("MIN", 2 * epsilon) + epsilon,
                max_value=d.get("MAX", biggest) - epsilon,
            )
        )
    if "MODE" in vs:
        d["MODE"] = draw(
            st.floats(
                min_value=d.get("MIN", 2 * epsilon) + epsilon,
                max_value=d.get("MAX", biggest) - epsilon,
            )
        )
    if "STEPS" in vs:
        d["STEPS"] = draw(st.integers(min_value=2, max_value=10))
    # Any parameter without a dedicated rule gets a generic small positive value.
    return [d.get(v, draw(st.floats(min_value=0.1, max_value=1.0))) for v in vs]


# One (name, parameter-values) tuple strategy per distribution known to
# GEN_KW; `distributions` picks among them.
_per_distribution = [
    st.tuples(
        st.just(dist_name),
        distribution_values(dist_name, param_names),
    )
    for dist_name, param_names in DISTRIBUTION_PARAMETERS.items()
]
distributions = st.one_of(_per_distribution)

config_contents = """\
NUM_REALIZATIONS {num_realizations}
QUEUE_SYSTEM LOCAL
QUEUE_OPTION LOCAL MAX_RUNNING {num_realizations}
ENSPATH storage
RANDOM_SEED 1234

OBS_CONFIG observations
GEN_KW COEFFS coeff_priors
GEN_DATA POLY_RES RESULT_FILE:poly.out

INSTALL_JOB poly_eval POLY_EVAL
FORWARD_MODEL poly_eval

ANALYSIS_SET_VAR OBSERVATIONS AUTO_SCALE *
"""

coeff_priors = """\
coeff_0 {distribution0} 0 1
coeff_1 {distribution1} 0 2
coeff_2 {distribution2} 0 5
"""

observation = """
GENERAL_OBSERVATION POLY_OBS_{i} {{
DATA = POLY_RES;
INDEX_FILE = index_{i}.txt;
OBS_FILE = poly_obs_{i}.txt;
}};
"""

poly_eval = """\
#!/usr/bin/env python3
import json
import numpy as np
coeffs = json.load(open("parameters.json"))["COEFFS"]
c = [np.array(coeffs[f"coeff_" + str(i)]) for i in range(len(coeffs))]
with open("poly.out", "w", encoding="utf-8") as f:
f.write("\\n".join(map(str, [np.polyval(c, x) for x in range({num_points})])))
"""

POLY_EVAL = "EXECUTABLE poly_eval.py"


@settings(max_examples=10)
@given(
    num_realizations=st.integers(min_value=20, max_value=40),
    num_points=st.integers(min_value=1, max_value=20),
    distributions=st.lists(distributions, min_size=1, max_size=10),
    data=st.data(),
)
def test_update_lowers_generalized_variance_or_deactives_observations(
    tmp_path_factory: TempPathFactory,
    num_realizations: int,
    num_points: int,
    distributions: list[tuple[str, list[float]]],
    data,
):
    """Property-based end-to-end test of the ensemble smoother CLI.

    For randomly drawn observations, errors and priors, either the run fails
    for one of a small set of known/accepted reasons, or the update must not
    increase the generalized variance (determinant of the parameter
    covariance) from prior to posterior.
    """
    # Observation indices into the polynomial response (unique, so each
    # response point is observed at most once).
    indecies = data.draw(
        st.lists(
            st.integers(min_value=0, max_value=num_points - 1),
            min_size=1,
            max_size=num_points,
            unique=True,
        )
    )
    # Observed values and their standard errors, one per drawn index.
    values = data.draw(
        st.lists(
            st.floats(min_value=-10.0, max_value=10.0),
            min_size=len(indecies),
            max_size=len(indecies),
        )
    )
    errs = data.draw(
        st.lists(
            st.floats(min_value=0.1, max_value=0.5),
            min_size=len(indecies),
            max_size=len(indecies),
        )
    )
    # Observations are split into `num_groups` GENERAL_OBSERVATION entries of
    # `per_group` points each.  NOTE(review): slicing below uses num_points //
    # num_groups over `indecies`, which may be shorter than num_points, so some
    # groups can end up empty — presumably intentional; strict=False in the zip
    # tolerates length mismatches.
    num_groups = data.draw(st.integers(min_value=1, max_value=num_points))
    per_group = num_points // num_groups
    print(num_groups, num_points, per_group)

    tmp_path = tmp_path_factory.mktemp("parameter_example")
    note(f"Running in directory {tmp_path}")
    with MonkeyPatch.context() as patch:
        patch.chdir(tmp_path)
        contents = config_contents.format(
            num_realizations=num_realizations,
        )
        note(f"config file: {contents}")
        Path("config.ert").write_text(contents, encoding="utf-8")
        py = Path("poly_eval.py")
        py.write_text(poly_eval.format(num_points=num_points))
        # Make the forward-model script executable.
        mode = os.stat(py)
        os.chmod(py, mode.st_mode | stat.S_IEXEC)

        # Write one observation config entry plus its value and index files
        # per group.
        for i in range(num_groups):
            print(f"{i * per_group } {(i + 1) * per_group}")
            with open("observations", mode="a", encoding="utf-8") as f:
                f.write(observation.format(i=i))
            Path(f"poly_obs_{i}.txt").write_text(
                "\n".join(
                    f"{x} {y}"
                    for x, y in zip(
                        values[i * per_group : (i + 1) * per_group],
                        errs[i * per_group : (i + 1) * per_group],
                        strict=False,
                    )
                ),
                encoding="utf-8",
            )
            Path(f"index_{i}.txt").write_text(
                "\n".join(
                    f"{x}" for x in indecies[i * per_group : (i + 1) * per_group]
                ),
                encoding="utf-8",
            )

        # One prior line per drawn distribution.
        Path("coeff_priors").write_text(
            "\n".join(
                f"coeff_{i} {d} {' '.join(str(p) for p in v)}"
                for i, (d, v) in enumerate(distributions)
            ),
            encoding="utf-8",
        )
        Path("POLY_EVAL").write_text(POLY_EVAL, encoding="utf-8")

        success = True
        with warnings.catch_warnings(record=True) as all_warnings:
            warnings.simplefilter("always")
            try:
                run_cli_with_pm(
                    [
                        ENSEMBLE_SMOOTHER_MODE,
                        "--disable-monitor",
                        "--experiment-name",
                        "experiment",
                        "config.ert",
                    ]
                )
            except ErtCliError as err:
                success = False
                se = str(err)
                # "No active observations" is expected
                # when std deviation is too low, the
                # other errors are discussed here:
                # https://github.com/equinor/ert/issues/9581
                # https://github.com/equinor/ert/issues/9585
                assert (
                    "No active observations" in se
                    or "math domain error" in se
                    or "math range error" in se
                )

        # Any warning other than "Ill-conditioned matrix" means the run is
        # not trusted enough to assert on the variance property.
        if any("Ill-conditioned matrix" not in str(w.message) for w in all_warnings):
            success = False

        if success:
            with open_storage("storage") as storage:
                experiment = storage.get_experiment_by_name("experiment")
                prior = experiment.get_ensemble_by_name("iter-0").load_all_gen_kw_data()
                posterior = experiment.get_ensemble_by_name(
                    "iter-1"
                ).load_all_gen_kw_data()

                # The determinant of the covariance (generalized variance)
                # must not grow; 0.001 allows for numerical noise.
                assert (
                    np.linalg.det(posterior.cov().to_numpy())
                    <= np.linalg.det(prior.cov().to_numpy()) + 0.001
                )
124 changes: 124 additions & 0 deletions tests/ert/unit_tests/analysis/test_es_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from ert.config.analysis_module import ESSettings, IESSettings
from ert.config.gen_kw_config import TransformFunctionDefinition
from ert.field_utils import Shape
from ert.storage import open_storage


@pytest.fixture
Expand Down Expand Up @@ -204,6 +205,129 @@ def test_update_report_with_different_observation_status_from_smoother_update(
assert data_section.extra["Deactivated observations - outliers"] == str(outliers)


def test_update_handles_precision_loss_in_std_dev(tmp_path):
    """
    This is a regression test for precision loss in calculating
    standard deviation.

    With float32 responses whose first index is constant across
    realizations, a float32 std computation can yield a small non-zero
    value; the fix computes the std in float64. The test asserts that
    exactly one observation is deactivated by the std cutoff.
    """
    # Single CONST parameter; update=True so the smoother has something
    # to update.
    gen_kw = GenKwConfig(
        name="COEFFS",
        forward_init=False,
        update=True,
        template_file=None,
        output_file=None,
        transform_function_definitions=[
            TransformFunctionDefinition(
                name="coeff_0", param_name="CONST", values=["0.1"]
            ),
        ],
    )
    # The values given here are chosen so that when computing
    # `ens_std = S.std(ddof=0, axis=1)`, ens_std[0] is not zero even though
    # all responses have the same value: 5.08078746e07.
    # This is due to precision loss.
    with open_storage(tmp_path, mode="w") as storage:
        experiment = storage.create_experiment(
            name="ensemble_smoother",
            parameters=[gen_kw],
            observations={
                "gen_data": polars.DataFrame(
                    {
                        "response_key": "RES",
                        "observation_key": "OBS",
                        "report_step": polars.Series(np.zeros(3), dtype=polars.UInt16),
                        "index": polars.Series([0, 1, 2], dtype=polars.UInt16),
                        "observations": polars.Series(
                            [-218285263.28648496, -999999999.0, -107098474.0148249],
                            dtype=polars.Float32,
                        ),
                        "std": polars.Series(
                            [559437122.6211826, 999999999.9999999, 1.9],
                            dtype=polars.Float32,
                        ),
                    }
                )
            },
            responses=[
                GenDataConfig(
                    name="gen_data",
                    input_files=["poly.out"],
                    keys=["RES"],
                    has_finalized_keys=True,
                    report_steps_list=[None],
                )
            ],
        )
        prior = storage.create_ensemble(experiment.id, ensemble_size=23, name="prior")
        # Sample the prior parameter for every realization.
        for realization_nr in range(prior.ensemble_size):
            ds = gen_kw.sample_or_load(
                realization_nr,
                random_seed=1234,
                ensemble_size=prior.ensemble_size,
            )
            prior.save_parameters("COEFFS", realization_nr, ds)

        # Realization 0 gets one set of response values ...
        prior.save_response(
            "gen_data",
            polars.DataFrame(
                {
                    "response_key": "RES",
                    "report_step": polars.Series(np.zeros(3), dtype=polars.UInt16),
                    "index": polars.Series(np.arange(3), dtype=polars.UInt16),
                    "values": polars.Series(
                        np.array([5.08078746e07, 4.07677769e10, 2.28002538e12]),
                        dtype=polars.Float32,
                    ),
                }
            ),
            0,
        )
        # ... all other realizations share a second set. The first value
        # (5.08078744e07) rounds to the same float32 as realization 0's
        # 5.08078746e07, so index 0 is effectively constant across the
        # ensemble.
        for i in range(1, prior.ensemble_size):
            prior.save_response(
                "gen_data",
                polars.DataFrame(
                    {
                        "response_key": "RES",
                        "report_step": polars.Series(np.zeros(3), dtype=polars.UInt16),
                        "index": polars.Series(np.arange(3), dtype=polars.UInt16),
                        "values": polars.Series(
                            np.array([5.08078744e07, 4.12422210e09, 1.26490794e10]),
                            dtype=polars.Float32,
                        ),
                    }
                ),
                i,
            )

        posterior = storage.create_ensemble(
            prior.experiment_id,
            ensemble_size=prior.ensemble_size,
            iteration=1,
            name="posterior",
            prior_ensemble=prior,
        )
        events = []

        ss = smoother_update(
            prior,
            posterior,
            experiment.observation_keys,
            ["COEFFS"],
            UpdateSettings(auto_scale_observations=[["OBS*"]]),
            ESSettings(),
            progress_callback=events.append,
        )

        # Exactly the constant-response observation must be deactivated by
        # the std cutoff; with float32 precision loss the std would be
        # non-zero and the observation would (wrongly) stay active.
        assert (
            sum(
                1
                for e in ss.update_step_snapshots
                if e.status == ObservationStatus.STD_CUTOFF
            )
            == 1
        )


@pytest.mark.parametrize(
"module, expected_gen_kw",
[
Expand Down
Loading