Skip to content

Commit

Permalink
Improve validation
Browse files Browse the repository at this point in the history
  • Loading branch information
larsevj committed Oct 9, 2024
1 parent 2ce1d41 commit 6732ce9
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 28 deletions.
68 changes: 50 additions & 18 deletions src/ert/config/design_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional

import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype

Expand Down Expand Up @@ -79,9 +80,19 @@ def read_design_matrix(
"""
Reads out all file content from different files and create dataframes
"""
param_names = pd.read_excel(
self.xls_filename,
sheet_name=self.design_sheet,
nrows=1,
header=None,
dtype=str,
).iloc[0]
if len(param_names) - len(set(param_names)) != 0:
raise ValueError("Duplicate parameter names found in design sheet")
design_matrix_df = DesignMatrix._read_excel(
self.xls_filename, self.design_sheet
)

if "REAL" in design_matrix_df.columns:
if not is_integer_dtype(design_matrix_df.dtypes["REAL"]) or any(
design_matrix_df["REAL"] < 0
Expand All @@ -90,22 +101,18 @@ def read_design_matrix(
design_matrix_df = design_matrix_df.set_index(
"REAL", drop=True, verify_integrity=True
)
try:
DesignMatrix._validate_design_matrix_header(design_matrix_df)
except ValueError as err:
raise ValueError(f"Design matrix not valid, error: {err!s}") from err

# Todo: Check for invalid realizations, drop them maybe?
# This should probably handle/(fill in) missing values in design_matrix_sheet as well? Or maybe not.
if error_list := DesignMatrix._validate_design_matrix(design_matrix_df):
error_msg = "\n".join(error_list)
raise ValueError(f"Design matrix is not valid, error:\n{error_msg}")

defaults = DesignMatrix._read_defaultssheet(
self.xls_filename, self.default_sheet
)
for k, v in defaults.items():
if k not in design_matrix_df.columns:
design_matrix_df[k] = v

# ignoring errors here is deprecated in pandas, should find another solution
# design_matrix_sheet = design_matrix_sheet.apply(pd.to_numeric, errors="ignore")
parameter_configuration: dict[str, ParameterConfig] = {}
transform_function_definitions: list[TransformFunctionDefinition] = []
for parameter in design_matrix_df.columns:
Expand Down Expand Up @@ -156,31 +163,49 @@ def _read_excel(
)
return dframe.dropna(axis=1, how="all")

def _validate_design_matrix(design_matrix: pd.DataFrame) -> list[str]:
    """
    Validate the headers and cells of a user inputted design matrix.

    Problems are collected rather than raised, so the caller can report
    all of them in one go.

    :param design_matrix: the design sheet as a dataframe; column labels are
        checked as headers and index labels are reported as realizations
    :returns: list of error messages, empty if no problems were found
    """
    if design_matrix.empty:
        # Nothing to check. Return an empty list (not None) so the declared
        # return type holds on every path.
        return []

    errors: list[str] = []
    try:
        # Columns labelled "Unnamed: <n>" -- presumably the placeholder
        # pandas assigns to empty header cells; verify against _read_excel.
        unnamed = design_matrix.loc[
            :, design_matrix.columns.str.contains("^Unnamed")
        ]
    except ValueError as err:
        # We catch because int/floats as column headers
        # in xlsx gets read as int/float and is not valid to index by.
        errors.append(f"Invalid value in design matrix header, error: {err!s}")
    else:
        column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()]
        if column_indexes:
            errors.append(f"Column headers not present in column {column_indexes}")

    # Look for initial or trailing whitespace in column headers. This
    # is disallowed as it can create user confusion and has no use-case.
    for col_header in design_matrix:
        # Non-string headers (already reported above) have no .strip();
        # skip them instead of crashing with AttributeError.
        if isinstance(col_header, str) and col_header != col_header.strip():
            errors.append(
                f"Column header '{col_header}' contains initial or trailing whitespace."
            )

    # Report every empty cell by its index label and column label.
    empties = [
        f"Realization {design_matrix.index[i]}, column {design_matrix.columns[j]}"
        for i, j in zip(*np.where(pd.isna(design_matrix)))
    ]
    if empties:
        errors.append(f"Design matrix contains empty cells {empties}")
    return errors

@staticmethod
def _read_defaultssheet(
xlsfilename: Path | str, defaultssheetname: str
) -> dict[str, str]:
) -> dict[str, str | float]:
"""
Construct a dataframe of keys and values to be used as defaults from the
first two columns in a spreadsheet.
Expand All @@ -201,8 +226,15 @@ def _read_defaultssheet(
for paramname in default_df.loc[:, 0]:
if paramname != paramname.strip():
raise ValueError(
f'Parameter name "{paramname}" in default values contains '
f"Parameter name '{paramname}' in default values contains "
"initial or trailing whitespace."
)

return {row[0]: row[1] for _, row in default_df.iterrows()}
return {row[0]: convert_to_numeric(row[1]) for _, row in default_df.iterrows()}


def convert_to_numeric(x: str) -> str | float:
    """
    Return x parsed as a number when pandas can parse it, otherwise x itself.
    """
    try:
        parsed = pd.to_numeric(x)
    except ValueError:
        # Not parseable as a number: hand the input back untouched.
        return x
    else:
        return parsed
72 changes: 62 additions & 10 deletions tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,26 +62,78 @@ def test_reading_design_matrix_validate_reals(tmp_path, real_column, error_msg):
design_matrix.read_design_matrix()


@pytest.mark.parametrize(
    "column_names, error_msg",
    [
        pytest.param(
            ["a", "b", "a"],
            "Duplicate parameter names found in design sheet",
            id="duplicate entries",
        ),
        pytest.param(
            ["a", "b ", ""],
            r"Column headers not present in column \[2\]",
            id="missing entries",
        ),
        pytest.param(
            ["a", "b", 10],
            "Invalid value in design matrix header, error: Cannot mask with non-boolean array containing NA / NaN values",
            id="float entries",
        ),
        pytest.param(
            ["a", "b", " som "],
            r"Column header ' som ' contains initial or trailing whitespace.",
            # Distinct id: this case covers whitespace, not numeric headers.
            id="whitespace entries",
        ),
    ],
)
def test_reading_design_matrix_validate_headers(tmp_path, column_names, error_msg):
    """
    Write a design sheet with the given column headers and check that
    reading it raises a ValueError matching error_msg.
    """
    design_path = tmp_path / "design_matrix.xlsx"
    design_matrix_df = pd.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=column_names
    )
    default_sheet_df = pd.DataFrame([["one", 1], ["b", 4], ["d", 6]])
    with pd.ExcelWriter(design_path) as xl_write:
        design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01")
        default_sheet_df.to_excel(
            xl_write, index=False, sheet_name="DefaultValues", header=False
        )
    design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues")
    with pytest.raises(ValueError, match=error_msg):
        design_matrix.read_design_matrix()


@pytest.mark.parametrize(
    "values, error_msg",
    [
        pytest.param(
            [0, pd.NA, 1],
            r"Design matrix contains empty cells \['Realization 5, column a'\]",
            # This case exercises a missing (pd.NA) cell, not duplicates.
            id="missing value",
        ),
        pytest.param(
            [0, "some", np.nan],
            r"Design matrix contains empty cells \['Realization 7, column a'\]",
            id="invalid float values",
        ),
    ],
)
def test_reading_design_matrix_validate_cells(tmp_path, values, error_msg):
    """
    Write a design sheet whose column "a" holds the given values and check
    that reading it raises a ValueError naming the empty cell.
    """
    design_path = tmp_path / "design_matrix.xlsx"
    design_matrix_df = pd.DataFrame(
        {
            "REAL": [1, 5, 7],
            "a": values,
            "b": [0, 2, 0],
            "c": [3, 1, 3],
        }
    )
    default_sheet_df = pd.DataFrame()
    with pd.ExcelWriter(design_path) as xl_write:
        design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01")
        default_sheet_df.to_excel(
            xl_write, index=False, sheet_name="DefaultValues", header=False
        )
    design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues")
    with pytest.raises(ValueError, match=error_msg):
        design_matrix.read_design_matrix()

0 comments on commit 6732ce9

Please sign in to comment.