diff --git a/src/ert/config/design_matrix.py b/src/ert/config/design_matrix.py index e353e8efc32..2a919427c0c 100644 --- a/src/ert/config/design_matrix.py +++ b/src/ert/config/design_matrix.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, List, Optional +import numpy as np import pandas as pd from pandas.api.types import is_integer_dtype @@ -79,9 +80,19 @@ def read_design_matrix( """ Reads out all file content from different files and create dataframes """ + param_names = pd.read_excel( + self.xls_filename, + sheet_name=self.design_sheet, + nrows=1, + header=None, + dtype=str, + ).iloc[0] + if len(param_names) - len(set(param_names)) != 0: + raise ValueError("Duplicate parameter names found in design sheet") design_matrix_df = DesignMatrix._read_excel( self.xls_filename, self.design_sheet ) + if "REAL" in design_matrix_df.columns: if not is_integer_dtype(design_matrix_df.dtypes["REAL"]) or any( design_matrix_df["REAL"] < 0 @@ -90,13 +101,11 @@ def read_design_matrix( design_matrix_df = design_matrix_df.set_index( "REAL", drop=True, verify_integrity=True ) - try: - DesignMatrix._validate_design_matrix_header(design_matrix_df) - except ValueError as err: - raise ValueError(f"Design matrix not valid, error: {err!s}") from err - # Todo: Check for invalid realizations, drop them maybe? - # This should probably handle/(fill in) missing values in design_matrix_sheet as well? Or maybe not. + if error_list := DesignMatrix._validate_design_matrix(design_matrix_df): + error_msg = "\n".join(error_list) + raise ValueError(f"Design matrix is not valid, error:\n{error_msg}") + defaults = DesignMatrix._read_defaultssheet( self.xls_filename, self.default_sheet ) @@ -104,8 +113,6 @@ def read_design_matrix( if k not in design_matrix_df.columns: design_matrix_df[k] = v - # ignoring errors here is deprecated in pandas, should find another solution - # design_matrix_sheet = design_matrix_sheet.apply(pd.to_numeric, errors="ignore") parameter_configuration: dict[str, ParameterConfig] = {} transform_function_definitions: list[TransformFunctionDefinition] = [] for parameter in design_matrix_df.columns: @@ -156,13 +163,14 @@ def _read_excel( ) return dframe.dropna(axis=1, how="all") - def _validate_design_matrix_header(design_matrix: pd.DataFrame) -> None: + def _validate_design_matrix(design_matrix: pd.DataFrame) -> list[str]: """ Validate header in user inputted design matrix :raises: ValueError if design matrix contains empty headers """ if design_matrix.empty: return + errors = [] try: unnamed = design_matrix.loc[ :, design_matrix.columns.str.contains("^Unnamed") @@ -170,17 +178,34 @@ def _validate_design_matrix_header(design_matrix: pd.DataFrame) -> None: except ValueError as err: # We catch because int/floats as column headers # in xlsx gets read as int/float and is not valid to index by. - raise ValueError( - f"Invalid value in design matrix header, error: {err !s}" - ) from err - column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()] - if len(column_indexes) > 0: - raise ValueError(f"Column headers not present in column {column_indexes}") + errors.append(f"Invalid value in design matrix header, error: {err !s}") + else: + column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()] + if len(column_indexes) > 0: + errors.append(f"Column headers not present in column {column_indexes}") + + # Look for initial or trailing whitespace in column headers. This + # is disallowed as it can create user confusion and has no use-case. + for col_header in design_matrix: + if col_header != col_header.strip(): + errors.append( + ( + f"Column header '{col_header}' contains initial or trailing whitespace." + ) + ) + + empties = [ + f"Realization {design_matrix.index[i]}, column {design_matrix.columns[j]}" + for i, j in zip(*np.where(pd.isna(design_matrix))) + ] + if len(empties) > 0: + errors.append(f"Design matrix contains empty cells {empties}") + return errors @staticmethod def _read_defaultssheet( xlsfilename: Path | str, defaultssheetname: str - ) -> dict[str, str]: + ) -> dict[str, str | float]: """ Construct a dataframe of keys and values to be used as defaults from the first two columns in a spreadsheet. @@ -201,8 +226,15 @@ def _read_defaultssheet( for paramname in default_df.loc[:, 0]: if paramname != paramname.strip(): raise ValueError( - f'Parameter name "{paramname}" in default values contains ' + f"Parameter name '{paramname}' in default values contains " "initial or trailing whitespace." ) - return {row[0]: row[1] for _, row in default_df.iterrows()} + return {row[0]: convert_to_numeric(row[1]) for _, row in default_df.iterrows()} + + +def convert_to_numeric(x: str) -> str | float: + try: + return pd.to_numeric(x) + except ValueError: + return x diff --git a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py index 145a3d43220..a5c698d25f6 100644 --- a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py +++ b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py @@ -62,26 +62,78 @@ def test_reading_design_matrix_validate_reals(tmp_path, real_column, error_msg): design_matrix.read_design_matrix() -def test_reading_design_matrix_duplicate_columns(tmp_path): +@pytest.mark.parametrize( + "column_names, error_msg", + [ + pytest.param( + ["a", "b", "a"], + "Duplicate parameter names found in design sheet", + id="duplicate entries", + ), + pytest.param( + ["a", "b ", ""], + r"Column headers not present in column \[2\]", + id="missing entries", + ), + pytest.param( + ["a", "b", 10], + "Invalid value in design matrix header, error: Cannot mask with non-boolean array containing NA / NaN values", + id="float entries", + ), + pytest.param( + ["a", "b", " som "], + r"Column header ' som ' contains initial or trailing whitespace.", + id="float entries", + ), + ], +) +def test_reading_design_matrix_validate_headers(tmp_path, column_names, error_msg): + design_path = tmp_path / "design_matrix.xlsx" + design_matrix_df = pd.DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=column_names + ) + default_sheet_df = pd.DataFrame([["one", 1], ["b", 4], ["d", 6]]) + with pd.ExcelWriter(design_path) as xl_write: + design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01") + default_sheet_df.to_excel( + xl_write, index=False, sheet_name="DefaultValues", header=False + ) + design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues") + with pytest.raises(ValueError, match=error_msg): + design_matrix.read_design_matrix() + + +@pytest.mark.parametrize( + "values, error_msg", + [ + pytest.param( + [0, pd.NA, 1], + r"Design matrix contains empty cells \['Realization 5, column a'\]", + id="duplicate entries", + ), + pytest.param( + [0, "some", np.nan], + r"Design matrix contains empty cells \['Realization 7, column a'\]", + id="invalid float values", + ), + ], +) +def test_reading_design_matrix_validate_cells(tmp_path, values, error_msg): design_path = tmp_path / "design_matrix.xlsx" design_matrix_df = pd.DataFrame( { - "REAL": [0, 1, -4], - "a": [1, 2, 3], + "REAL": [1, 5, 7], + "a": values, "b": [0, 2, 0], "c": [3, 1, 3], - "0": ["a", 2, "c"], } ) - design_matrix_df = pd.DataFrame( - np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "a"] - ) - default_sheet_df = pd.DataFrame([["one", 1], ["b", 4], ["d", 6]]) + default_sheet_df = pd.DataFrame() with pd.ExcelWriter(design_path) as xl_write: design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01") default_sheet_df.to_excel( xl_write, index=False, sheet_name="DefaultValues", header=False ) design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues") - design_matrix.read_design_matrix() - print("\n The design matrix:\n", design_matrix.design_matrix_df) + with pytest.raises(ValueError, match=error_msg): + design_matrix.read_design_matrix()