From 4686862d8049400b4adf171ef7776a7747a4554e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Therese=20Natter=C3=B8y?= <61694854+tnatt@users.noreply.github.com> Date: Tue, 21 May 2024 22:24:20 +0200 Subject: [PATCH] MAINT: Split out preprocessed logic into separate class --- src/fmu/dataio/__init__.py | 1 + src/fmu/dataio/_metadata.py | 45 +-- src/fmu/dataio/_utils.py | 25 +- src/fmu/dataio/dataio.py | 83 ++--- .../datastructure/_internal/internal.py | 8 +- src/fmu/dataio/preprocessed.py | 265 +++++++++++++++ src/fmu/dataio/providers/_filedata.py | 6 +- src/fmu/dataio/providers/_fmu.py | 2 +- src/fmu/dataio/providers/objectdata/_base.py | 28 -- .../dataio/providers/objectdata/_provider.py | 33 -- .../test_objectdataprovider_class.py | 6 +- tests/test_units/test_preprocessed.py | 317 ++++++++++++++++++ .../test_prerealization_surfaces.py | 112 +++---- 13 files changed, 679 insertions(+), 252 deletions(-) create mode 100644 src/fmu/dataio/preprocessed.py create mode 100644 tests/test_units/test_preprocessed.py diff --git a/src/fmu/dataio/__init__.py b/src/fmu/dataio/__init__.py index 7e9440a29..f16056158 100644 --- a/src/fmu/dataio/__init__.py +++ b/src/fmu/dataio/__init__.py @@ -5,6 +5,7 @@ from fmu.dataio.dataio import ExportData # noqa # type: ignore from fmu.dataio.dataio import InitializeCase # noqa # type: ignore from fmu.dataio.dataio import read_metadata # noqa +from fmu.dataio.preprocessed import ExportPreprocessedData # noqa # type: ignore try: from .version import version diff --git a/src/fmu/dataio/_metadata.py b/src/fmu/dataio/_metadata.py index b805b15c9..1fe05b703 100644 --- a/src/fmu/dataio/_metadata.py +++ b/src/fmu/dataio/_metadata.py @@ -10,19 +10,14 @@ import os import platform from datetime import timezone -from pathlib import Path -from typing import TYPE_CHECKING, Final +from typing import TYPE_CHECKING, Final, Literal from pydantic import AnyHttpUrl, TypeAdapter from . 
import types from ._definitions import SCHEMA, SOURCE, VERSION, FmuContext from ._logging import null_logger -from ._utils import ( - drop_nones, - glue_metadata_preprocessed, - read_metadata_from_file, -) +from ._utils import drop_nones from .datastructure._internal import internal from .datastructure.meta import meta from .exceptions import InvalidMetadataError @@ -38,12 +33,14 @@ logger: Final = null_logger(__name__) -def generate_meta_tracklog() -> list[meta.TracklogEvent]: +def generate_meta_tracklog( + event: Literal["created", "merged"] = "created", +) -> list[meta.TracklogEvent]: """Initialize the tracklog with the 'created' event only.""" return [ meta.TracklogEvent.model_construct( datetime=datetime.datetime.now(timezone.utc), - event="created", + event=event, user=meta.User.model_construct(id=getpass.getuser()), sysinfo=meta.SystemInformation.model_construct( fmu_dataio=meta.VersionInformation.model_construct(version=__version__), @@ -107,14 +104,6 @@ def _get_meta_display(dataio: ExportData, objdata: ObjectDataProvider) -> meta.D return meta.Display(name=dataio.display_name or objdata.name) -def _get_meta_preprocessed_info(dataio: ExportData) -> internal.PreprocessedInfo: - return internal.PreprocessedInfo( - name=dataio.name, - tagname=dataio.tagname, - subfolder=dataio.subfolder, - ) - - def generate_export_metadata( obj: types.Inferrable, dataio: ExportData, @@ -149,12 +138,7 @@ def generate_export_metadata( """ - meta_existing = None - if isinstance(obj, (str, Path)) and dataio._reuse_metadata: - logger.info("Partially reuse existing metadata from %s", obj) - meta_existing = read_metadata_from_file(obj) - - objdata = objectdata_provider_factory(obj, dataio, meta_existing) + objdata = objectdata_provider_factory(obj, dataio) masterdata = dataio.config.get("masterdata") metadata = internal.DataClassMeta( @@ -169,18 +153,7 @@ def generate_export_metadata( file=_get_meta_filedata(dataio, obj, objdata, fmudata, compute_md5), 
tracklog=generate_meta_tracklog(), display=_get_meta_display(dataio, objdata), - preprocessed=( - _get_meta_preprocessed_info(dataio) - if dataio.fmu_context == FmuContext.PREPROCESSED - else None - ), + preprocessed=dataio.fmu_context == FmuContext.PREPROCESSED, ).model_dump(mode="json", exclude_none=True, by_alias=True) - if skip_null: - metadata = drop_nones(metadata) - - return ( - metadata - if not meta_existing - else glue_metadata_preprocessed(oldmeta=meta_existing, newmeta=metadata.copy()) - ) + return metadata if not skip_null else drop_nones(metadata) diff --git a/src/fmu/dataio/_utils.py b/src/fmu/dataio/_utils.py index dd8246522..9a51f9d9b 100644 --- a/src/fmu/dataio/_utils.py +++ b/src/fmu/dataio/_utils.py @@ -6,7 +6,6 @@ import hashlib import json import os -import shutil import uuid from copy import deepcopy from pathlib import Path @@ -114,10 +113,7 @@ def export_file( ) -> str: """Export a valid object to file""" - if isinstance(obj, (Path, str)): - # special case when processing data which already has metadata - shutil.copy(obj, filename) - elif filename.suffix == ".gri" and isinstance(obj, xtgeo.RegularSurface): + if filename.suffix == ".gri" and isinstance(obj, xtgeo.RegularSurface): obj.to_file(filename, fformat="irap_binary") elif filename.suffix == ".csv" and isinstance(obj, (xtgeo.Polygons, xtgeo.Points)): out = obj.copy() # to not modify incoming instance! 
@@ -436,22 +432,3 @@ def read_metadata_from_file(filename: str | Path) -> dict: raise OSError(f"Cannot find requested metafile: {metafile}") with open(metafilepath) as stream: return yaml.safe_load(stream) - - -def glue_metadata_preprocessed( - oldmeta: dict[str, Any], newmeta: dict[str, Any] -) -> dict[str, Any]: - """Glue (combine) to metadata dicts according to rule 'preprocessed'.""" - - meta = oldmeta.copy() - - if "_preprocessed" in meta: - del meta["_preprocessed"] - - meta["fmu"] = newmeta["fmu"] - meta["file"] = newmeta["file"] - - newmeta["tracklog"][-1]["event"] = "merged" - meta["tracklog"].extend(newmeta["tracklog"]) - - return meta diff --git a/src/fmu/dataio/dataio.py b/src/fmu/dataio/dataio.py index d98f30d59..11a1383f0 100644 --- a/src/fmu/dataio/dataio.py +++ b/src/fmu/dataio/dataio.py @@ -32,6 +32,7 @@ from .case import InitializeCase from .datastructure.configuration import global_configuration from .datastructure.meta import enums +from .preprocessed import ExportPreprocessedData from .providers._fmu import FmuProvider, get_fmu_context_from_environment # DATAIO_EXAMPLES: Final = dataio_examples() @@ -54,6 +55,17 @@ # ====================================================================================== +def _future_warning_preprocessed() -> None: + warnings.warn( + "Using the ExportData class for re-exporting preprocessed data is no " + "longer supported. Use the dedicated ExportPreprocessedData class " + "instead. In a deprecation period the ExportPreprocessedData is used " + "under the hood when a filepath is input to ExportData. " + "Please update your script, as this will be discontinued in the future.", + FutureWarning, + ) + + def _validate_variable(key: str, value: type, legals: dict[str, str | type]) -> bool: """Use data from __annotions__ to validate that overriden var. 
is of legal type.""" if key not in legals: @@ -368,7 +380,6 @@ class ExportData: _pwd: Path = field(default_factory=Path, init=False) _config_is_valid: bool = field(default=True, init=False) _fmurun: bool = field(default=False, init=False) - _reuse_metadata: bool = field(default=False, init=False) # Need to store these temporarily in variables until we stop # updating state of the class also on export and generate_metadata @@ -378,9 +389,6 @@ class ExportData: # << NB! storing ACTUAL casepath: _rootpath: Path = field(default_factory=Path, init=False) - # in some cases input object may change class; store the internal variable here: - _object: types.Inferrable = field(init=False) - def __post_init__(self) -> None: logger.info("Running __post_init__ ExportData") logger.debug("Global config is %s", prettyprint_dict(self.config)) @@ -694,37 +702,6 @@ def _establish_rootpath(self) -> Path: logger.info("Running outside FMU context, using pwd as roothpath") return self._pwd - def _check_process_object(self, obj: types.Inferrable) -> None: - """When obj is file-like, it must be checked + assume preprocessed. - - In addition, if preprocessed, derive the name, tagname, subfolder if present and - those are not set already. - - For all cases, tie incoming obj to self._object - """ - - if isinstance(obj, (str, Path)): - obj = Path(obj) - if not obj.exists(): - raise ValidationError(f"The file {obj} does not exist.") - - self._reuse_metadata = True - - currentmeta = read_metadata(obj) - if "_preprocessed" not in currentmeta: - raise ValidationError( - "The special entry for preprocessed data <_preprocessed> is" - "missing in the metadata. A possible solution is to rerun the" - "preprocessed export." 
- ) - preprocessed = currentmeta["_preprocessed"] - - self.name = self.name or preprocessed.get("name", "") - self.tagname = self.tagname or preprocessed.get("tagname", "") - self.subfolder = self.subfolder or preprocessed.get("subfolder", "") - - self._object = obj - def _get_fmu_provider(self) -> FmuProvider: assert isinstance(self.fmu_context, FmuContext) return FmuProvider( @@ -773,16 +750,22 @@ def generate_metadata( self._update_check_settings(kwargs) + if isinstance(obj, (str, Path)): + assert self.casepath is not None + _future_warning_preprocessed() + return ExportPreprocessedData( + config=self.config, + casepath=self.casepath, + is_observation=self.is_observation, + ).generate_metadata(obj) + self._classification = self._get_classification() self._rep_include = self._get_rep_include() - self._check_process_object(obj) # obj --> self._object self._update_fmt_flag() fmudata = self._get_fmu_provider() if self._fmurun else None - # TODO: refactor the argument list for generate_export_metadata; we do not need - # both self._object and self... self._metadata = generate_export_metadata( - self._object, self, fmudata, compute_md5=compute_md5 + obj, self, fmudata, compute_md5=compute_md5 ) logger.info("The metadata are now ready!") @@ -816,18 +799,24 @@ def export( warnings.warn( "The return_symlink option is deprecated and can safely be removed." 
) - self.generate_metadata(obj, compute_md5=True, **kwargs) - metadata = self._metadata - logger.info("Object type is: %s", type(self._object)) # from generate_metadata + if isinstance(obj, (str, Path)): + assert self.casepath is not None + _future_warning_preprocessed() + return ExportPreprocessedData( + config=self.config, + casepath=self.casepath, + is_observation=self.is_observation, + ).export(obj) + + metadata = self.generate_metadata(obj, compute_md5=True, **kwargs) + logger.info("Object type is: %s", type(obj)) outfile = Path(metadata["file"]["absolute_path"]) # create output folders if they don't exist outfile.parent.mkdir(parents=True, exist_ok=True) - metafile = outfile.parent / ("." + str(outfile.name) + ".yml") + metafile = outfile.parent / f".{outfile.name}.yml" - logger.info("Export to file using flag: <%s>", self._usefmtflag) - # md5sum is already present in the metadata - export_file(self._object, outfile, flag=self._usefmtflag) + export_file(obj, outfile, flag=self._usefmtflag) logger.info("Actual file is: %s", outfile) if self._config_is_valid: @@ -836,6 +825,4 @@ def export( else: warnings.warn("Data will be exported, but without metadata.", UserWarning) - self._metadata = metadata - return str(outfile) diff --git a/src/fmu/dataio/datastructure/_internal/internal.py b/src/fmu/dataio/datastructure/_internal/internal.py index 1e5cbdeb9..d1ba950e9 100644 --- a/src/fmu/dataio/datastructure/_internal/internal.py +++ b/src/fmu/dataio/datastructure/_internal/internal.py @@ -114,12 +114,6 @@ class FMUModel(BaseModel): case: meta.FMUCase -class PreprocessedInfo(BaseModel): - name: str - tagname: str - subfolder: str - - class Context(BaseModel, use_enum_values=True): stage: FmuContext @@ -172,7 +166,7 @@ class DataClassMeta(JsonSchemaMetadata): file: meta.File display: meta.Display tracklog: List[meta.TracklogEvent] - preprocessed: Optional[PreprocessedInfo] = Field(alias="_preprocessed") + preprocessed: Optional[bool] = Field(alias="_preprocessed", 
default=None) class CaseSchema(JsonSchemaMetadata): diff --git a/src/fmu/dataio/preprocessed.py b/src/fmu/dataio/preprocessed.py new file mode 100644 index 000000000..2b8b9ad3c --- /dev/null +++ b/src/fmu/dataio/preprocessed.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import shutil +import warnings +from dataclasses import dataclass, field +from pathlib import Path +from typing import Final + +import yaml +from pydantic import ValidationError + +from ._definitions import FmuContext +from ._logging import null_logger +from ._metadata import generate_meta_tracklog +from ._utils import export_metadata_file, md5sum +from .datastructure._internal import internal +from .datastructure.meta import meta +from .exceptions import InvalidMetadataError +from .providers._filedata import ShareFolder +from .providers._fmu import ( + ERT_RELATIVE_CASE_METADATA_FILE, + FmuProvider, + get_fmu_context_from_environment, +) + +logger: Final = null_logger(__name__) + +# ###################################################################################### +# ExportPreprocessedData. +# +# The ExportPreprocessedData is used for exporting preprocessed data that already +# contains metadata, into a FMU run. +# ###################################################################################### + + +@dataclass +class ExportPreprocessedData: + """Export a preprocessed file and its metadata into a FMU run at case level. + + The existing metadata will be validated and three fields will be updated + - The 'fmu' block will be added with information about the existing FMU/ERT run + - The 'file' block will be updated with new file paths. + - The 'tracklog' block will be extended with a new event tagged "merged". + + Note it is important that the preprocessed data have been created upfront with the, + ExportData class using the argument fmu_context='preprocessed'. This ensures + that the file and metadata is stored in the 'share/preprocessed/' folder. 
+ + Args: + config: Required dictionary with 'model' information; 'name' and 'revision'. + Example is {'model': {'name': 'mymodelname', 'revision': '1.0.0'}} + Normally read from FMU global variables (via fmuconfig). + + casepath: Required casepath for the active ERT experiment. The case needs to + contain valid case metadata i.e. the ERT workflow 'WF_CREATE_CASE_METADATA' + has been run prior to using this class. + + is_observation: Default is True. If True, then disk storage will be on the + "casepath/share/observations" folder, otherwise on casepath/share/result. + """ + + config: dict + casepath: str | Path + is_observation: bool = True + + _fmudata: FmuProvider | None = field(default=None) + + def __post_init__(self) -> None: + if get_fmu_context_from_environment() != FmuContext.CASE: + raise RuntimeError( + "Only possible to run re-export of preprocessed data inside FMU " + "using a pre-simulation workflow in ERT." + ) + + self._fmudata = FmuProvider( + model=self.config.get("model"), + fmu_context=FmuContext.CASE, + casepath_proposed=Path(self.casepath), + workflow=None, + ) + + if not (casepath := self._fmudata.get_casepath()): + raise ValueError( + "Could not detect valid case metadata at file location:" + f"{Path(self.casepath) / ERT_RELATIVE_CASE_METADATA_FILE}. Provide an " + "updated casepath. Note, it is required to have run the ERT workflow " + "'WF_CREATE_CASE_METADATA' prior to this export job. See how-to here: " + "https://fmu-dataio.readthedocs.io/en/latest/" + "preparations.html#workflow-for-creating-case-metadata" + ) + + self.casepath = casepath.absolute() + + @staticmethod + def _validate_object(obj: str | Path) -> Path: + """ + Check that the input object is an existing file and convert it + to an absolute path. 
+ """ + if not isinstance(obj, (str, Path)): + raise ValueError("Only file paths are supported as input object") + + objfile = Path(obj).resolve() + if not objfile.exists(): + raise FileNotFoundError(f"The file {obj} does not exist.") + + if ShareFolder.PREPROCESSED not in str(objfile): + raise RuntimeError( + f"Exporting files located outside the '{ShareFolder.PREPROCESSED}' " + "folder is not supported. Please re-export your objects to disk " + "using ExportData(fmu_context='preprocessed')" + ) + return objfile + + @staticmethod + def _read_metadata_file(objmetafile: Path) -> dict | None: + """ + Return a metadata file as a dictionary. If the metadata file + is not present, None will be returned. + """ + if objmetafile.exists(): + with open(objmetafile, encoding="utf-8") as stream: + return yaml.safe_load(stream) + return None + + def _get_relative_export_path(self, existing_path: Path) -> Path: + """Get an updated relative_path from an existing path.""" + + existing_share_folder = ShareFolder.PREPROCESSED.value + existing_subfolders_and_filename = str(existing_path).rsplit( + existing_share_folder, maxsplit=1 + )[-1] + + if self.is_observation: + return ( + Path(ShareFolder.OBSERVATIONS.value) / existing_subfolders_and_filename + ) + return Path(ShareFolder.RESULTS.value) / existing_subfolders_and_filename + + @staticmethod + def _check_md5sum_consistency( + checksum_md5_file: str, checksum_md5_meta: str + ) -> None: + """Check if the md5sum for the file is equal to the one in the metadata""" + if checksum_md5_file != checksum_md5_meta: + warnings.warn( + "The preprocessed file seem to have been modified since it was " + "initially exported. You are adviced to re-create the preprocessed " + "data to prevent mismatch between the file and its metadata." 
+ ) + + def _get_meta_file(self, objfile: Path, checksum_md5: str) -> meta.File: + """Return a meta.File model with updated paths and checksum_md5""" + relative_path = self._get_relative_export_path(existing_path=objfile) + return meta.File( + absolute_path=self.casepath / relative_path, + relative_path=relative_path, + checksum_md5=checksum_md5, + ) + + def _get_updated_metadata(self, meta_existing: dict, objfile: Path) -> dict: + """ + Update the existing metadata with updated fmu/file/tracklog info: + - The 'fmu' block will be added + - The 'file' block will be updated with new paths. + - The 'tracklog' block will be extended with a new event tagged "merged". + + A simple consistency check will be run to detect if the file has been + modified since it was initially exported. + + Subsequently the final metadata is validated against the schema to ensure + it is ready for sumo upload, before it is returned. + """ + assert self._fmudata is not None + + checksum_md5_file = md5sum(objfile) + if checksum_md5_meta := meta_existing["file"].get("checksum_md5"): + self._check_md5sum_consistency(checksum_md5_file, checksum_md5_meta) + + # remove '_preprocessed' key if present and check truthy state of it + if not meta_existing.pop("_preprocessed", False): + raise ValueError( + "Missing entry '_preprocessed' in the metadata. Only files exported " + "with ExportData(fmu_context='preprocessed') is supported. " + "Please re-export your objects to disk." 
+ ) + + meta_existing["fmu"] = self._fmudata.get_metadata() + meta_existing["file"] = self._get_meta_file(objfile, checksum_md5_file) + + # update the tracklog block + tracklog_entry = generate_meta_tracklog(event="merged") + meta_existing["tracklog"].extend(tracklog_entry) + + try: + # TODO: Would like to use meta.Root.model_validate() here + # but then the '$schema' field is dropped from the meta_existing + return internal.DataClassMeta.model_validate(meta_existing).model_dump( + mode="json", exclude_none=True, by_alias=True + ) + except ValidationError as err: + raise InvalidMetadataError( + f"The existing metadata for the preprocessed file {objfile} is " + "outdated. The files will still be copied to the fmu case but no " + "metadata will be made. Please re-export the preprocessed object to " + "disk to ensure the metadata are following the latest data standards. " + f"Detailed information: \n{str(err)}" + ) from err + + # ================================================================================== + # Public methods: + # ================================================================================== + + def generate_metadata(self, obj: str | Path) -> dict: + """Generate updated metadata for the preprocessed data. + + Returns: + A dictionary with all metadata. + """ + + objfile = self._validate_object(obj) + objmetafile = objfile.parent / f".{objfile.name}.yml" + + if meta_existing := self._read_metadata_file(objmetafile): + return self._get_updated_metadata(meta_existing, objfile) + + raise RuntimeError( + f"Could not detect existing metadata with name {objmetafile}" + ) + + def export(self, obj: str | Path) -> str: + """Re-export preprocessed file with updated metadata + + Returns: + Full path of exported object file. 
+ """ + objfile = self._validate_object(obj) + objmetafile = objfile.parent / f".{objfile.name}.yml" + + outfile = self.casepath / self._get_relative_export_path(existing_path=objfile) + outfile.parent.mkdir(parents=True, exist_ok=True) + + # copy existing file to updated path + shutil.copy(objfile, outfile) + logger.info("Copied input file to: %s", outfile) + + if meta_existing := self._read_metadata_file(objmetafile): + try: + meta_updated = self._get_updated_metadata(meta_existing, objfile) + except InvalidMetadataError as err: + warnings.warn(str(err)) + else: + # store metafile to updated path + metafile = outfile.parent / f".{outfile.name}.yml" + export_metadata_file( + file=metafile, metadata=meta_updated, savefmt="yaml" + ) + logger.info("Updated metadata file is: %s", metafile) + else: + warnings.warn( + f"Could not detect existing metadata with name {objmetafile}. " + f"Input file will be copied to {outfile}, but without metadata." + ) + + return str(outfile) diff --git a/src/fmu/dataio/providers/_filedata.py b/src/fmu/dataio/providers/_filedata.py index 2f62ab580..37d88582a 100644 --- a/src/fmu/dataio/providers/_filedata.py +++ b/src/fmu/dataio/providers/_filedata.py @@ -30,9 +30,9 @@ class ShareFolder(str, Enum): - PREPROCESSED = "share/preprocessed" - OBSERVATIONS = "share/observations" - RESULTS = "share/results" + PREPROCESSED = "share/preprocessed/" + OBSERVATIONS = "share/observations/" + RESULTS = "share/results/" @dataclass diff --git a/src/fmu/dataio/providers/_fmu.py b/src/fmu/dataio/providers/_fmu.py index 6dfe24a77..0eabda6e1 100644 --- a/src/fmu/dataio/providers/_fmu.py +++ b/src/fmu/dataio/providers/_fmu.py @@ -107,7 +107,7 @@ class FmuProvider(Provider): model: dict | None = None fmu_context: FmuContext = FmuContext.REALIZATION - include_ertjobs: bool = True + include_ertjobs: bool = False casepath_proposed: Optional[Path] = None workflow: Optional[Union[str, dict[str, str]]] = None diff --git a/src/fmu/dataio/providers/objectdata/_base.py 
b/src/fmu/dataio/providers/objectdata/_base.py index 7079257ce..f30f2c281 100644 --- a/src/fmu/dataio/providers/objectdata/_base.py +++ b/src/fmu/dataio/providers/objectdata/_base.py @@ -4,7 +4,6 @@ from copy import deepcopy from dataclasses import dataclass, field from datetime import datetime -from pathlib import Path from typing import TYPE_CHECKING, Any, Final, TypeVar from warnings import warn @@ -345,30 +344,3 @@ def _validate_get_ext(fmt: str, subtype: str, validator: dict[str, V]) -> V: f"The file format {fmt} is not supported. ", f"Valid {subtype} formats are: {list(validator.keys())}", ) - - @classmethod - def from_metadata_dict( - cls, obj: Inferrable, dataio: ExportData, meta_existing: dict - ) -> ObjectDataProvider: - """Instantiate from existing metadata.""" - - relpath = Path(meta_existing["file"]["relative_path"]) - - time0, time1 = None, None - if "time" in meta_existing["data"]: - time0, time1 = get_timedata_from_existing(meta_existing["data"]["time"]) - - return cls( - obj=obj, - dataio=dataio, - metadata=meta_existing["data"], - name=meta_existing["data"]["name"], - classname=meta_existing["class"], - efolder=( - relpath.parent.parent.name if dataio.subfolder else relpath.parent.name - ), - extension=relpath.suffix, - fmt=meta_existing["data"]["format"], - time0=time0, - time1=time1, - ) diff --git a/src/fmu/dataio/providers/objectdata/_provider.py b/src/fmu/dataio/providers/objectdata/_provider.py index 8d6286e34..16376159e 100644 --- a/src/fmu/dataio/providers/objectdata/_provider.py +++ b/src/fmu/dataio/providers/objectdata/_provider.py @@ -132,8 +132,6 @@ def objectdata_provider_factory( NotImplementedError: when receiving an object we don't know how to generated metadata for. 
""" - if meta_existing: - return ExistingDataProvider.from_metadata_dict(obj, dataio, meta_existing) if isinstance(obj, xtgeo.RegularSurface): return RegularSurfaceDataProvider(obj=obj, dataio=dataio) if isinstance(obj, xtgeo.Polygons): @@ -161,37 +159,6 @@ def objectdata_provider_factory( raise NotImplementedError(f"This data type is not currently supported: {type(obj)}") -@dataclass -class ExistingDataProvider(ObjectDataProvider): - """These getters should never be called because metadata was derived a priori.""" - - obj: Inferrable - - def get_spec(self) -> None: - """Derive data.spec from existing metadata.""" - - def get_bbox(self) -> None: - """Derive data.bbox from existing metadata.""" - - def get_objectdata(self) -> DerivedObjectDescriptor: - """Derive object data for existing metadata.""" - return DerivedObjectDescriptor( - subtype=self.metadata["subtype"], - classname=self.metadata["class"], - layout=self.metadata["layout"], - efolder=self.efolder, - fmt=self.fmt, - extension=self.extension, - spec=self.metadata["spec"], - bbox=self.metadata["bbox"], - table_index=None, - ) - - def derive_metadata(self) -> None: - """Metadata has already been derived for this provider, and is already set from - instantiation, so override this method and do nothing.""" - - @dataclass class DictionaryDataProvider(ObjectDataProvider): obj: dict diff --git a/tests/test_units/test_objectdataprovider_class.py b/tests/test_units/test_objectdataprovider_class.py index be1788e26..a8b7b8762 100644 --- a/tests/test_units/test_objectdataprovider_class.py +++ b/tests/test_units/test_objectdataprovider_class.py @@ -3,8 +3,8 @@ import os from datetime import datetime +import fmu.dataio as dataio import pytest -from fmu.dataio import dataio from fmu.dataio._definitions import ConfigurationError, ValidFormats from fmu.dataio.providers.objectdata._base import ( get_timedata_from_existing, @@ -163,10 +163,8 @@ def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath): 
os.chdir(fmurun_prehook) casepath = fmurun_prehook - edata = dataio.ExportData( + edata = dataio.ExportPreprocessedData( config=rmsglobalconfig, - fmu_context="case", - content=None, is_observation=True, casepath=casepath, ) diff --git a/tests/test_units/test_preprocessed.py b/tests/test_units/test_preprocessed.py new file mode 100644 index 000000000..94f60bd22 --- /dev/null +++ b/tests/test_units/test_preprocessed.py @@ -0,0 +1,317 @@ +"""Test the dataio re-export of preprocessed data through ExportDataPreprocessed.""" + +import logging +from pathlib import Path + +import fmu.dataio as dataio +import pytest +import yaml +from fmu.dataio import _utils as utils +from fmu.dataio.exceptions import InvalidMetadataError +from fmu.dataio.providers._fmu import ERT_RELATIVE_CASE_METADATA_FILE + +from ..conftest import remove_ert_env, set_ert_env_forward, set_ert_env_prehook + +logger = logging.getLogger(__name__) + +PREPROCESSED_SURFACEPATH = "share/preprocessed/maps/topvolantis--20240802_20200909.gri" + + +def read_metadata(objmetafile): + with open(objmetafile, encoding="utf-8") as stream: + return yaml.safe_load(stream) + + +def export_preprocessed_surface(config, regsurf): + edata = dataio.ExportData( + config=config, + fmu_context="preprocessed", + name="TopVolantis", + content="depth", + timedata=[[20240802, "moni"], [20200909, "base"]], + ) + surfacepath = Path(edata.export(regsurf)) + metafile = surfacepath.parent / f".{surfacepath.name}.yml" + return surfacepath, metafile + + +def test_export_preprocessed_surfacefile( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """ + Test re-exporting a preprocessed surface in a fmu run, and check that the + existing metadata is updated with fmu/file/tracklog information and + the _preprocessed flag is removed. 
+ """ + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # run the re-export of the preprocessed data inside an mocked FMU run + set_ert_env_prehook(monkeypatch) + edata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ) + # generate the updated metadata + metadata = edata.generate_metadata(surfacepath) + + # check that _preprocessed is removed + assert "_preprocessed" not in metadata + + # check that the fmu block is added + assert "fmu" in metadata + assert metadata["fmu"]["context"]["stage"] == "case" + assert "realization" not in metadata["fmu"] + + # check that the file paths are updated. The relative_path should be + # equal to the initial export except for the share folder + relative_path = PREPROCESSED_SURFACEPATH.replace("preprocessed", "observations") + absolute_path = fmurun_prehook / relative_path + assert metadata["file"]["relative_path"] == relative_path + assert metadata["file"]["absolute_path"] == str(absolute_path) + + # check that the tracklog contains two events and the last is a "merged" event + assert len(metadata["tracklog"]) == 2 + assert "merged" in metadata["tracklog"][-1]["event"] + + # check that for all other keys the new metadata is equal to the existing + existing_meta = read_metadata(metafile) + for key, value in existing_meta.items(): + if key not in ["fmu", "file", "tracklog", "_preprocessed"]: + assert metadata[key] == value + + # do the actual export and check that both files exists + edata.export(surfacepath) + metafile = absolute_path.parent / f".{absolute_path.name}.yml" + assert absolute_path.exists() + assert metafile.exists() + + +def test_export_to_results_folder( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """ + Test re-exporting a preprocessed surface in a fmu run, and see that it works + storing to the case/share/results 
folder + """ + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # run the re-export of the preprocessed data inside an mocked FMU run + set_ert_env_prehook(monkeypatch) + edata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=False, casepath=fmurun_prehook + ) + + # check that the export has been to the case/share/results folder + relative_path = PREPROCESSED_SURFACEPATH.replace("preprocessed", "results") + + filepath = Path(edata.export(surfacepath)) + assert filepath == fmurun_prehook / relative_path + + metafile = filepath.parent / f".{filepath.name}.yml" + assert metafile.exists() + + +def test_preprocessed_field_removed( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """Test that if present the _preprocessed field is removed from the metadata""" + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # modify existing metadata file to include legacy '_preprocesssed' + metadata = read_metadata(metafile) + metadata["_preprocessed"] = True + utils.export_metadata_file(file=metafile, metadata=metadata, savefmt="yaml") + + # run the re-export of the preprocessed data inside an mocked FMU run + set_ert_env_prehook(monkeypatch) + metadata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ).generate_metadata(surfacepath) + + # check that the "_preprocessed" field is not present + assert "_preprocessed" not in metadata + + +def test_outdated_metadata(fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch): + """ + Test that a warning is given when trying to re-export preprocessed data + and the existing metadata is not according to the latest data standard. + Also test that if using generate_metadata directly an error is raised. 
+ """ + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # modify existing metadata file to make it 'outdated' + metadata = read_metadata(metafile) + del metadata["data"] # pretend data was not required before + utils.export_metadata_file(file=metafile, metadata=metadata, savefmt="yaml") + + # run the re-export of the preprocessed data inside an mocked FMU run + set_ert_env_prehook(monkeypatch) + + edata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ) + # error should be raised when trying to use the generate_metadata function + with pytest.raises(InvalidMetadataError, match="outdated"): + edata.generate_metadata(surfacepath) + + # warning should be printed when trying to use the export function + with pytest.warns(UserWarning, match="outdated"): + edata.export(surfacepath) + + +def test_export_without_existing_meta( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """ + Test that a warning is raised if metadata is not existing for a file + and that the file is copied anyway + """ + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # run the re-export of the preprocessed data inside an mocked FMU run + set_ert_env_prehook(monkeypatch) + + # delete the metafile + metafile.unlink() + edata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ) + # test that error is raised when creating metadata + with pytest.raises(RuntimeError, match="Could not detect existing metadata"): + edata.generate_metadata(surfacepath) + + # test that warning is issued when doing an export + with pytest.warns(UserWarning, match="Could not detect existing metadata"): + filepath = edata.export(surfacepath) + + # check that 
the file has been copied into the fmu case path + assert Path(filepath).exists() + assert filepath.startswith(str(fmurun_prehook)) + + +def test_preprocessed_surface_modified_post_export( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """ + Test that a warning is raised if the md5sum for the file does not match + the 'file.checksum_md5' in the existing metadata + """ + # mock being outside of FMU and export preprocessed surface + remove_ert_env(monkeypatch) + surfacepath, metafile = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # modify existing metadata file to make the md5sum inconsistent + metadata = read_metadata(metafile) + metadata["file"]["checksum_md5"] = "dummy_modified" + utils.export_metadata_file(file=metafile, metadata=metadata, savefmt="yaml") + + # run the re-export of the preprocessed data inside a mocked FMU run + set_ert_env_prehook(monkeypatch) + + # should issue warning + with pytest.warns(UserWarning, match="seem to have been modified"): + dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ).export(surfacepath) + + +def test_preprocessed_surface_fmucontext_not_case(rmsglobalconfig, monkeypatch): + """ + Test that an error is raised if ExportPreprocessedData is used + in other fmu_context than 'case' + """ + + # error should be raised when outside of FMU + with pytest.raises(RuntimeError, match="Only possible to run re-export"): + dataio.ExportPreprocessedData(config=rmsglobalconfig, casepath="dummy") + + # error should be raised when running on forward_model in FMU + set_ert_env_forward(monkeypatch) + with pytest.raises(RuntimeError, match="Only possible to run re-export"): + dataio.ExportPreprocessedData(config=rmsglobalconfig, casepath="dummy") + + +def test_preprocessed_surface_invalid_casepath(fmurun_prehook, rmsglobalconfig): + """Test that an error is raised if casepath is wrong or no case meta exist""" + + # error should be raised when running on a casepath
without case metadata + with pytest.raises(ValueError, match="Could not detect valid case metadata"): + dataio.ExportPreprocessedData(config=rmsglobalconfig, casepath="dummy") + + # shall work when casepath that contains case metadata is provided + dataio.ExportPreprocessedData(config=rmsglobalconfig, casepath=fmurun_prehook) + + # delete the case metadata and see that it fails + metacase_file = fmurun_prehook / ERT_RELATIVE_CASE_METADATA_FILE + metacase_file.unlink() + with pytest.raises(ValueError, match="Could not detect valid case metadata"): + dataio.ExportPreprocessedData(config=rmsglobalconfig, casepath=fmurun_prehook) + + +def test_export_non_preprocessed_data( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """Test that if not exported with fmu_context='preprocessed' error is raised""" + # mock being outside of FMU + remove_ert_env(monkeypatch) + surfacepath = dataio.ExportData( + config=rmsglobalconfig, + fmu_context=None, + name="TopVolantis", + content="depth", + ).export(regsurf) + + assert "share/results" in surfacepath + + # mock being inside of FMU + set_ert_env_prehook(monkeypatch) + + # check that the error is given + with pytest.raises(RuntimeError, match="is not supported"): + dataio.ExportPreprocessedData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ).generate_metadata(surfacepath) + + +def test_export_preprocessed_file_exportdata_futurewarning( + fmurun_prehook, rmsglobalconfig, regsurf, monkeypatch +): + """ + Test that using the ExportData class to export preprocessed files + still works (uses ExportPreprocessedData behind the scene) and + a future warning is issued.
+ """ + # mock being outside of FMU + remove_ert_env(monkeypatch) + surfacepath, _ = export_preprocessed_surface(rmsglobalconfig, regsurf) + + # mock being inside of FMU + set_ert_env_prehook(monkeypatch) + + # Use the ExportData class instead of the ExportPreprocessedData + edata = dataio.ExportData( + config=rmsglobalconfig, is_observation=True, casepath=fmurun_prehook + ) + + with pytest.warns(FutureWarning, match="no longer supported"): + meta = edata.generate_metadata(surfacepath) + + assert "fmu" in meta + assert "merged" in meta["tracklog"][-1]["event"] + + with pytest.warns(FutureWarning, match="no longer supported"): + filepath = Path(edata.export(surfacepath)) + + assert filepath.exists() + metafile = filepath.parent / f".{filepath.name}.yml" + assert metafile.exists() diff --git a/tests/test_units/test_prerealization_surfaces.py b/tests/test_units/test_prerealization_surfaces.py index 233e08eca..38c6ec2e4 100644 --- a/tests/test_units/test_prerealization_surfaces.py +++ b/tests/test_units/test_prerealization_surfaces.py @@ -13,7 +13,7 @@ import logging import os -import fmu.dataio.dataio as dataio +import fmu.dataio as dataio import pytest from fmu.dataio import _utils as utils @@ -102,10 +102,8 @@ def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath): casepath = fmurun_prehook - edata = dataio.ExportData( + edata = dataio.ExportPreprocessedData( config=rmsglobalconfig, # read from global config - fmu_context="case", - content=None, # shall be accepted without warning here in this context is_observation=True, casepath=casepath, ) @@ -143,45 +141,45 @@ def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath): @pytest.mark.parametrize( - "name_pre, tagname_pre, name_merge, tagname_merge, exproot1, exproot2", + "parent, name, tagname, exproot", [ - ("myname", "", "", "", "myname", "myname"), - ("myname", "", "newname", "", "myname", "newname"), - ("myname", "mytag", "", "", "myname--mytag", "myname--mytag"), - ("myname", "mytag", "newname", 
"newtag", "myname--mytag", "newname--newtag"), - ("myname", "", "", "newtag", "myname", "myname--newtag"), + ("", "myname", "", "myname"), + ("parent", "myname", "", "parent--myname"), + ("parent", "myname", "mytag", "parent--myname--mytag"), + ("", "myname", "mytag", "myname--mytag"), ], ids=[ - "use both preprocessed name, no tagname", - "change name, no tagname", - "keep initial name and tagname", - "change both name and tagname", - "keep names, not initial tagname, but a merged tagname", + "only name", + "parent and name", + "parent, name and tagname", + "name and tagname", ], ) -def test_regsurf_preprocessed_obs_vary_name_tagname( +def test_regsurf_preprocessed_filename_retained( fmurun_prehook, rmssetup, rmsglobalconfig, regsurf, - name_pre, - tagname_pre, - name_merge, - tagname_merge, - exproot1, - exproot2, + parent, + name, + tagname, + exproot, monkeypatch, ): - """Check that current name and/or tagname are propegated or updated.""" + """ + Check that current name and/or tagname are propegated and + retained when re-exporting preprocessed data. 
+ """ @inside_rms def _export_data_from_rms( rmssetup, rmsglobalconfig, regsurf, - name_pre, - tagname_pre, - exproot1, + parent, + name, + tagname, + exproot, ): """Run an export of a preprocessed surface inside RMS.""" logger.info("Active folder is %s", rmssetup) @@ -191,10 +189,11 @@ def _export_data_from_rms( config=rmsglobalconfig, # read from global config fmu_context="preprocessed", content="depth", + parent=parent, timedata=[[20240802, "moni"], [20200909, "base"]], is_observation=True, - name=name_pre, - tagname=tagname_pre, + name=name, + tagname=tagname, ) metadata = edata.generate_metadata(regsurf) @@ -202,7 +201,7 @@ def _export_data_from_rms( dates = "20240802_20200909" assert ( metadata["file"]["relative_path"] - == f"share/preprocessed/maps/{exproot1}--{dates}.gri" + == f"share/preprocessed/maps/{exproot}--{dates}.gri" ) return edata.export(regsurf) @@ -211,42 +210,34 @@ def _run_case_fmu( fmurun_prehook, rmsglobalconfig, surfacepath, - name_merge, - tagname_merge, - exproot2, + exproot, ): """Run FMU workflow, using the preprocessed data on a subfolder.""" os.chdir(fmurun_prehook) logger.info("Active folder is %s", fmurun_prehook) - edata = dataio.ExportData( + edata = dataio.ExportPreprocessedData( config=rmsglobalconfig, # read from global config - fmu_context="case", - content="depth", - casepath=fmurun_prehook, is_observation=True, - name=name_merge, - tagname=tagname_merge, + casepath=fmurun_prehook, ) prefix = "share/observations/maps" dates = "20240802_20200909" metadata = edata.generate_metadata(surfacepath) - assert metadata["file"]["relative_path"] == f"{prefix}/{exproot2}--{dates}.gri" + assert metadata["file"]["relative_path"] == f"{prefix}/{exproot}--{dates}.gri" remove_ert_env(monkeypatch) mysurf = _export_data_from_rms( - rmssetup, rmsglobalconfig, regsurf, name_pre, tagname_pre, exproot1 + rmssetup, rmsglobalconfig, regsurf, parent, name, tagname, exproot ) set_ert_env_prehook(monkeypatch) _run_case_fmu( fmurun_prehook, 
rmsglobalconfig, mysurf, - name_merge, - tagname_merge, - exproot2, + exproot, ) @@ -287,32 +278,21 @@ def _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf): return edata.export(regsurf) - def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath, subf=None): + def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath): """Run FMU workflow, using the preprocessed data on a subfolder.""" os.chdir(fmurun_prehook) logger.info("Active folder is %s", fmurun_prehook) - edata = dataio.ExportData( - config=rmsglobalconfig, # read from global config - fmu_context="case", - casepath=fmurun_prehook, - content="depth", - name="pre_v3", - is_observation=True, + edata = dataio.ExportPreprocessedData( + config=rmsglobalconfig, casepath=fmurun_prehook, is_observation=True + ) + metadata = edata.generate_metadata(surfacepath) + # check that the relative path is identical to existing except the share folder + assert ( + metadata["file"]["relative_path"] + == "share/observations/maps/mysub/preprocessedmap--20240802_20200909.gri" ) - if subf is not None: - metadata = edata.generate_metadata(surfacepath, subfolder=subf) - assert ( - metadata["file"]["relative_path"] - == f"share/observations/maps/{subf}/pre_v3--20240802_20200909.gri" - ) - else: - metadata = edata.generate_metadata(surfacepath) - assert ( - metadata["file"]["relative_path"] - == "share/observations/maps/mysub/pre_v3--20240802_20200909.gri" - ) assert "merged" in metadata["tracklog"][-1]["event"] # run two stage process @@ -321,7 +301,6 @@ def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath, subf=None): set_ert_env_prehook(monkeypatch) _run_case_fmu(fmurun_prehook, rmsglobalconfig, mysurf) - _run_case_fmu(fmurun_prehook, rmsglobalconfig, mysurf, subf="xxxx") @inside_rms @@ -406,12 +385,9 @@ def _run_case_fmu(fmurun_prehook, rmsglobalconfig, surfacepath): os.chdir(fmurun_prehook) logger.info("Active folder is %s", fmurun_prehook) - edata = dataio.ExportData( + edata = 
dataio.ExportPreprocessedData( config=rmsglobalconfig, - fmu_context="case", casepath=fmurun_prehook, - content="depth", - name="MyName", ) metadata = edata.generate_metadata(surfacepath)