Skip to content

Commit

Permalink
MAINT: Split out preprocessed logic into separate class (#660)
Browse files Browse the repository at this point in the history
  • Loading branch information
tnatt authored Jun 10, 2024
1 parent 0c2d040 commit 40a0573
Show file tree
Hide file tree
Showing 13 changed files with 666 additions and 252 deletions.
1 change: 1 addition & 0 deletions src/fmu/dataio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fmu.dataio.dataio import ExportData # noqa # type: ignore
from fmu.dataio.dataio import InitializeCase # noqa # type: ignore
from fmu.dataio.dataio import read_metadata # noqa
from fmu.dataio.preprocessed import ExportPreprocessedData # noqa # type: ignore

try:
from .version import version
Expand Down
45 changes: 9 additions & 36 deletions src/fmu/dataio/_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,14 @@
import os
import platform
from datetime import timezone
from pathlib import Path
from typing import TYPE_CHECKING, Final
from typing import TYPE_CHECKING, Final, Literal

from pydantic import AnyHttpUrl, TypeAdapter

from . import types
from ._definitions import SCHEMA, SOURCE, VERSION, FmuContext
from ._logging import null_logger
from ._utils import (
drop_nones,
glue_metadata_preprocessed,
read_metadata_from_file,
)
from ._utils import drop_nones
from .datastructure._internal import internal
from .datastructure.meta import meta
from .exceptions import InvalidMetadataError
Expand All @@ -38,12 +33,14 @@
logger: Final = null_logger(__name__)


def generate_meta_tracklog() -> list[meta.TracklogEvent]:
def generate_meta_tracklog(
event: Literal["created", "merged"] = "created",
) -> list[meta.TracklogEvent]:
"""Initialize the tracklog with the 'created' event only."""
return [
meta.TracklogEvent.model_construct(
datetime=datetime.datetime.now(timezone.utc),
event="created",
event=event,
user=meta.User.model_construct(id=getpass.getuser()),
sysinfo=meta.SystemInformation.model_construct(
fmu_dataio=meta.VersionInformation.model_construct(version=__version__),
Expand Down Expand Up @@ -107,14 +104,6 @@ def _get_meta_display(dataio: ExportData, objdata: ObjectDataProvider) -> meta.D
return meta.Display(name=dataio.display_name or objdata.name)


def _get_meta_preprocessed_info(dataio: ExportData) -> internal.PreprocessedInfo:
    """Build the '_preprocessed' info block from the dataio instance.

    Carries name/tagname/subfolder so a later re-export of the
    preprocessed file can recover them.
    """
    # Collect the relevant attributes first, then construct the model once.
    info = {
        "name": dataio.name,
        "tagname": dataio.tagname,
        "subfolder": dataio.subfolder,
    }
    return internal.PreprocessedInfo(**info)


def generate_export_metadata(
obj: types.Inferrable,
dataio: ExportData,
Expand Down Expand Up @@ -149,12 +138,7 @@ def generate_export_metadata(
"""

meta_existing = None
if isinstance(obj, (str, Path)) and dataio._reuse_metadata:
logger.info("Partially reuse existing metadata from %s", obj)
meta_existing = read_metadata_from_file(obj)

objdata = objectdata_provider_factory(obj, dataio, meta_existing)
objdata = objectdata_provider_factory(obj, dataio)
masterdata = dataio.config.get("masterdata")

metadata = internal.DataClassMeta(
Expand All @@ -169,18 +153,7 @@ def generate_export_metadata(
file=_get_meta_filedata(dataio, obj, objdata, fmudata, compute_md5),
tracklog=generate_meta_tracklog(),
display=_get_meta_display(dataio, objdata),
preprocessed=(
_get_meta_preprocessed_info(dataio)
if dataio.fmu_context == FmuContext.PREPROCESSED
else None
),
preprocessed=dataio.fmu_context == FmuContext.PREPROCESSED,
).model_dump(mode="json", exclude_none=True, by_alias=True)

if skip_null:
metadata = drop_nones(metadata)

return (
metadata
if not meta_existing
else glue_metadata_preprocessed(oldmeta=meta_existing, newmeta=metadata.copy())
)
return metadata if not skip_null else drop_nones(metadata)
25 changes: 1 addition & 24 deletions src/fmu/dataio/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import hashlib
import json
import os
import shutil
import uuid
from copy import deepcopy
from pathlib import Path
Expand Down Expand Up @@ -114,10 +113,7 @@ def export_file(
) -> str:
"""Export a valid object to file"""

if isinstance(obj, (Path, str)):
# special case when processing data which already has metadata
shutil.copy(obj, filename)
elif filename.suffix == ".gri" and isinstance(obj, xtgeo.RegularSurface):
if filename.suffix == ".gri" and isinstance(obj, xtgeo.RegularSurface):
obj.to_file(filename, fformat="irap_binary")
elif filename.suffix == ".csv" and isinstance(obj, (xtgeo.Polygons, xtgeo.Points)):
out = obj.copy() # to not modify incoming instance!
Expand Down Expand Up @@ -436,22 +432,3 @@ def read_metadata_from_file(filename: str | Path) -> dict:
raise OSError(f"Cannot find requested metafile: {metafile}")
with open(metafilepath) as stream:
return yaml.safe_load(stream)


def glue_metadata_preprocessed(
    oldmeta: dict[str, Any], newmeta: dict[str, Any]
) -> dict[str, Any]:
    """Glue (combine) two metadata dicts according to rule 'preprocessed'.

    The combined result keeps the original (preprocessed) metadata, but takes
    the 'fmu' and 'file' sections from the new metadata, and appends the new
    tracklog events with the last event relabelled as 'merged'. The internal
    '_preprocessed' marker is dropped from the result.

    Args:
        oldmeta: Metadata read from the previously exported preprocessed file.
        newmeta: Freshly generated metadata for the re-export.

    Returns:
        The combined metadata dictionary. Neither input is modified.
    """
    # BUGFIX: the previous shallow copy shared the 'tracklog' list with
    # oldmeta (so extend() mutated the caller's dict), and newmeta's last
    # tracklog event was rewritten in place. Deep-copy to keep inputs intact.
    meta = deepcopy(oldmeta)

    # The '_preprocessed' marker is internal and must not leak through.
    meta.pop("_preprocessed", None)

    meta["fmu"] = newmeta["fmu"]
    meta["file"] = newmeta["file"]

    new_tracklog = deepcopy(newmeta["tracklog"])
    new_tracklog[-1]["event"] = "merged"
    meta["tracklog"].extend(new_tracklog)

    return meta
83 changes: 35 additions & 48 deletions src/fmu/dataio/dataio.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from .case import InitializeCase
from .datastructure.configuration import global_configuration
from .datastructure.meta import enums
from .preprocessed import ExportPreprocessedData
from .providers._fmu import FmuProvider, get_fmu_context_from_environment

# DATAIO_EXAMPLES: Final = dataio_examples()
Expand All @@ -54,6 +55,17 @@
# ======================================================================================


def _future_warning_preprocessed() -> None:
warnings.warn(
"Using the ExportData class for re-exporting preprocessed data is no "
"longer supported. Use the dedicated ExportPreprocessedData class "
"instead. In a deprecation period the ExportPreprocessedData is used "
"under the hood when a filepath is input to ExportData. "
"Please update your script, as this will be discontinued in the future.",
FutureWarning,
)


def _validate_variable(key: str, value: type, legals: dict[str, str | type]) -> bool:
"""Use data from __annotions__ to validate that overriden var. is of legal type."""
if key not in legals:
Expand Down Expand Up @@ -368,7 +380,6 @@ class ExportData:
_pwd: Path = field(default_factory=Path, init=False)
_config_is_valid: bool = field(default=True, init=False)
_fmurun: bool = field(default=False, init=False)
_reuse_metadata: bool = field(default=False, init=False)

# Need to store these temporarily in variables until we stop
# updating state of the class also on export and generate_metadata
Expand All @@ -378,9 +389,6 @@ class ExportData:
# << NB! storing ACTUAL casepath:
_rootpath: Path = field(default_factory=Path, init=False)

# in some cases input object may change class; store the internal variable here:
_object: types.Inferrable = field(init=False)

def __post_init__(self) -> None:
logger.info("Running __post_init__ ExportData")
logger.debug("Global config is %s", prettyprint_dict(self.config))
Expand Down Expand Up @@ -694,37 +702,6 @@ def _establish_rootpath(self) -> Path:
logger.info("Running outside FMU context, using pwd as roothpath")
return self._pwd

def _check_process_object(self, obj: types.Inferrable) -> None:
    """When obj is file-like, it must be checked + assume preprocessed.

    In addition, if preprocessed, derive the name, tagname and subfolder
    from the file's existing metadata when those are not set already.
    For all cases, tie incoming obj to self._object.

    Raises:
        ValidationError: If the file does not exist, or if its metadata
            lacks the '_preprocessed' entry.
    """
    if isinstance(obj, (str, Path)):
        obj = Path(obj)
        if not obj.exists():
            raise ValidationError(f"The file {obj} does not exist.")

        # Flag that existing metadata shall be partially reused downstream.
        self._reuse_metadata = True

        currentmeta = read_metadata(obj)
        if "_preprocessed" not in currentmeta:
            # BUGFIX: the adjacent string literals previously concatenated
            # without separating spaces ("ismissing", "thepreprocessed").
            raise ValidationError(
                "The special entry for preprocessed data <_preprocessed> is "
                "missing in the metadata. A possible solution is to rerun "
                "the preprocessed export."
            )
        preprocessed = currentmeta["_preprocessed"]

        # Only fill in values the user has not already set explicitly.
        self.name = self.name or preprocessed.get("name", "")
        self.tagname = self.tagname or preprocessed.get("tagname", "")
        self.subfolder = self.subfolder or preprocessed.get("subfolder", "")

    self._object = obj

def _get_fmu_provider(self) -> FmuProvider:
assert isinstance(self.fmu_context, FmuContext)
return FmuProvider(
Expand Down Expand Up @@ -773,16 +750,22 @@ def generate_metadata(

self._update_check_settings(kwargs)

if isinstance(obj, (str, Path)):
assert self.casepath is not None
_future_warning_preprocessed()
return ExportPreprocessedData(
config=self.config,
casepath=self.casepath,
is_observation=self.is_observation,
).generate_metadata(obj)

self._classification = self._get_classification()
self._rep_include = self._get_rep_include()

self._check_process_object(obj) # obj --> self._object
self._update_fmt_flag()
fmudata = self._get_fmu_provider() if self._fmurun else None
# TODO: refactor the argument list for generate_export_metadata; we do not need
# both self._object and self...
self._metadata = generate_export_metadata(
self._object, self, fmudata, compute_md5=compute_md5
obj, self, fmudata, compute_md5=compute_md5
)

logger.info("The metadata are now ready!")
Expand Down Expand Up @@ -816,18 +799,24 @@ def export(
warnings.warn(
"The return_symlink option is deprecated and can safely be removed."
)
self.generate_metadata(obj, compute_md5=True, **kwargs)
metadata = self._metadata
logger.info("Object type is: %s", type(self._object)) # from generate_metadata
if isinstance(obj, (str, Path)):
assert self.casepath is not None
_future_warning_preprocessed()
return ExportPreprocessedData(
config=self.config,
casepath=self.casepath,
is_observation=self.is_observation,
).export(obj)

metadata = self.generate_metadata(obj, compute_md5=True, **kwargs)
logger.info("Object type is: %s", type(obj))

outfile = Path(metadata["file"]["absolute_path"])
# create output folders if they don't exist
outfile.parent.mkdir(parents=True, exist_ok=True)
metafile = outfile.parent / ("." + str(outfile.name) + ".yml")
metafile = outfile.parent / f".{outfile.name}.yml"

logger.info("Export to file using flag: <%s>", self._usefmtflag)
# md5sum is already present in the metadata
export_file(self._object, outfile, flag=self._usefmtflag)
export_file(obj, outfile, flag=self._usefmtflag)
logger.info("Actual file is: %s", outfile)

if self._config_is_valid:
Expand All @@ -836,6 +825,4 @@ def export(
else:
warnings.warn("Data will be exported, but without metadata.", UserWarning)

self._metadata = metadata

return str(outfile)
8 changes: 1 addition & 7 deletions src/fmu/dataio/datastructure/_internal/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,6 @@ class FMUModel(BaseModel):
case: meta.FMUCase


class PreprocessedInfo(BaseModel):
    """Info recorded for preprocessed data exports.

    NOTE(review): populated from ExportData.name/.tagname/.subfolder and
    serialized under the '_preprocessed' metadata key via the DataClassMeta
    field alias — confirm against _metadata.py.
    """

    name: str
    tagname: str
    subfolder: str


class Context(BaseModel, use_enum_values=True):
    """Model holding the FMU context stage (e.g. PREPROCESSED).

    use_enum_values=True makes pydantic store/serialize the enum's value
    rather than the enum member itself.
    """

    stage: FmuContext

Expand Down Expand Up @@ -172,7 +166,7 @@ class DataClassMeta(JsonSchemaMetadata):
file: meta.File
display: meta.Display
tracklog: List[meta.TracklogEvent]
preprocessed: Optional[PreprocessedInfo] = Field(alias="_preprocessed")
preprocessed: Optional[bool] = Field(alias="_preprocessed", default=None)


class CaseSchema(JsonSchemaMetadata):
Expand Down
Loading

0 comments on commit 40a0573

Please sign in to comment.