From bec764718c6dbc41a5b56cd640004cf4174deb45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Therese=20Natter=C3=B8y?= <61694854+tnatt@users.noreply.github.com> Date: Mon, 22 Apr 2024 09:08:47 +0200 Subject: [PATCH] CLN: Refactor filestem construction (#623) --- src/fmu/dataio/providers/_filedata.py | 79 +++++++++++-------- .../test_units/test_filedataprovider_class.py | 55 +++++++------ 2 files changed, 73 insertions(+), 61 deletions(-) diff --git a/src/fmu/dataio/providers/_filedata.py b/src/fmu/dataio/providers/_filedata.py index 9b74b92bc..142c289fa 100644 --- a/src/fmu/dataio/providers/_filedata.py +++ b/src/fmu/dataio/providers/_filedata.py @@ -6,6 +6,7 @@ from __future__ import annotations +import re from dataclasses import dataclass from enum import Enum from pathlib import Path @@ -110,47 +111,55 @@ def _add_filename_to_path(self, path: Path) -> Path: return (path / stem).with_suffix(path.suffix + self.objdata.extension) def _get_filestem(self) -> str: - """Construct the file""" + """ + Construct the filestem string as a combination of various + attributes; parent, name, tagname and time information. 
+ A '--' is used to separate the non-empty components, and a + filestem containing all components will look like this: + filestem = 'parent--name--tagname--time1_time0' + """ if not self.name: raise ValueError("The 'name' entry is missing for constructing a file name") if not self.objdata.time0 and self.objdata.time1: raise ValueError("Not legal: 'time0' is missing while 'time1' is present") - stem = self.name.lower() - if self.dataio.tagname: - stem += "--" + self.dataio.tagname.lower() - if self.dataio.parent: - stem = self.dataio.parent.lower() + "--" + stem - - if self.objdata.time0 and not self.objdata.time1: - stem += "--" + (str(self.objdata.time0)[0:10]).replace("-", "") - - elif self.objdata.time0 and self.objdata.time1: - monitor = (str(self.objdata.time1)[0:10]).replace("-", "") - base = (str(self.objdata.time0)[0:10]).replace("-", "") - if monitor == base: - warn( - "The monitor date and base date are equal", UserWarning - ) # TODO: consider add clocktimes in such cases? - if self.dataio.filename_timedata_reverse: # class variable - stem += "--" + base + "_" + monitor - else: - stem += "--" + monitor + "_" + base - - # remove unwanted characters - stem = stem.replace(".", "_").replace(" ", "_") - - # avoid multiple double underscores - while "__" in stem: - stem = stem.replace("__", "_") - - # treat norwegian special letters - # BUG(?): What about germen letter like "Ü"? 
- stem = stem.replace("æ", "ae") - stem = stem.replace("ø", "oe") - stem = stem.replace("å", "aa") - return stem.lower() + filestem_order = ( + self.dataio.parent, + self.name, + self.dataio.tagname, + self._get_timepart_for_filename(), + ) + # join non-empty parts with '--' + filestem = "--".join((p for p in filestem_order if p)) + filestem = self._sanitize_the_filestem(filestem) + return filestem.lower() + + def _get_timepart_for_filename(self) -> str: + if self.objdata.time0 is None: + return "" + t0 = self.objdata.time0.strftime("%Y%m%d") + if not self.objdata.time1: + return t0 + t1 = self.objdata.time1.strftime("%Y%m%d") + return "_".join( + (t1, t0) if not self.dataio.filename_timedata_reverse else (t0, t1) + ) + + @staticmethod + def _sanitize_the_filestem(filestem: str) -> str: + """ + Clean up the filestem; remove unwanted characters, treat + norwegian special letters and remove multiple underscores + """ + filestem = ( + filestem.replace(".", "_") + .replace(" ", "_") + .replace("æ", "ae") + .replace("ø", "oe") + .replace("å", "aa") + ) + return re.sub(r"__+", "_", filestem) def _get_forcefolder_if_absolute(self) -> Path | None: if self.dataio.forcefolder.startswith("/"): diff --git a/tests/test_units/test_filedataprovider_class.py b/tests/test_units/test_filedataprovider_class.py index 3529d2a23..8809419b3 100644 --- a/tests/test_units/test_filedataprovider_class.py +++ b/tests/test_units/test_filedataprovider_class.py @@ -2,6 +2,7 @@ import os from copy import deepcopy +from datetime import datetime from pathlib import Path import pytest @@ -21,72 +22,72 @@ "name", "tag", "parent", - "2020-01-01", - "2022-01-02", + datetime.strptime("2020-01-01", "%Y-%m-%d"), + datetime.strptime("2022-01-02", "%Y-%m-%d"), "parent--name--tag--20220102_20200101", ), ( "name", "", "", - "2020-01-01", - "2022-01-02", + datetime.strptime("2020-01-01", "%Y-%m-%d"), + datetime.strptime("2022-01-02", "%Y-%m-%d"), "name--20220102_20200101", ), ( "name", "", "", - "2022-01-02", 
- "", + datetime.strptime("2022-01-02", "%Y-%m-%d"), + None, "name--20220102", ), ( "name", "", "", - "", - "", + None, + None, "name", ), ( "name", "", "", - 20210101, - 20220102, + datetime.strptime("2021-01-01", "%Y-%m-%d"), + datetime.strptime("2022-01-02", "%Y-%m-%d"), "name--20220102_20210101", ), ( "name with spaces", "", "", - "", - "", + None, + None, "name_with_spaces", ), ( "name with double space", "", "", - "", - "", + None, + None, "name_with_double_space", ), ( "name. some fm", "", "", - "", - "", + None, + None, "name_some_fm", ), ( "name with many .. . spaces", "", "", - "", - "", + None, + None, "name_with_many_spaces", ), ], @@ -128,16 +129,16 @@ def test_get_filestem( "", "tag", "parent", - "2020-01-01", - "2022-01-02", + datetime.strptime("2020-01-01", "%Y-%m-%d"), + datetime.strptime("2022-01-02", "%Y-%m-%d"), "'name' entry is missing", ), ( "name", "tag", "parent", - "", - "2020-01-01", + None, + datetime.strptime("2020-01-01", "%Y-%m-%d"), "'time1' is missing while", ), ], @@ -218,8 +219,10 @@ def test_filedata_provider(regsurf, tmp_path): objdata.name = "name" objdata.efolder = "efolder" objdata.extension = ".ext" - objdata.time0 = "t1" - objdata.time1 = "t2" + t1 = "19000101" + t2 = "20240101" + objdata.time0 = datetime.strptime(t1, "%Y%m%d") + objdata.time1 = datetime.strptime(t2, "%Y%m%d") fdata = FileDataProvider(cfg, objdata) filemeta = fdata.get_metadata() @@ -227,9 +230,9 @@ def test_filedata_provider(regsurf, tmp_path): assert isinstance(filemeta, meta.File) assert ( str(filemeta.relative_path) - == "share/results/efolder/parent--name--tag--t2_t1.ext" + == f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext" ) - absdata = tmp_path / "share/results/efolder/parent--name--tag--t2_t1.ext" + absdata = tmp_path / f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext" assert filemeta.absolute_path == absdata