From e5b1536f7b900a718d38d44edb7e0bca4e81b068 Mon Sep 17 00:00:00 2001
From: Kristen Thyng <kthyng@gmail.com>
Date: Fri, 26 Jul 2024 15:34:50 -0700
Subject: [PATCH] precommit

---
 docs/whats_new.md                             |   2 +-
 .../models/opendrift/utils.py                 | 351 ++++++++++++------
 2 files changed, 245 insertions(+), 108 deletions(-)

diff --git a/docs/whats_new.md b/docs/whats_new.md
index 0d4853d..0039602 100644
--- a/docs/whats_new.md
+++ b/docs/whats_new.md
@@ -6,7 +6,7 @@
 * depth z should be negative! Fixed this in tests.
 * added `start_time_end`, which adds OpenDrift capability for starting drifters over linear time frame
 * fixed so unique log file is output for each simulation even if run in a script, and has the same name as `output_file`.
-* small fix to hist plot
+* small fix to histogram plot
 
 ## v0.8.4 (April 24, 2024)
 
diff --git a/particle_tracking_manager/models/opendrift/utils.py b/particle_tracking_manager/models/opendrift/utils.py
index 382c221..7c4443a 100644
--- a/particle_tracking_manager/models/opendrift/utils.py
+++ b/particle_tracking_manager/models/opendrift/utils.py
@@ -3,13 +3,12 @@
 # also installed ipython, h5py, cf_xarray, pynco
 # copied to /mnt/vault/ciofs/HINDCAST/ciofs_kerchunk_to2012.json
 
-# import kerchunk.hdf
-import fsspec
 from pathlib import Path
+
+import fsspec
+
 from kerchunk.combine import MultiZarrToZarr
-# from kerchunk.df import refs_to_dataframe
-# import ujson
-# import numpy as np
+
 
 def make_ciofs_kerchunk(start, end):
     """_summary_
@@ -28,21 +27,23 @@ def make_ciofs_kerchunk(start, end):
 
     output_dir_single_files = "/home/kristen/projects/kerchunk_setup/ciofs"
 
-    fs2 = fsspec.filesystem('')  # local file system to save final jsons to
+    fs2 = fsspec.filesystem("")  # local file system to save final jsons to
 
     # select the single file Jsons to combine
-    json_list = sorted(fs2.glob(f"{output_dir_single_files}/*.json"))  # combine single json files
-    json_list = [j for j in json_list if Path(j).stem>=start and Path(j).stem<=end]
+    json_list = sorted(
+        fs2.glob(f"{output_dir_single_files}/*.json")
+    )  # combine single json files
+    json_list = [j for j in json_list if Path(j).stem >= start and Path(j).stem <= end]
 
     # Multi-file JSONs
-    # This code uses the output generated above to create a single ensemble dataset, 
+    # This code uses the output generated above to create a single ensemble dataset,
     # with one set of references pointing to all of the chunks in the individual files.
     # `coo_map = {"ocean_time": "cf:ocean_time"}` is necessary so that both the time
     # values and units are read and interpreted instead of just the values.
 
     def fix_fill_values(out):
         """Fix problem when fill_value and scara both equal 0.0.
-        
+
         If the fill value and the scalar value are both 0, nan is filled instead. This fixes that.
         """
 
@@ -52,33 +53,94 @@ def fix_fill_values(out):
         return out
 
     def postprocess(out):
+        """postprocess function to fix fill values"""
         out = fix_fill_values(out)
         return out
 
-
     mzz = MultiZarrToZarr(
         json_list,
         concat_dims=["ocean_time"],
-        identical_dims= ['lat_rho', 'lon_rho', "lon_psi", "lat_psi",
-                        "lat_u", "lon_u", "lat_v", "lon_v", 
-                        "Akk_bak","Akp_bak","Akt_bak","Akv_bak","Cs_r","Cs_w",
-                        "FSobc_in","FSobc_out","Falpha","Fbeta","Fgamma","Lm2CLM",
-                        "Lm3CLM", "LnudgeM2CLM", "LnudgeM3CLM", "LnudgeTCLM",
-                        "LsshCLM", "LtracerCLM", "LtracerSrc", "LuvSrc",
-                        "LwSrc", "M2nudg", "M2obc_in", "M2obc_out", "M3nudg",
-                        "M3obc_in", "M3obc_out", "Tcline", "Tnudg","Tobc_in", "Tobc_out",
-                        "Vstretching", "Vtransform", "Znudg", "Zob", "Zos", "angle",
-                        "dstart", "dt", "dtfast", "el", "f", "gamma2", "grid", "h",
-                        "hc", "mask_psi", "mask_rho", "mask_u", "mask_v", "nHIS", "nRST",
-                        "nSTA", "ndefHIS",  "ndtfast", "ntimes", "pm", "pn", "rdrg", 
-                        "rdrg2", "rho0", "spherical", "theta_b", "theta_s", "xl",
-                        ],
-        coo_map = {"ocean_time": "cf:ocean_time",
-                },
+        identical_dims=[
+            "lat_rho",
+            "lon_rho",
+            "lon_psi",
+            "lat_psi",
+            "lat_u",
+            "lon_u",
+            "lat_v",
+            "lon_v",
+            "Akk_bak",
+            "Akp_bak",
+            "Akt_bak",
+            "Akv_bak",
+            "Cs_r",
+            "Cs_w",
+            "FSobc_in",
+            "FSobc_out",
+            "Falpha",
+            "Fbeta",
+            "Fgamma",
+            "Lm2CLM",
+            "Lm3CLM",
+            "LnudgeM2CLM",
+            "LnudgeM3CLM",
+            "LnudgeTCLM",
+            "LsshCLM",
+            "LtracerCLM",
+            "LtracerSrc",
+            "LuvSrc",
+            "LwSrc",
+            "M2nudg",
+            "M2obc_in",
+            "M2obc_out",
+            "M3nudg",
+            "M3obc_in",
+            "M3obc_out",
+            "Tcline",
+            "Tnudg",
+            "Tobc_in",
+            "Tobc_out",
+            "Vstretching",
+            "Vtransform",
+            "Znudg",
+            "Zob",
+            "Zos",
+            "angle",
+            "dstart",
+            "dt",
+            "dtfast",
+            "el",
+            "f",
+            "gamma2",
+            "grid",
+            "h",
+            "hc",
+            "mask_psi",
+            "mask_rho",
+            "mask_u",
+            "mask_v",
+            "nHIS",
+            "nRST",
+            "nSTA",
+            "ndefHIS",
+            "ndtfast",
+            "ntimes",
+            "pm",
+            "pn",
+            "rdrg",
+            "rdrg2",
+            "rho0",
+            "spherical",
+            "theta_b",
+            "theta_s",
+            "xl",
+        ],
+        coo_map={
+            "ocean_time": "cf:ocean_time",
+        },
         postprocess=postprocess,
     )
 
-
     # to keep in memory
     out = mzz.translate()
 
@@ -91,7 +153,7 @@ def make_nwgoa_kerchunk(start, end):
     Parameters
     ----------
     start, end : str
-        Should be something like "1999-01-02" for YYYY-MM-DD 
+        Should be something like "1999-01-02" for YYYY-MM-DD
 
     Returns
     -------
@@ -102,110 +164,185 @@ def make_nwgoa_kerchunk(start, end):
     # this version of the daily json files has the grid file merged
     output_dir_single_files = "/home/kristen/projects/kerchunk_setup/nwgoa_with_grids"
 
-    fs2 = fsspec.filesystem('')  # local file system to save final jsons to
+    fs2 = fsspec.filesystem("")  # local file system to save final jsons to
 
     # select the single file Jsons to combine
-    json_list = sorted(fs2.glob(f"{output_dir_single_files}/nwgoa*.json"))  # combine single json files
-    json_list = [j for j in json_list if Path(j).stem.split("nwgoa_")[1]>=start and Path(j).stem.split("nwgoa_")[1]<=end]
-
-    # Merge one of the model output files and the grid file as shown here, then
-    # saved the merged file, then used that in the MultiZarr below to get this to work
-    # https://fsspec.github.io/kerchunk/tutorial.html#merging-variables-across-jsons
-    # json_list += fs2.glob(f"{output_dir_single_files}/Cook_Inlet_grid_1.json") # combine single json files
-
-
-
-    # include grid file 
-    # grid_url_classic = Path("/mnt/depot/data/packrat/prod/aoos/nwgoa/Cook_Inlet_grid_1.nc")
-    # outf = out_base / Path(grid_url_classic.stem).with_suffix(".json")
-    # if not outf.exists():
-    #     refs_static = NetCDF3ToZarr(str(grid_url_classic))
-    #     with fs2.open(outf, 'wb') as f:
-    #         f.write(ujson.dumps(refs_static.translate()).encode());
-
-
-    # # I merged one of the model output files and the grid file as shown here, then
-    # # saved the merged file, then used that in the MultiZarr below to get this to work
-    # # https://fsspec.github.io/kerchunk/tutorial.html#merging-variables-across-jsons
-    # json_list = fs2.glob(f"{output_dir_single_files}/nwgoa_1999-01-02.json") # combine single json files
-    # json_list += fs2.glob(f"{output_dir_single_files}/Cook_Inlet_grid_1.json") # combine single json files
-
-    # d = merge_vars(json_list)  # returns dict, as does translate()
-
-
-# # write merged version
-# outf = out_base / pathlib.Path("nwgoa_1999-01-02.json")
-# if not outf.exists():
-#     # refs_static = NetCDF3ToZarr(str(grid_url_classic))
-#     with fs2.open(outf, 'wb') as f:
-#         f.write(ujson.dumps(d).encode());
-
-
-
-    # Multi-file JSONs
-    # This code uses the output generated above to create a single ensemble dataset, 
-    # with one set of references pointing to all of the chunks in the individual files.
-    # `coo_map = {"ocean_time": "cf:ocean_time"}` is necessary so that both the time
-    # values and units are read and interpreted instead of just the values.
-
-
-    # account for double compression 
+    json_list = sorted(
+        fs2.glob(f"{output_dir_single_files}/nwgoa*.json")
+    )  # combine single json files
+    json_list = [
+        j
+        for j in json_list
+        if Path(j).stem.split("nwgoa_")[1] >= start
+        and Path(j).stem.split("nwgoa_")[1] <= end
+    ]
+
+    # account for double compression
     # Look at individual variables in the files to see what needs to be changed with
     # h5dump -d ocean_time -p /mnt/depot/data/packrat/prod/aoos/nwgoa/processed/1999/nwgoa_1999-02-01.nc
     def preprocess(refs):
+        """preprocess function to fix fill values"""
         for k in list(refs):
-            if k.endswith('/.zarray'):
-                refs[k] = refs[k].replace('"filters":[{"elementsize":8,"id":"shuffle"}]',
-                                        '"filters":[{"elementsize":8,"id":"shuffle"},{"id": "zlib", "level":8}]')
-                refs[k] = refs[k].replace('"filters":[{"elementsize":4,"id":"shuffle"}]',
-                                        '"filters":[{"elementsize":4,"id":"shuffle"},{"id": "zlib", "level":8}]')
+            if k.endswith("/.zarray"):
+                refs[k] = refs[k].replace(
+                    '"filters":[{"elementsize":8,"id":"shuffle"}]',
+                    '"filters":[{"elementsize":8,"id":"shuffle"},{"id": "zlib", "level":8}]',
+                )
+                refs[k] = refs[k].replace(
+                    '"filters":[{"elementsize":4,"id":"shuffle"}]',
+                    '"filters":[{"elementsize":4,"id":"shuffle"},{"id": "zlib", "level":8}]',
+                )
         return refs
 
     import zarr
+
     def add_time_attr(out):
+        """add time attributes to the ocean_time variable"""
         out_ = zarr.open(out)
         out_.ocean_time.attrs["axis"] = "T"
         return out
 
     def postprocess(out):
+        """postprocess function to fix fill values"""
         out = add_time_attr(out)
         return out
 
-
     mzz = MultiZarrToZarr(
         json_list,
         concat_dims=["ocean_time"],
-        identical_dims= ['lat_rho', 'lon_rho', "lon_psi", "lat_psi",
-                        "lat_u", "lon_u", "lat_v", "lon_v", 
-                        "Akk_bak","Akp_bak","Akt_bak","Akv_bak","Cs_r","Cs_w",
-                        "FSobc_in","FSobc_out","Falpha","Fbeta","Fgamma","Lm2CLM",
-                        "Lm3CLM", "LnudgeM2CLM", "LnudgeM3CLM", "LnudgeTCLM",
-                        "LsshCLM", "LtracerCLM", "LtracerSrc", "LuvSrc",
-                        "LwSrc", "M2nudg", "M2obc_in", "M2obc_out", "M3nudg",
-                        "M3obc_in", "M3obc_out", "Tcline", "Tnudg","Tobc_in", "Tobc_out",
-                        "Vstretching", "Vtransform", "Znudg", "Zob", "Zos", "angle",
-                        "dstart", "dt", "dtfast", "el", "f", "gamma2", "grid", "h",
-                        "hc", "mask_psi", "mask_rho", "mask_u", "mask_v", "nHIS", "nRST",
-                        "nSTA", "ndefHIS",  "ndtfast", "ntimes", "pm", "pn", "rdrg", 
-                        "rdrg2", "rho0", "spherical", "theta_b", "theta_s", "xl",
-                        "Charnok_alpha", "CrgBan_cw", "JLTS", "JPRJ", "LuvSponge", "P1", "P2", "P3", "P4", 
-    "PLAT", "PLONG", "ROTA", "XOFF", "YOFF", "Zos_hsig_alpha", "depthmax", "depthmin",
-    "dfdy", "dmde", "dndx", "f0", "gls_Kmin", "gls_Pmin", "gls_c1", "gls_c2", 
-    "gls_c3m",  "gls_c3p", "gls_cmu0", "gls_m", "gls_n", "gls_p", "gls_sigk", "gls_sigp",
-    "h_mask", "hraw", "nAVG", "ndefAVG", "nl_visc2", "ntsAVG", "sz_alpha", "wtype_grid",
-    "x_psi", "x_rho", "x_u", "x_v", "y_psi", "y_rho", "y_u", "y_v",
-                        ],
-        coo_map = {"ocean_time": "cf:ocean_time",
-                #    "eta_rho": list(np.arange(1044))
-                },
+        identical_dims=[
+            "lat_rho",
+            "lon_rho",
+            "lon_psi",
+            "lat_psi",
+            "lat_u",
+            "lon_u",
+            "lat_v",
+            "lon_v",
+            "Akk_bak",
+            "Akp_bak",
+            "Akt_bak",
+            "Akv_bak",
+            "Cs_r",
+            "Cs_w",
+            "FSobc_in",
+            "FSobc_out",
+            "Falpha",
+            "Fbeta",
+            "Fgamma",
+            "Lm2CLM",
+            "Lm3CLM",
+            "LnudgeM2CLM",
+            "LnudgeM3CLM",
+            "LnudgeTCLM",
+            "LsshCLM",
+            "LtracerCLM",
+            "LtracerSrc",
+            "LuvSrc",
+            "LwSrc",
+            "M2nudg",
+            "M2obc_in",
+            "M2obc_out",
+            "M3nudg",
+            "M3obc_in",
+            "M3obc_out",
+            "Tcline",
+            "Tnudg",
+            "Tobc_in",
+            "Tobc_out",
+            "Vstretching",
+            "Vtransform",
+            "Znudg",
+            "Zob",
+            "Zos",
+            "angle",
+            "dstart",
+            "dt",
+            "dtfast",
+            "el",
+            "f",
+            "gamma2",
+            "grid",
+            "h",
+            "hc",
+            "mask_psi",
+            "mask_rho",
+            "mask_u",
+            "mask_v",
+            "nHIS",
+            "nRST",
+            "nSTA",
+            "ndefHIS",
+            "ndtfast",
+            "ntimes",
+            "pm",
+            "pn",
+            "rdrg",
+            "rdrg2",
+            "rho0",
+            "spherical",
+            "theta_b",
+            "theta_s",
+            "xl",
+            "Charnok_alpha",
+            "CrgBan_cw",
+            "JLTS",
+            "JPRJ",
+            "LuvSponge",
+            "P1",
+            "P2",
+            "P3",
+            "P4",
+            "PLAT",
+            "PLONG",
+            "ROTA",
+            "XOFF",
+            "YOFF",
+            "Zos_hsig_alpha",
+            "depthmax",
+            "depthmin",
+            "dfdy",
+            "dmde",
+            "dndx",
+            "f0",
+            "gls_Kmin",
+            "gls_Pmin",
+            "gls_c1",
+            "gls_c2",
+            "gls_c3m",
+            "gls_c3p",
+            "gls_cmu0",
+            "gls_m",
+            "gls_n",
+            "gls_p",
+            "gls_sigk",
+            "gls_sigp",
+            "h_mask",
+            "hraw",
+            "nAVG",
+            "ndefAVG",
+            "nl_visc2",
+            "ntsAVG",
+            "sz_alpha",
+            "wtype_grid",
+            "x_psi",
+            "x_rho",
+            "x_u",
+            "x_v",
+            "y_psi",
+            "y_rho",
+            "y_u",
+            "y_v",
+        ],
+        coo_map={
+            "ocean_time": "cf:ocean_time",
+            #    "eta_rho": list(np.arange(1044))
+        },
         preprocess=preprocess,
         postprocess=postprocess,
     )
 
     # to keep in memory
     out = mzz.translate()
-    # import pdb; pdb.set_trace()
-    # merge_vars([out, fs2.glob(f"{output_dir_single_files}/Cook_Inlet_grid_1.json")])
-    # d = merge_vars(json_list)  # returns dict, as does translate()
 
     return out