From 92f98fded9fa00f6c4f5c75d2b75b28771812731 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Thu, 22 Aug 2024 14:40:25 -0700 Subject: [PATCH 01/13] Configuration and Storage options refactoring --- echodataflow/stages/echodataflow_trigger.py | 122 ++++---------------- echodataflow/utils/config_utils.py | 54 ++++++++- 2 files changed, 70 insertions(+), 106 deletions(-) diff --git a/echodataflow/stages/echodataflow_trigger.py b/echodataflow/stages/echodataflow_trigger.py index 120fec1..30bd0ae 100644 --- a/echodataflow/stages/echodataflow_trigger.py +++ b/echodataflow/stages/echodataflow_trigger.py @@ -17,23 +17,19 @@ import json from pathlib import Path from typing import Optional, Union -from fastapi.encoders import jsonable_encoder +from fastapi.encoders import jsonable_encoder from prefect import flow from prefect.task_runners import SequentialTaskRunner -from prefect.blocks.core import Block -from prefect.variables import Variable from echodataflow.aspects.singleton_echodataflow import Singleton_Echodataflow from echodataflow.models.datastore import Dataset from echodataflow.models.pipeline import Recipe from echodataflow.utils import log_util -from echodataflow.utils.config_utils import ( - check_config, - extract_config, - get_storage_options, - load_block, -) +from echodataflow.utils.config_utils import (check_config, + handle_storage_options, + parse_dynamic_parameters, + parse_yaml_config) from .subflows.initialization_flow import init_flow @@ -82,48 +78,15 @@ def echodataflow_trigger( print("Pipeline output:", pipeline_output) """ - if storage_options: - # Check if storage_options is a Block (fsspec storage) and convert it to a dictionary - if isinstance(storage_options, Block): - storage_options = get_storage_options(storage_options=storage_options) - elif isinstance(storage_options, dict) and storage_options.get("block_name"): - block = load_block( - name=storage_options.get("block_name"), type=storage_options.get("type") - ) - storage_options = get_storage_options(block) - else: - storage_options = {} - - if isinstance(dataset_config, Path): - dataset_config = str(dataset_config) - if isinstance(logging_config, Path): - logging_config = str(logging_config) - if isinstance(pipeline_config, Path): - pipeline_config = str(pipeline_config) + storage_options = handle_storage_options(storage_options=storage_options) + + dataset_config_dict = parse_yaml_config(config=dataset_config, storage_options=storage_options) + logging_config_dict = parse_yaml_config(config=logging_config, storage_options=storage_options) + pipeline_config_dict = parse_yaml_config(config=pipeline_config, storage_options=storage_options) + if isinstance(json_data_path, Path): json_data_path = str(json_data_path) - if isinstance(dataset_config, str): - if not dataset_config.endswith((".yaml", ".yml")): - raise ValueError("Configuration file must be a YAML!") - dataset_config_dict = extract_config(dataset_config, storage_options) - elif isinstance(dataset_config, dict): - dataset_config_dict = dataset_config - - if isinstance(pipeline_config, str): - if not pipeline_config.endswith((".yaml", ".yml")): - raise ValueError("Configuration file must be a YAML!") - pipeline_config_dict = extract_config(pipeline_config, storage_options) - elif isinstance(pipeline_config, dict): - pipeline_config_dict = pipeline_config - - if isinstance(logging_config, str): - if not logging_config.endswith((".yaml", ".yml")): - raise ValueError("Configuration file must be a YAML!") - logging_config_dict = extract_config(logging_config, 
storage_options) - else: - logging_config_dict = logging_config - log_util.log( msg={ "msg": f"Dataset Configuration Loaded For This Run", @@ -144,7 +107,6 @@ def echodataflow_trigger( }, eflogging=dataset_config_dict.get("logging"), ) - print(dataset_config_dict) log_util.log( msg={ @@ -172,62 +134,18 @@ def echodataflow_trigger( check_config(dataset_config_dict, pipeline_config_dict) pipeline = Recipe(**pipeline_config_dict) dataset = Dataset(**dataset_config_dict) - - if options.get("storage_options_override") and not options["storage_options_override"]: - storage_options = {} - if not storage_options: - if dataset.output.storage_options: - if not dataset.output.storage_options.anon: - block = load_block( - name=dataset.output.storage_options.block_name, - type=dataset.output.storage_options.type, - ) - dataset.output.storage_options_dict = get_storage_options(block) - else: - dataset.output.storage_options_dict = {"anon": dataset.output.storage_options.anon} - - if dataset.args.storage_options: - if not dataset.args.storage_options.anon: - block = load_block( - name=dataset.args.storage_options.block_name, - type=dataset.args.storage_options.type, - ) - dataset.args.storage_options_dict = get_storage_options(block) - else: - dataset.args.storage_options_dict = {"anon": dataset.args.storage_options.anon} - if dataset.args.group: - if dataset.args.group.storage_options: - if not dataset.args.group.storage_options.anon: - block = load_block( - name=dataset.args.group.storage_options.block_name, - type=dataset.args.group.storage_options.type, - ) - dataset.args.group.storage_options_dict = get_storage_options(block) - else: - dataset.args.group.storage_options_dict = { - "anon": dataset.args.group.storage_options.anon - } - else: + + if options.get("storage_options_override", False): dataset.output.storage_options_dict = storage_options dataset.args.storage_options_dict = storage_options dataset.args.group.storage_options_dict = storage_options + else: + dataset.output.storage_options_dict = handle_storage_options(storage_options=dataset.output.storage_options) + dataset.args.storage_options_dict = handle_storage_options(storage_options=dataset.args.storage_options) + dataset.args.group.storage_options_dict = handle_storage_options(storage_options=dataset.args.group.storage_options) + + edf = Singleton_Echodataflow(log_file=logging_config_dict, pipeline=pipeline, dataset=dataset) - print("\nInitiliazing Singleton Object") - Singleton_Echodataflow(log_file=logging_config_dict, pipeline=pipeline, dataset=dataset) - - if dataset.args.parameters and dataset.args.parameters.file_name and dataset.args.parameters.file_name == "VAR_RUN_NAME": - var: Variable = Variable.get("run_name", default=None) - if not var: - raise ValueError("No variable found for name `run_name`") - else: - dataset.args.parameters.file_name = var.value - - # Change made to enable dynamic execution using an extension - if options and options.get("file_name"): - dataset.args.parameters.file_name = options.get("file_name") + dataset = parse_dynamic_parameters(dataset, options=options) - if options and options.get("run_name"): - dataset.name = options.get("run_name") - - print("\nReading Configurations") return init_flow(config=dataset, pipeline=pipeline, json_data_path=json_data_path) \ No newline at end of file diff --git a/echodataflow/utils/config_utils.py b/echodataflow/utils/config_utils.py index 0899cc2..b6b442d 100644 --- a/echodataflow/utils/config_utils.py +++ b/echodataflow/utils/config_utils.py @@ -33,6 +33,7 @@ 
import itertools as it import json import os +from pathlib import Path import re from typing import Any, Coroutine, Dict, List, Literal, Optional, Union from zipfile import ZipFile @@ -44,6 +45,7 @@ from prefect.filesystems import Block from prefect_aws import AwsCredentials from prefect_azure import AzureCosmosDbCredentials +from prefect.variables import Variable from echodataflow.aspects.echodataflow_aspect import echodataflow from echodataflow.models.datastore import Dataset, StorageOptions, StorageType @@ -538,17 +540,61 @@ def get_storage_options(storage_options: Block = None) -> Dict[str, Any]: def handle_storage_options(storage_options: Optional[Dict] = None) -> Dict: + storage_options_dict: Dict[str, Any] = {} + if storage_options: if isinstance(storage_options, Block): - return get_storage_options(storage_options=storage_options) + storage_options_dict = get_storage_options(storage_options=storage_options) elif isinstance(storage_options, dict) and storage_options.get("block_name"): block = load_block( name=storage_options.get("block_name"), type=storage_options.get("type") ) - return get_storage_options(block) + storage_options_dict = get_storage_options(block) + elif isinstance(storage_options, StorageOptions): + if not storage_options.anon: + block = load_block( + name=storage_options.block_name, + type=storage_options.type, + ) + storage_options_dict = get_storage_options(block) + else: + storage_options_dict = {"anon": storage_options.anon} + else: + storage_options_dict = storage_options if storage_options and len(storage_options.keys()) > 0 else {} + + return storage_options_dict + +def parse_yaml_config(config: Union[dict, str, Path], storage_options: Dict[str, Any]) -> Dict: + + if isinstance(config, Path): + config = str(config) + + if isinstance(config, str): + if not config.endswith((".yaml", ".yml")): + raise ValueError("Configuration file must be a YAML!") + config_dict = extract_config(config, storage_options) + elif isinstance(config, dict): + return config + + return config_dict + +def parse_dynamic_parameters(dataset: Dataset, options: Dict[str, Any]) -> Dataset: + if dataset.args.parameters and dataset.args.parameters.file_name and dataset.args.parameters.file_name == "VAR_RUN_NAME": + var: Variable = Variable.get("run_name", default=None) + if not var: + raise ValueError("No variable found for name `run_name`") else: - return storage_options if storage_options and len(storage_options.keys()) > 0 else {} - return {} + dataset.args.parameters.file_name = var.value + + # Change made to enable dynamic execution using an extension + if options: + if options.get("file_name"): + dataset.args.parameters.file_name = options.get("file_name") + + if options.get("run_name"): + dataset.name = options.get("run_name") + + return dataset def load_block(name: str = None, type: StorageType = None): """ From 34400c48950170cfca5eeb3719ba3c2b0f46e430 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Thu, 22 Aug 2024 15:01:38 -0700 Subject: [PATCH 02/13] config accessed before assignment --- echodataflow/utils/config_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/echodataflow/utils/config_utils.py b/echodataflow/utils/config_utils.py index b6b442d..f19d924 100644 --- a/echodataflow/utils/config_utils.py +++ b/echodataflow/utils/config_utils.py @@ -572,11 +572,9 @@ def parse_yaml_config(config: Union[dict, str, Path], storage_options: Dict[str, if isinstance(config, str): if not config.endswith((".yaml", ".yml")): raise ValueError("Configuration file 
must be a YAML!") - config_dict = extract_config(config, storage_options) - elif isinstance(config, dict): - return config + config = extract_config(config, storage_options) - return config_dict + return config def parse_dynamic_parameters(dataset: Dataset, options: Dict[str, Any]) -> Dataset: if dataset.args.parameters and dataset.args.parameters.file_name and dataset.args.parameters.file_name == "VAR_RUN_NAME": From 33daa787caaddf1cbe04a635805d898513381d77 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 13:27:32 -0700 Subject: [PATCH 03/13] Prefect 3.0 Upgrade --- deployment/docker_trigger.py | 4 +- echodataflow/docker_trigger.py | 4 +- echodataflow/extensions/file_monitor.py | 1 - echodataflow/stages/echodataflow_trigger.py | 6 +- .../stages/subflows/initialization_flow.py | 105 ++---------------- echodataflow/stages/subflows/slice_store.py | 1 - echodataflow/stages/subflows/write_output.py | 4 +- 7 files changed, 18 insertions(+), 107 deletions(-) diff --git a/deployment/docker_trigger.py b/deployment/docker_trigger.py index 1729115..4bcd63f 100644 --- a/deployment/docker_trigger.py +++ b/deployment/docker_trigger.py @@ -4,10 +4,10 @@ from echoflow import echoflow_start from echoflow.stages.echoflow_trigger import echoflow_trigger from prefect import flow -from prefect.task_runners import SequentialTaskRunner +from prefect.task_runners import ThreadPoolTaskRunner from typing import Any, Dict, Optional, Union -@flow(name="Docker-Trigger", task_runner=SequentialTaskRunner()) +@flow(name="Docker-Trigger", task_runner=ThreadPoolTaskRunner(max_workers=1)) def docker_trigger( dataset_config: Union[dict, str, Path], pipeline_config: Union[dict, str, Path], diff --git a/echodataflow/docker_trigger.py b/echodataflow/docker_trigger.py index fe259b0..f1c9d67 100644 --- a/echodataflow/docker_trigger.py +++ b/echodataflow/docker_trigger.py @@ -1,10 +1,10 @@ from pathlib import Path from echodataflow.stages.echodataflow_trigger import echodataflow_trigger from prefect import flow -from prefect.task_runners import SequentialTaskRunner +from prefect.task_runners import ThreadPoolTaskRunner from typing import Any, Dict, Optional, Union -@flow(name="docker-trigger-latest", task_runner=SequentialTaskRunner()) +@flow(name="docker-trigger-latest", task_runner=ThreadPoolTaskRunner(max_workers=1)) def docker_trigger( dataset_config: Union[dict, str, Path], pipeline_config: Union[dict, str, Path], diff --git a/echodataflow/extensions/file_monitor.py b/echodataflow/extensions/file_monitor.py index 946d7ef..15dd6ac 100644 --- a/echodataflow/extensions/file_monitor.py +++ b/echodataflow/extensions/file_monitor.py @@ -20,7 +20,6 @@ from echodataflow.models.datastore import StorageType from echodataflow.models.run import EDFRun, FileDetails from echodataflow.utils.config_utils import glob_url, load_block -from prefect.task_runners import SequentialTaskRunner @task def execute_flow( diff --git a/echodataflow/stages/echodataflow_trigger.py b/echodataflow/stages/echodataflow_trigger.py index 30bd0ae..5ff371d 100644 --- a/echodataflow/stages/echodataflow_trigger.py +++ b/echodataflow/stages/echodataflow_trigger.py @@ -20,21 +20,21 @@ from fastapi.encoders import jsonable_encoder from prefect import flow -from prefect.task_runners import SequentialTaskRunner +from prefect.task_runners import ThreadPoolTaskRunner from echodataflow.aspects.singleton_echodataflow import Singleton_Echodataflow from echodataflow.models.datastore import Dataset from echodataflow.models.pipeline import Recipe from 
echodataflow.utils import log_util from echodataflow.utils.config_utils import (check_config, - handle_storage_options, parse_dynamic_parameters, parse_yaml_config) +from echodataflow.utils.filesystem_utils import handle_storage_options from .subflows.initialization_flow import init_flow -@flow(name="Echodataflow", task_runner=SequentialTaskRunner()) +@flow(name="Echodataflow", task_runner=ThreadPoolTaskRunner(max_workers=1)) def echodataflow_trigger( dataset_config: Union[dict, str, Path], pipeline_config: Union[dict, str, Path], diff --git a/echodataflow/stages/subflows/initialization_flow.py b/echodataflow/stages/subflows/initialization_flow.py index 6601308..db3a9f0 100644 --- a/echodataflow/stages/subflows/initialization_flow.py +++ b/echodataflow/stages/subflows/initialization_flow.py @@ -25,7 +25,7 @@ from distributed import Client, LocalCluster from fastapi.encoders import jsonable_encoder from prefect import flow -from prefect.task_runners import SequentialTaskRunner +from prefect.task_runners import ThreadPoolTaskRunner from prefect_dask import DaskTaskRunner from echodataflow.aspects.echodataflow_aspect import echodataflow @@ -45,9 +45,10 @@ process_output_groups, store_json_output) from echodataflow.utils.function_utils import dynamic_function_call +from echodataflow.utils.xr_utils import combine_datasets, fetch_slice_from_store -@flow(name="Initialization", task_runner=SequentialTaskRunner()) +@flow(name="Initialization", task_runner=ThreadPoolTaskRunner(max_workers=1)) @echodataflow(type="FLOW") def init_flow(pipeline: Recipe, config: Dataset, json_data_path: Optional[str] = None): """ @@ -418,27 +419,16 @@ def get_input_from_store_folder(config: Dataset): store_5_output = process_store_folder(config, store_5, end_time) for name, gr in store_18_output.group.items(): - - edf_18 = gr.data[0] - store_18 = xr.open_mfdataset(paths=[ed.out_path for ed in gr.data], engine="zarr", - combine="by_coords", - data_vars="minimal", - coords="minimal", - compat="override").compute() - store_18 = store_18.sel(ping_time=slice(pd.to_datetime(edf_18.start_time, unit="ns"), pd.to_datetime(edf_18.end_time, unit="ns"))) + edf_18 = fetch_slice_from_store(edf_group=gr, config=config) + store_18 = edf_18.data if not store_5_output.group.get(name): raise ValueError(f"No window found in MVBS store (5 channels); window missing -> {name}") - edf_5 = store_5_output.group[name].data[0] - store_5 = xr.open_mfdataset(paths=[ed.out_path for ed in store_5_output.group[name].data], engine="zarr", - combine="by_coords", - data_vars="minimal", - coords="minimal", - compat="override").compute() - store_5 = store_5.sel(ping_time=slice(pd.to_datetime(edf_5.start_time, unit="ns"), pd.to_datetime(edf_5.end_time, unit="ns"))) + edf_5 = fetch_slice_from_store(edf_group=store_5_output.group[name], config=config) + store_5 = edf_5.data - edf_5.data, edf_5.data_ref = combine_datasets(store_18, store_5) + edf_5.data, edf_5.data_ref = combine_datasets(store_18=store_18, store_5=store_5, config=config) combo_output.group[name] = gr.model_copy() combo_output.group[name].data = [edf_5] @@ -449,7 +439,7 @@ def get_input_from_store_folder(config: Dataset): eflogging=config.logging, ) - for dim, size in edf_5.data_ref.dims.items(): + for dim, size in edf_5.data.dims.items(): log_util.log( msg={"msg": f"{ dim } : {size}", "mod_name": __file__, "func_name": "Mask"}, use_dask=False, @@ -458,83 +448,6 @@ def get_input_from_store_folder(config: Dataset): return combo_output -def process_xrd(ds: xr.Dataset, freq_wanted = [120000, 38000, 
18000]) -> xr.Dataset: - ds = ds.sel(depth=slice(None, 590)) - - ch_wanted = [int((np.abs(ds["frequency_nominal"]-freq)).argmin()) for freq in freq_wanted] - ds = ds.isel( - channel=ch_wanted - ) - return ds - -def combine_datasets(store_18: xr.Dataset, store_5: xr.Dataset) -> Tuple[torch.Tensor, xr.Dataset]: - ds_32k_120k = None - ds_18k = None - combined_ds = None - try: - partial_channel_name = ["ES18"] - ds_18k = extract_channels(store_18, partial_channel_name) - partial_channel_name = ["ES38", "ES120"] - ds_32k_120k = extract_channels(store_5, partial_channel_name) - except Exception as e: - partial_channel_name = ["ES18"] - ds_18k = extract_channels(store_5, partial_channel_name) - partial_channel_name = ["ES38", "ES120"] - ds_32k_120k = extract_channels(store_18, partial_channel_name) - - if not ds_18k or not ds_32k_120k: - raise ValueError("Could not find the required channels in the datasets") - - ds_18k = process_xrd(ds_18k, freq_wanted=[18000]) - ds_32k_120k = process_xrd(ds_32k_120k, freq_wanted=[120000, 38000]) - - combined_ds = xr.merge([ds_18k["Sv"], ds_32k_120k["Sv"], - ds_18k['latitude'], ds_18k['longitude'], - ds_18k["frequency_nominal"], ds_32k_120k["frequency_nominal"] - ]) - combined_ds.attrs = ds_18k.attrs - - combined_ds = ( - combined_ds - .transpose("channel", "depth", "ping_time") - .sel(depth=slice(None, 590)) - ) - - depth = combined_ds['depth'] - ping_time = combined_ds['ping_time'] - - # Create a tensor with R=120 kHz, G=38 kHz, B=18 kHz mapping - red_channel = extract_channels(combined_ds, ["ES120"]) - green_channel = extract_channels(combined_ds, ["ES38"]) - blue_channel = extract_channels(combined_ds, ["ES18"]) - - tensor = xr.concat([red_channel, green_channel, blue_channel], dim='channel') - tensor['channel'] = ['R', 'G', 'B'] - tensor = tensor.assign_coords({'depth': depth, 'ping_time': ping_time}) - - mvbs_tensor = torch.tensor(tensor['Sv'].values, dtype=torch.float32) - - return (mvbs_tensor, combined_ds) - -def extract_channels(dataset: xr.Dataset, partial_names: List[str]) -> xr.Dataset: - """ - Extracts multiple channels data from the given xarray dataset using partial channel names. - - Args: - dataset (xr.Dataset): The input xarray dataset containing multiple channels. - partial_names (List[str]): The list of partial names of the channels to extract. - - Returns: - xr.Dataset: The dataset containing only the specified channels data. 
- """ - matching_channels = [] - for partial_name in partial_names: - matching_channels.extend([channel for channel in dataset.channel.values if partial_name in str(channel)]) - - if len(matching_channels) == 0: - raise ValueError(f"No channels found matching any of '{partial_names}'") - - return dataset.sel(channel=matching_channels) def process_store_folder(config: Dataset, store: str, end_time: datetime): output: Output = Output() diff --git a/echodataflow/stages/subflows/slice_store.py b/echodataflow/stages/subflows/slice_store.py index 4d7f1cc..04f0768 100644 --- a/echodataflow/stages/subflows/slice_store.py +++ b/echodataflow/stages/subflows/slice_store.py @@ -19,7 +19,6 @@ from typing import Dict, Optional from prefect import flow -from prefect.task_runners import SequentialTaskRunner from prefect.variables import Variable import xarray as xr import pandas as pd diff --git a/echodataflow/stages/subflows/write_output.py b/echodataflow/stages/subflows/write_output.py index 3f596fb..2e7f513 100644 --- a/echodataflow/stages/subflows/write_output.py +++ b/echodataflow/stages/subflows/write_output.py @@ -1,7 +1,7 @@ from typing import Dict, Optional from prefect import flow import xarray as xr -from prefect.task_runners import SequentialTaskRunner +from prefect.task_runners import ThreadPoolTaskRunner import zarr.sync from echodataflow.models.datastore import Dataset from echodataflow.models.output_model import ErrorObject, Group @@ -11,7 +11,7 @@ from numcodecs import Zlib import zarr.storage -@flow(task_runner=SequentialTaskRunner()) +@flow(task_runner=ThreadPoolTaskRunner(max_workers=1)) def write_output(groups: Dict[str, Group], config: Dataset, stage: Stage, prev_stage: Optional[Stage]): log_util.log( msg={ From 78008756b4f10384b4d829226f40ab833dc5fb1a Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:00:17 -0700 Subject: [PATCH 04/13] Handle external storage --- echodataflow/extensions/file_downloader.py | 4 +- echodataflow/extensions/file_monitor.py | 5 +- echodataflow/stages/echodataflow.py | 6 +- echodataflow/utils/config_utils.py | 148 ++++----------------- echodataflow/utils/filesystem_utils.py | 105 +++++++++++++++ pyproject.toml | 2 +- 6 files changed, 143 insertions(+), 127 deletions(-) create mode 100644 echodataflow/utils/filesystem_utils.py diff --git a/echodataflow/extensions/file_downloader.py b/echodataflow/extensions/file_downloader.py index f9a42db..c6d7cb7 100644 --- a/echodataflow/extensions/file_downloader.py +++ b/echodataflow/extensions/file_downloader.py @@ -15,10 +15,12 @@ from prefect.client.schemas.objects import FlowRun, StateType from prefect.states import Cancelled -from echodataflow.utils.config_utils import glob_url, handle_storage_options +from echodataflow.utils.config_utils import glob_url from echodataflow.utils.file_utils import extract_fs, make_temp_folder import shlex +from echodataflow.utils.filesystem_utils import handle_storage_options + @task def download_temp_file(file_url: str, storage_options: Dict[str, Any], dest_dir: str, delete_on_transfer: bool, replace: bool) -> str: diff --git a/echodataflow/extensions/file_monitor.py b/echodataflow/extensions/file_monitor.py index 15dd6ac..19c5f38 100644 --- a/echodataflow/extensions/file_monitor.py +++ b/echodataflow/extensions/file_monitor.py @@ -19,7 +19,8 @@ from echodataflow.models.datastore import StorageType from echodataflow.models.run import EDFRun, FileDetails -from echodataflow.utils.config_utils import glob_url, load_block +from echodataflow.utils.config_utils import 
glob_url +from echodataflow.utils.filesystem_utils import load_block @task def execute_flow( @@ -111,7 +112,7 @@ def file_monitor( new_run = datetime.now(tz=timezone.utc).isoformat() edfrun: EDFRun = None - try: + try: edfrun = load_block( name=block_name, type=StorageType.EDFRUN, diff --git a/echodataflow/stages/echodataflow.py b/echodataflow/stages/echodataflow.py index b08aca1..ed88e5b 100644 --- a/echodataflow/stages/echodataflow.py +++ b/echodataflow/stages/echodataflow.py @@ -50,6 +50,7 @@ from echodataflow.utils.config_utils import load_block from echodataflow.stages.echodataflow_trigger import echodataflow_trigger +from echodataflow.utils.filesystem_utils import handle_storage_options def check_internet_connection(host="8.8.8.8", port=53, timeout=5): @@ -236,7 +237,7 @@ def echodataflow_start( # Try loading the Prefect config block try: - load_block(name="echodataflow-config", type=StorageType.ECHODATAFLOW) + handle_storage_options({'block_name':"echodataflow-config", 'type':StorageType.ECHODATAFLOW}) except ValueError: print( "\nNo Prefect Cloud Configuration found. Creating Prefect Local named 'echodataflow-local'. Please add your prefect cloud " @@ -546,8 +547,7 @@ def load_credential_configuration(sync: bool = False): current_config = asyncio.run(current_config) if current_config is not None: for base in current_config.blocks: - block = load_block(base.name, base.type) - block_dict = dict(block) + block_dict = handle_storage_options(base) block_dict["name"] = base.name block_dict["active"] = base.active block_dict["options"] = json.dumps(base.options) diff --git a/echodataflow/utils/config_utils.py b/echodataflow/utils/config_utils.py index f19d924..8b99404 100644 --- a/echodataflow/utils/config_utils.py +++ b/echodataflow/utils/config_utils.py @@ -35,27 +35,19 @@ import os from pathlib import Path import re -from typing import Any, Coroutine, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union from zipfile import ZipFile -import nest_asyncio import yaml from dateutil import parser from prefect import task -from prefect.filesystems import Block -from prefect_aws import AwsCredentials -from prefect_azure import AzureCosmosDbCredentials from prefect.variables import Variable from echodataflow.aspects.echodataflow_aspect import echodataflow -from echodataflow.models.datastore import Dataset, StorageOptions, StorageType -from echodataflow.models.echodataflow_config import EchodataflowConfig +from echodataflow.models.datastore import Dataset, StorageOptions from echodataflow.models.pipeline import Stage -from echodataflow.models.run import EDFRun from echodataflow.utils.file_utils import extract_fs, isFile -nest_asyncio.apply() - @task def extract_config( @@ -511,123 +503,39 @@ def club_raw_files( all_files.append(files) return all_files - -def get_storage_options(storage_options: Block = None) -> Dict[str, Any]: - """ - Get storage options from a Block. - - Parameters: - storage_options (Block, optional): A block containing storage options. - - Returns: - Dict[str, Any]: Dictionary containing storage options. - - Example: - aws_credentials = AwsCredentials(...) 
- storage_opts = get_storage_options(aws_credentials) - """ - storage_options_dict: Dict[str, Any] = {} - if storage_options is not None: - if isinstance(storage_options, AwsCredentials): - storage_options_dict["key"] = storage_options.aws_access_key_id - storage_options_dict[ - "secret" - ] = storage_options.aws_secret_access_key.get_secret_value() - if storage_options.aws_session_token: - storage_options_dict["token"] = storage_options.aws_session_token - - return storage_options_dict - - -def handle_storage_options(storage_options: Optional[Dict] = None) -> Dict: - storage_options_dict: Dict[str, Any] = {} - - if storage_options: - if isinstance(storage_options, Block): - storage_options_dict = get_storage_options(storage_options=storage_options) - elif isinstance(storage_options, dict) and storage_options.get("block_name"): - block = load_block( - name=storage_options.get("block_name"), type=storage_options.get("type") - ) - storage_options_dict = get_storage_options(block) - elif isinstance(storage_options, StorageOptions): - if not storage_options.anon: - block = load_block( - name=storage_options.block_name, - type=storage_options.type, - ) - storage_options_dict = get_storage_options(block) - else: - storage_options_dict = {"anon": storage_options.anon} - else: - storage_options_dict = storage_options if storage_options and len(storage_options.keys()) > 0 else {} - - return storage_options_dict - def parse_yaml_config(config: Union[dict, str, Path], storage_options: Dict[str, Any]) -> Dict: - - if isinstance(config, Path): - config = str(config) - - if isinstance(config, str): - if not config.endswith((".yaml", ".yml")): - raise ValueError("Configuration file must be a YAML!") - config = extract_config(config, storage_options) - + if isinstance(config, Path) or isinstance(config, str): + config = convert_path_to_str(config) + validate_yaml_file(config) + return extract_config(config, storage_options) return config -def parse_dynamic_parameters(dataset: Dataset, options: Dict[str, Any]) -> Dataset: - if dataset.args.parameters and dataset.args.parameters.file_name and dataset.args.parameters.file_name == "VAR_RUN_NAME": - var: Variable = Variable.get("run_name", default=None) - if not var: - raise ValueError("No variable found for name `run_name`") - else: - dataset.args.parameters.file_name = var.value - - # Change made to enable dynamic execution using an extension - if options: - if options.get("file_name"): - dataset.args.parameters.file_name = options.get("file_name") - - if options.get("run_name"): - dataset.name = options.get("run_name") - - return dataset - -def load_block(name: str = None, type: StorageType = None): - """ - Load a block of a specific type by name. +def convert_path_to_str(config: Union[str, Path]) -> str: + return str(config) if isinstance(config, Path) else config - Parameters: - name (str, optional): The name of the block to load. - type (StorageType, optional): The type of the block to load. +def validate_yaml_file(config_str) -> None: + if not config_str.endswith((".yaml", ".yml")): + raise ValueError("Configuration file must be a YAML!") - Returns: - block: The loaded block. +def parse_dynamic_parameters(dataset: Dataset, options: Dict[str, Any]) -> Dataset: + update_file_name_from_variable(dataset) + apply_options_to_dataset(dataset, options) + return dataset - Raises: - ValueError: If name or type is not provided. 
+def update_file_name_from_variable(dataset: Dataset) -> None: + parameters = dataset.args.parameters + if parameters.file_name == "VAR_RUN_NAME": + run_name_var = Variable.get("run_name", default=None) + if not run_name_var: + raise ValueError("No variable found for name `run_name`") + parameters.file_name = run_name_var.value - Example: - loaded_aws_credentials = load_block(name="my-aws-creds", type=StorageType.AWS) - """ - if name is None or type is None: - raise ValueError("Cannot load block without name") - - if type == StorageType.AWS or type == StorageType.AWS.value: - coro = AwsCredentials.load(name=name) - elif type == StorageType.AZCosmos or type == StorageType.AZCosmos.value: - coro = AzureCosmosDbCredentials.load(name=name) - elif type == StorageType.ECHODATAFLOW or type == StorageType.ECHODATAFLOW.value: - coro = EchodataflowConfig.load(name=name) - elif type == StorageType.EDFRUN or type == StorageType.EDFRUN.value: - coro = EDFRun.load(name=name) - - if isinstance(coro, Coroutine): - block = nest_asyncio.asyncio.run(coro) - else: - block = coro - return block +def apply_options_to_dataset(dataset: Dataset, options: Dict[str, Any]) -> None: + if "file_name" in options: + dataset.args.parameters.file_name = options["file_name"] + if "run_name" in options: + dataset.name = options["run_name"] + def sanitize_external_params(config: Dataset, external_params: Dict[str, Any]): """ diff --git a/echodataflow/utils/filesystem_utils.py b/echodataflow/utils/filesystem_utils.py new file mode 100644 index 0000000..e6bc82d --- /dev/null +++ b/echodataflow/utils/filesystem_utils.py @@ -0,0 +1,105 @@ +from typing import Any, Dict, Optional, Union + +import nest_asyncio +from prefect.filesystems import Block +from prefect_aws import AwsCredentials +from prefect_azure import AzureCosmosDbCredentials + +from echodataflow.models.datastore import StorageOptions, StorageType +from echodataflow.models.echodataflow_config import (BaseConfig, + EchodataflowConfig) +from echodataflow.models.run import EDFRun + + +def handle_storage_options(storage_options: Optional[Union[Dict, StorageOptions, Block, BaseConfig]] = None) -> Dict: + if isinstance(storage_options, Block): + return _handle_block(storage_options) + elif isinstance(storage_options, dict): + return _handle_dict_options(storage_options) + elif isinstance(storage_options, StorageOptions): + return _handle_storage_options_class(storage_options) + elif isinstance(storage_options, BaseConfig): + return _handle_baseconfig_options_class(storage_options) + else: + return _handle_default(storage_options) + +def _handle_block(block: Block) -> Dict: + return get_storage_options(storage_options=block) + +def _handle_dict_options(options: Dict[str, Any]) -> Dict: + if "block_name" in options: + block = load_block(name=options["block_name"], type=options.get("type", None)) + return get_storage_options(block) + return options if options else {} + +def _handle_storage_options_class(options: StorageOptions) -> Dict: + if not options.anon: + block = load_block(name=options.block_name, type=options.type) + return get_storage_options(block) + return {"anon": options.anon} + +def _handle_baseconfig_options_class(options: BaseConfig) -> Dict: + block = load_block(name=options.name, type=options.type) + return dict(block) + +def _handle_default(options: Dict[str, Any]): + return options if options else {} + + +def get_storage_options(storage_options: Block = None) -> Dict[str, Any]: + """ + Get storage options from a Block. 
+
+    Parameters:
+        storage_options (Block, optional): A block containing storage options.
+
+    Returns:
+        Dict[str, Any]: Dictionary containing storage options.
+
+    Example:
+        aws_credentials = AwsCredentials(...)
+        storage_opts = get_storage_options(aws_credentials)
+    """
+    storage_options_dict: Dict[str, Any] = {}
+    if storage_options is not None:
+        if isinstance(storage_options, AwsCredentials):
+            storage_options_dict["key"] = storage_options.aws_access_key_id
+            storage_options_dict[
+                "secret"
+            ] = storage_options.aws_secret_access_key.get_secret_value()
+            if storage_options.aws_session_token:
+                storage_options_dict["token"] = storage_options.aws_session_token
+
+    return storage_options_dict
+
+def load_block(name: str, stype: StorageType):
+    """
+    Load a block of a specific type by name.
+
+    Parameters:
+        name (str, optional): The name of the block to load.
+        type (StorageType, optional): The type of the block to load.
+
+    Returns:
+        block: The loaded block.
+
+    Raises:
+        ValueError: If name or type is not provided.
+
+    Example:
+        loaded_aws_credentials = load_block(name="my-aws-creds", type=StorageType.AWS)
+    """
+    if name is None or stype is None:
+        raise ValueError("Cannot load block without name or type")
+
+    loader_map = {
+        StorageType.AWS: AwsCredentials,
+        StorageType.AZCosmos: AzureCosmosDbCredentials,
+        StorageType.ECHODATAFLOW: EchodataflowConfig,
+        StorageType.EDFRUN: EDFRun
+    }
+
+    if stype in loader_map:
+        return nest_asyncio.asyncio.run(loader_map[stype].load(name=name))
+    else:
+        raise ValueError(f"Unsupported storage type: {stype}")
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index e805c72..fb42a10 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ write_to = "echodataflow/version.py"
 line-length = 100
 
 [tool.pytest.ini_options]
-testpaths = ["echodataflow/tests/flow_tests"]
+testpaths = ["echodataflow/tests/flow_tests", "echodataflow/tests/unit"]
 addopts = "--cov=./ --cov-report=term --cov-report=xml"
 filterwarnings = [
     "ignore::DeprecationWarning"

From 57d6f24ba873e94d181674edfb6c4c8e4806aaa3 Mon Sep 17 00:00:00 2001
From: Soham Butala
Date: Wed, 11 Sep 2024 14:01:11 -0700
Subject: [PATCH 05/13] separation of Util functions

---
 echodataflow/tests/unit/__init__.py |   0
 echodataflow/utils/flow_utils.py    |  57 +++++++++
 echodataflow/utils/xr_utils.py      | 173 ++++++++++++++++++++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 echodataflow/tests/unit/__init__.py
 create mode 100644 echodataflow/utils/flow_utils.py
 create mode 100644 echodataflow/utils/xr_utils.py

diff --git a/echodataflow/tests/unit/__init__.py b/echodataflow/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/echodataflow/utils/flow_utils.py b/echodataflow/utils/flow_utils.py
new file mode 100644
index 0000000..2e87d58
--- /dev/null
+++ b/echodataflow/utils/flow_utils.py
@@ -0,0 +1,57 @@
+from typing import Dict
+from echodataflow.models.datastore import Dataset
+from echodataflow.models.output_model import Group
+from echodataflow.models.pipeline import Stage
+from echodataflow.utils import log_util
+from pathlib import Path
+import torch
+from echodataflow.utils.xr_utils import fetch_slice_from_store
+from src.model.BinaryHakeModel import BinaryHakeModel
+
+
+def load_model(stage: Stage, config: Dataset):
+    try:
+        log_util.log(
+            msg={"msg": f"Loading model now ---->", "mod_name": __file__, "func_name": "Mask_Prediction"},
+            use_dask=stage.options["use_dask"],
+            eflogging=config.logging,
+        )
+        model_path = f"/home/exouser/hake_data/model/backup_model_weights/binary_hake_model_1.0m_bottom_offset_1.0m_depth_2017_2019_ver_1.ckpt"
+
+        # Load binary hake models with weights
+        model = BinaryHakeModel("placeholder_experiment_name",
+                                Path("placeholder_score_tensor_dir"),
+                                "placeholder_tensor_log_dir", 0).eval()
+
+        model.load_state_dict(torch.load(
+            stage.external_params.get('model_path', model_path)
+        )["state_dict"])
+
+        log_util.log(
+            msg={"msg": f"Model loaded successfully", "mod_name": __file__, "func_name": "Mask_Prediction"},
+            use_dask=stage.options["use_dask"],
+            eflogging=config.logging,
+        )
+    except Exception as e:
+        log_util.log(
+            msg={"msg": "", "mod_name": __file__, "func_name": "Mask_Prediction"},
+            use_dask=stage.options["use_dask"],
+            eflogging=config.logging,
+            error=e
+        )
+        raise e
+
+    return model
+
+def load_data_in_memory(config: Dataset, groups: Dict[str, Group]):
+
+    for _, gr in groups.items():
+        # From a store (list of file paths) fetch the slice of data and keep it in memory
+        if gr.metadata and gr.metadata.is_store_folder and len(gr.data) > 0:
+            edf = fetch_slice_from_store(edf_group=gr, config=config)
+            if edf.data.notnull().any():
+                gr.data = [edf]
+                gr.metadata.is_store_folder = False
+            else:
+                continue
+    return groups
\ No newline at end of file
diff --git a/echodataflow/utils/xr_utils.py b/echodataflow/utils/xr_utils.py
new file mode 100644
index 0000000..84174bd
--- /dev/null
+++ b/echodataflow/utils/xr_utils.py
@@ -0,0 +1,173 @@
+
+
+
+from typing import Any, Dict, List, Tuple
+
+import torch
+from echodataflow.models.datastore import Dataset
+from echodataflow.models.output_model import EchodataflowObject, Group
+import xarray as xr
+import pandas as pd
+import numpy as np
+
+from echodataflow.models.pipeline import Stage
+from echodataflow.utils import log_util
+
+
+def fetch_slice_from_store(edf_group: Group, config: Dataset, options: Dict[str, Any] = None, start_time: str = None, end_time: str = None) -> EchodataflowObject:
+    edf = edf_group.data[0]
+    default_options = {
+        "engine":"zarr",
+        "combine":"by_coords",
+        "data_vars":"minimal",
+        "coords":"minimal",
+        "compat":"override",
+        "storage_options": config.args.storage_options_dict} if options is None else options
+
+    if options:
+        default_options.update(options)
+
+    if start_time is None:
+        start_time = edf.start_time
+    if end_time is None:
+        end_time = edf.end_time
+
+    store = xr.open_mfdataset(paths=[ed.out_path for ed in edf_group.data], **default_options).compute()
+    store_slice = store.sel(ping_time=slice(pd.to_datetime(start_time, unit="ns"), pd.to_datetime(end_time, unit="ns")))
+
+    if store_slice["ping_time"].size == 0:
+        del store
+        del store_slice
+        raise ValueError(f"No data available between {start_time} and {end_time}")
+
+    del store
+    edf.data = store_slice
+
+    return edf
+
+def assemble_da(data_array: xr.DataArray, dims: Dict[str, Any]):
+    da = xr.DataArray(
+        data_array, dims=dims.keys()
+    )
+    da = da.assign_coords(dims
+    )
+    return da
+
+def process_xrd(ds: xr.Dataset, freq_wanted = [120000, 38000, 18000]) -> xr.Dataset:
+    ds = ds.sel(depth=slice(None, 590))
+
+    ch_wanted = [int((np.abs(ds["frequency_nominal"]-freq)).argmin()) for freq in freq_wanted]
+    ds = ds.isel(
+        channel=ch_wanted
+    )
+    return ds
+
+def combine_datasets(store_18: xr.Dataset, store_5: xr.Dataset, config: Dataset) -> Tuple[xr.Dataset, torch.Tensor]:
+    ds_32k_120k = None
+    ds_18k = None
+    combined_ds = None
+    try:
+        partial_channel_name = ["ES18"]
+        ds_18k = extract_channels(store_18, partial_channel_name)
+        partial_channel_name = ["ES38", "ES120"]
+        ds_32k_120k = extract_channels(store_5, partial_channel_name)
+    except Exception as e:
+        partial_channel_name = ["ES18"]
+        ds_18k = extract_channels(store_5, partial_channel_name)
+        partial_channel_name = ["ES38", "ES120"]
+        ds_32k_120k = extract_channels(store_18, partial_channel_name)
+
+    if not ds_18k or not ds_32k_120k:
+        raise ValueError("Could not find the required channels in the datasets")
+
+    ds_18k = process_xrd(ds_18k, freq_wanted=[18000])
+    ds_32k_120k = process_xrd(ds_32k_120k, freq_wanted=[120000, 38000])
+
+    combined_ds = xr.merge([ds_18k["Sv"], ds_32k_120k["Sv"],
+                    ds_18k['latitude'], ds_18k['longitude'],
+                    ds_18k["frequency_nominal"], ds_32k_120k["frequency_nominal"]
+                    ])
+    combined_ds.attrs = ds_18k.attrs
+
+
+    return convert_to_tensor(combined_ds=combined_ds, config=config)
+
+def convert_to_tensor(combined_ds: xr.Dataset, config: Dataset, freq_wanted: List[int] = [120000, 38000, 18000]) -> Tuple[xr.Dataset, torch.Tensor]:
+    """
+    Convert dataset to a tensor and return the tensor and the dataset.
+    """
+
+    ch_wanted = [int((np.abs(combined_ds["frequency_nominal"]-freq)).argmin()) for freq in freq_wanted]
+
+    log_util.log(
+        msg={"msg": f"Channel order {ch_wanted}", "mod_name": __file__, "func_name": "xr_utils.convert_to_tensor"},
+        use_dask=False,
+        eflogging=config.logging,
+    )
+
+    depth = combined_ds['depth']
+    ping_time = combined_ds['ping_time']
+
+    # Create a tensor with R=120 kHz, G=38 kHz, B=18 kHz mapping
+    red_channel = extract_channels(combined_ds, ["ES120"])
+    green_channel = extract_channels(combined_ds, ["ES38"])
+    blue_channel = extract_channels(combined_ds, ["ES18"])
+
+    ds = xr.concat([red_channel, green_channel, blue_channel], dim='channel')
+    ds['channel'] = ['R', 'G', 'B']
+    ds = ds.assign_coords({'depth': depth, 'ping_time': ping_time})
+
+    ds = (
+        ds
+        .transpose("channel", "depth", "ping_time")
+        .isel(channel=ch_wanted)
+    )
+
+    mvbs_tensor = torch.tensor(ds['Sv'].values, dtype=torch.float32)
+
+    da_MVBS_tensor = torch.clip(
+        mvbs_tensor.clone().detach().to(torch.float16),
+        min=-70,
+        max=-36,
+    )
+    log_util.log(
+        msg={"msg": f"converted and clipped tensor", "mod_name": __file__, "func_name": "xr_utils.convert_to_tensor"},
+        use_dask=False,
+        eflogging=config.logging,
+    )
+
+    # Replace NaN values with min Sv
+    da_MVBS_tensor[torch.isnan(da_MVBS_tensor)] = -70
+
+    MVBS_tensor_normalized = (
+        (da_MVBS_tensor - (-70.0)) / (-36.0 - (-70.0)) * 255.0
+    )
+    input_tensor = MVBS_tensor_normalized.unsqueeze(0).float()
+    log_util.log(
+        msg={"msg": f"Normalized tensor", "mod_name": __file__, "func_name": "xr_utils.convert_to_tensor"},
+        use_dask=False,
+        eflogging=config.logging,
+    )
+
+    return (ds, input_tensor)
+
+
+def extract_channels(dataset: xr.Dataset, partial_names: List[str]) -> xr.Dataset:
+    """
+    Extracts multiple channels data from the given xarray dataset using partial channel names.
+
+    Args:
+        dataset (xr.Dataset): The input xarray dataset containing multiple channels.
+        partial_names (List[str]): The list of partial names of the channels to extract.
+
+    Returns:
+        xr.Dataset: The dataset containing only the specified channels data.
+ """ + matching_channels = [] + for partial_name in partial_names: + matching_channels.extend([channel for channel in dataset.channel.values if partial_name in str(channel)]) + + if len(matching_channels) == 0: + raise ValueError(f"No channels found matching any of '{partial_names}'") + + return dataset.sel(channel=matching_channels) \ No newline at end of file From b3bb1a57a023805ef010176120dc7f326575da68 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:02:04 -0700 Subject: [PATCH 06/13] Mask prediction refactoring to remove redundant flows --- .../stages/subflows/mask_prediction.py | 392 +++--------------- 1 file changed, 50 insertions(+), 342 deletions(-) diff --git a/echodataflow/stages/subflows/mask_prediction.py b/echodataflow/stages/subflows/mask_prediction.py index 8a0ee75..395be54 100644 --- a/echodataflow/stages/subflows/mask_prediction.py +++ b/echodataflow/stages/subflows/mask_prediction.py @@ -16,22 +16,22 @@ Date: August 22, 2023 """ from collections import defaultdict -from pathlib import Path from typing import Dict, Optional -from prefect import flow, task import torch import xarray as xr -import numpy as np +from prefect import flow, task -import pandas as pd from echodataflow.aspects.echodataflow_aspect import echodataflow from echodataflow.models.datastore import Dataset -from echodataflow.models.output_model import EchodataflowObject, ErrorObject, Group +from echodataflow.models.output_model import (EchodataflowObject, ErrorObject, + Group) from echodataflow.models.pipeline import Stage from echodataflow.utils import log_util -from echodataflow.utils.file_utils import get_out_zarr, get_working_dir, get_zarr_list, isFile -from src.model.BinaryHakeModel import BinaryHakeModel +from echodataflow.utils.file_utils import (get_out_zarr, get_working_dir, + get_zarr_list, isFile) +from echodataflow.utils.flow_utils import load_data_in_memory, load_model +from echodataflow.utils.xr_utils import assemble_da, convert_to_tensor @flow @@ -68,66 +68,12 @@ def echodataflow_mask_prediction( futures = defaultdict(list) - try: - log_util.log( - msg={"msg": f"Loading model now ---->", "mod_name": __file__, "func_name": "Mask_Prediction"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - model_path = f"/home/exouser/hake_data/model/backup_model_weights/binary_hake_model_1.0m_bottom_offset_1.0m_depth_2017_2019_ver_1.ckpt" - - # Load binary hake models with weights - model = BinaryHakeModel("placeholder_experiment_name", - Path("placeholder_score_tensor_dir"), - "placeholder_tensor_log_dir", 0).eval() - - log_util.log( - msg={"msg": f"Loading model at", "mod_name": __file__, "func_name": "Mask_Prediction"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - log_util.log( - msg={"msg": f"{stage.external_params.get('model_path', model_path)}", "mod_name": __file__, "func_name": "Mask_Prediction"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - model.load_state_dict(torch.load( - stage.external_params.get('model_path', model_path) - )["state_dict"]) - - log_util.log( - msg={"msg": f"Model loaded succefully", "mod_name": __file__, "func_name": "Mask_Prediction"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - except Exception as e: - log_util.log( - msg={"msg": "", "mod_name": __file__, "func_name": "Mask_Prediction"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - error=e - ) - raise e + model = load_model(stage=stage, config=config) - for name, gr in 
groups.items(): - if gr.metadata and gr.metadata.is_store_folder and len(gr.data) > 0: - edf = gr.data[0] - store = xr.open_mfdataset(paths=[ed.out_path for ed in gr.data], engine="zarr", - combine="by_coords", - data_vars="minimal", - coords="minimal", - compat="override").compute() - edf.data = store.sel(ping_time=slice(pd.to_datetime(edf.start_time, unit="ns"), pd.to_datetime(edf.end_time, unit="ns"))) - if edf.data.notnull().any(): - gr.data = [edf] - else: - continue - del store - + groups = load_data_in_memory(config=config, groups=groups) + + for name, gr in groups.items(): for ed in gr.data: - gname = ed.out_path.split(".")[0] + ".MaskPrediction" new_process = process_mask_prediction.with_options( task_run_name=gname, name=gname, retries=3 @@ -146,6 +92,28 @@ def echodataflow_mask_prediction( return groups +@task +@echodataflow() +def process_mask_prediction_tensor( + groups: Dict[str, Group], config: Dataset, stage: Stage, prev_stage: Optional[Stage] +): + working_dir = get_working_dir(stage=stage, config=config) + + model = load_model(stage=stage, config=config) + + groups = load_data_in_memory(config=config, groups=groups) + + for name, gr in groups.items(): + results = [] + for ed in gr.data: + pmpu = process_mask_prediction_util.with_options(task_run_name=ed.filename) + results.append(pmpu.fn(ed, config, stage, working_dir, model)) + + groups[name].data = results + + return groups + + @task @echodataflow() def process_mask_prediction( @@ -221,62 +189,23 @@ def process_mask_prediction( eflogging=config.logging, ) - ed_list = get_zarr_list.fn(transect_data=ed, storage_options=config.output.storage_options_dict) + mvbs_slice = get_zarr_list.fn(transect_data=ed, storage_options=config.output.storage_options_dict)[0] - ed_list[0] = ed_list[0].sel(depth=slice(None, 590)) + mvbs_slice = mvbs_slice.sel(depth=slice(None, 590)) log_util.log( msg={"msg": 'Computing mask_prediction', "mod_name": __file__, "func_name": file_name}, use_dask=stage.options["use_dask"], eflogging=config.logging, ) - - bottom_offset = stage.external_params.get('bottom_offset', 1.0) - temperature = stage.external_params.get('temperature', 0.5) - freq_wanted = stage.external_params.get('freq_wanted', [120000, 38000, 18000]) - - ch_wanted = [int((np.abs(ed_list[0]["frequency_nominal"]-freq)).argmin()) for freq in freq_wanted] - - log_util.log( - msg={"msg": f"Channel order {ch_wanted}", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - # Ensure dims sequence is (channel, depth, ping_time) - # and channel sequence is 120, 38, 18 kHz - mvbs_slice = ( - ed_list[0] - .transpose("channel", "depth", "ping_time") - .isel(channel=ch_wanted) - ) - mvbs_tensor = torch.tensor(mvbs_slice['Sv'].values, dtype=torch.float32) + if ed.data_ref is not None: + input_tensor = ed.data_ref + else: + mvbs_slice, input_tensor = convert_to_tensor(combined_ds=mvbs_slice, freq_wanted=stage.external_params.get('freq_wanted', [120000, 38000, 18000]), config=config) - da_MVBS_tensor = torch.clip( - mvbs_tensor.clone().detach().to(torch.float16), - min=-70, - max=-36, - ) - - log_util.log( - msg={"msg": f"converted and clipped tensor", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - # Replace NaN values with min Sv - da_MVBS_tensor[torch.isnan(da_MVBS_tensor)] = -70 - - MVBS_tensor_normalized = ( - (da_MVBS_tensor - (-70.0)) / (-36.0 - (-70.0)) * 255.0 - ) - input_tensor = 
MVBS_tensor_normalized.unsqueeze(0).float() + temperature = stage.external_params.get('temperature', 0.5) - log_util.log( - msg={"msg": f"Normalized tensor", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) score_tensor = model(input_tensor).detach().squeeze(0) log_util.log( @@ -284,8 +213,6 @@ def process_mask_prediction( use_dask=stage.options["use_dask"], eflogging=config.logging, ) - - # dims = stage.external_params.get('dims', ['ping_time', 'depth']) dims = { 'species': [ "background", "hake"], @@ -327,28 +254,6 @@ def process_mask_prediction( storage_options=config.output.storage_options_dict, ) - else: - log_util.log( - msg={ - "msg": f"Skipped processing {file_name}. File found in the destination folder. To replace or reprocess set `use_offline` flag to False", - "mod_name": __file__, - "func_name": file_name, - }, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - log_util.log( - msg={"msg": f" ---- Exiting ----", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - ed.stages["mask"] = out_zarr - ed.error = ErrorObject(errorFlag=False) - ed.stages[stage.name] = out_zarr - - if mvbs_slice: - slice_zarr = get_out_zarr( group=stage.options.get("group", True), working_dir=working_dir, @@ -366,186 +271,13 @@ def process_mask_prediction( ed.out_path = slice_zarr ed.stages[stage.name] = slice_zarr - - ed.data = None - del da_mask_hake - del da_score_hake - del softmax_score_tensor - del score_tensor - del input_tensor - del mvbs_slice - - return ed - except Exception as e: - log_util.log( - msg={"msg": "", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - error=e - ) - ed.error = ErrorObject(errorFlag=True, error_desc=str(e)) - return ed - -def assemble_da(data_array, dims): - da = xr.DataArray( - data_array, dims=dims.keys() - ) - da = da.assign_coords(dims - ) - return da - - -@task -@echodataflow() -def process_mask_prediction_tensor( - groups: Dict[str, Group], config: Dataset, stage: Stage, prev_stage: Optional[Stage] -): - working_dir = get_working_dir(stage=stage, config=config) - - for name, gr in groups.items(): - results = [] - for ed in gr.data: - if ed.data is not None: - log_util.log( - msg={"msg": "ed data is not none", "mod_name": __file__, "func_name": "Mask"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - else: - log_util.log( - msg={"msg": "ed data is none", "mod_name": __file__, "func_name": "Mask"}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - pmpu = process_mask_prediction_util.with_options(task_run_name=ed.filename) - results.append(pmpu.fn(ed, config, stage, working_dir)) - - groups[name].data = results - - return groups - -@task -def process_mask_prediction_util(ed: EchodataflowObject, config: Dataset, stage: Stage, working_dir: str): - file_name = ed.filename + "_mask.zarr" - - try: - log_util.log( - msg={"msg": " ---- Entering ----", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - out_zarr = get_out_zarr( - group=stage.options.get("group", True), - working_dir=working_dir, - transect=ed.group_name, - file_name=file_name, - storage_options=config.output.storage_options_dict, - ) - - log_util.log( - msg={ - "msg": f"Processing file, output will be at {out_zarr}", - "mod_name": __file__, - "func_name": file_name, - }, - 
use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - if ( - stage.options.get("use_offline") == False - or isFile(out_zarr, config.output.storage_options_dict) == False - ): - log_util.log( - msg={ - "msg": f"File not found in the destination folder / use_offline flag is False", - "mod_name": __file__, - "func_name": file_name, - }, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - log_util.log( - msg={"msg": 'Computing mask_prediction', "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - bottom_offset = stage.external_params.get('bottom_offset', 1.0) - temperature = stage.external_params.get('temperature', 0.5) - - model_path = f"/home/exouser/hake_data/model/backup_model_weights/binary_hake_model_{bottom_offset}m_bottom_offset_1.0m_depth_2017_2019_ver_1.ckpt" - - # Load binary hake models with weights - model = BinaryHakeModel("placeholder_experiment_name", - Path("placeholder_score_tensor_dir"), - "placeholder_tensor_log_dir", 0).eval() - model.load_state_dict(torch.load( - stage.external_params.get('model_path', model_path) - )["state_dict"]) - - - mvbs_tensor = ed.data # tensor - - da_MVBS_tensor = torch.clip( - mvbs_tensor.clone().detach().to(torch.float16), - min=-70, - max=-36, - ) - - # Replace NaN values with min Sv - da_MVBS_tensor[torch.isnan(da_MVBS_tensor)] = -70 - - MVBS_tensor_normalized = ( - (da_MVBS_tensor - (-70.0)) / (-36.0 - (-70.0)) * 255.0 - ) - input_tensor = MVBS_tensor_normalized.unsqueeze(0).float() - - score_tensor = model(input_tensor).detach().squeeze(0) - - log_util.log( - msg={"msg": f"Converting to Zarr", "mod_name": __file__, "func_name": file_name}, - use_dask=stage.options["use_dask"], - eflogging=config.logging, - ) - - dims = {'species': [ "background", "hake"], 'depth': ed.data_ref["depth"].values, 'ping_time': ed.data_ref["ping_time"].values} - - da_score_hake = assemble_da(score_tensor.numpy(), dims=dims) - - softmax_score_tensor = torch.nn.functional.softmax( - score_tensor / temperature, dim=0 - ) - - dims.pop('species') - da_softmax_hake = assemble_da(softmax_score_tensor.numpy()[1,:,:], dims=dims) - - da_mask_hake = assemble_da(da_softmax_hake.where(da_softmax_hake > stage.options.get('th_softmax', 0.9)), dims=dims) - - score_zarr = get_out_zarr( - group=True, - working_dir=working_dir, - transect="Hake_Score", - file_name=ed.filename + "_score_hake.zarr", - storage_options=config.output.storage_options_dict, - ) - - da_score_hake.to_zarr( - store=score_zarr, - mode="w", - consolidated=True, - storage_options=config.output.storage_options_dict, - ) - - # Get mask from score - da_mask_hake.to_zarr( - store=out_zarr, - mode="w", - consolidated=True, - storage_options=config.output.storage_options_dict, - ) + del mvbs_slice + del da_mask_hake + del da_score_hake + del softmax_score_tensor + del score_tensor + del input_tensor else: log_util.log( @@ -566,32 +298,8 @@ def process_mask_prediction_util(ed: EchodataflowObject, config: Dataset, stage: ed.stages["mask"] = out_zarr ed.error = ErrorObject(errorFlag=False) ed.stages[stage.name] = out_zarr - ed.data = None - slice_zarr = get_out_zarr( - group=stage.options.get("group", True), - working_dir=working_dir, - transect=ed.group_name, - file_name=ed.filename+"_MVBS_Slice.zarr", - storage_options=config.output.storage_options_dict, - ) - - ed.data_ref.to_zarr( - store=slice_zarr, - mode="w", - consolidated=True, - storage_options=config.output.storage_options_dict, - ) - ed.out_path = 
slice_zarr - ed.data_ref = None - - ed.stages[stage.name] = slice_zarr - - del da_mask_hake - del da_score_hake - del softmax_score_tensor - del score_tensor - del input_tensor + except Exception as e: log_util.log( msg={"msg": "", "mod_name": __file__, "func_name": file_name}, @@ -599,8 +307,8 @@ def process_mask_prediction_util(ed: EchodataflowObject, config: Dataset, stage: eflogging=config.logging, error=e ) - ed.error = ErrorObject(errorFlag=True, error_desc=str(e)) + ed.error = ErrorObject(errorFlag=True, error_desc=str(e)) + finally: ed.data = None ed.data_ref = None - finally: - return ed + return ed \ No newline at end of file From eae7cf928be29919712396f7d0d6e443dba29e98 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:12:32 -0700 Subject: [PATCH 07/13] Fixed Import Error --- echodataflow/stages/echodataflow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/echodataflow/stages/echodataflow.py b/echodataflow/stages/echodataflow.py index ed88e5b..372fcd4 100644 --- a/echodataflow/stages/echodataflow.py +++ b/echodataflow/stages/echodataflow.py @@ -47,7 +47,6 @@ ) import echopype as ep -from echodataflow.utils.config_utils import load_block from echodataflow.stages.echodataflow_trigger import echodataflow_trigger from echodataflow.utils.filesystem_utils import handle_storage_options From 159c723e14f031a797d1801f3ec9cfdb0c58861f Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:15:36 -0700 Subject: [PATCH 08/13] Fixed Import Errors --- echodataflow/__init__.py | 3 ++- echodataflow/utils/config_utils.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/echodataflow/__init__.py b/echodataflow/__init__.py index d02aaaf..cc75207 100644 --- a/echodataflow/__init__.py +++ b/echodataflow/__init__.py @@ -7,8 +7,9 @@ echodataflow_create_prefect_profile, echodataflow_start, get_active_profile, load_profile) -from .utils.config_utils import extract_fs, glob_url, load_block +from .utils.config_utils import extract_fs, glob_url from .utils.file_utils import get_ed_list, get_last_run_output, get_zarr_list +from .utils.filesystem_utils import load_block from .docker_trigger import docker_trigger try: diff --git a/echodataflow/utils/config_utils.py b/echodataflow/utils/config_utils.py index 8b99404..19d378b 100644 --- a/echodataflow/utils/config_utils.py +++ b/echodataflow/utils/config_utils.py @@ -22,8 +22,6 @@ raw_url_file: Optional[str] = None, json_storage_options: StorageOptions = None ) -> List[List[Dict[str, Any]]] - get_storage_options(storage_options: Block = None) -> Dict[str, Any] - load_block(name: str = None, type: StorageType = None) Author: Soham Butala Email: sbutala@uw.edu From e438a74dd72193f6537480a12ecc062ed510d0a3 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:31:07 -0700 Subject: [PATCH 09/13] External FileSystem Unit test --- .../tests/unit/TestHandleStorageOptions.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 echodataflow/tests/unit/TestHandleStorageOptions.py diff --git a/echodataflow/tests/unit/TestHandleStorageOptions.py b/echodataflow/tests/unit/TestHandleStorageOptions.py new file mode 100644 index 0000000..4e9ee1c --- /dev/null +++ b/echodataflow/tests/unit/TestHandleStorageOptions.py @@ -0,0 +1,60 @@ +import pytest +from unittest.mock import Mock, patch +from echodataflow.utils.filesystem_utils import handle_storage_options +from echodataflow.models.datastore import StorageOptions +from prefect_aws import AwsCredentials + +# Adjust Mocks setup +Block = 
Mock(return_value = AwsCredentials(aws_access_key_id="test", aws_secret_access_key='password')) +MockStorageOptions = Mock(return_value = StorageOptions()) +load_block = Mock() + +@pytest.fixture +def setup_blocks(): + block = Block() + storage_options = MockStorageOptions() + return block, storage_options + +class TestHandleStorageOptions: + def test_none(self): + """Test handling with no parameters.""" + assert handle_storage_options() == {} + + def test_empty_dict(self): + """Test handling with an empty dictionary.""" + assert handle_storage_options({}) == {} + + def test_anon_dict(self): + """Test handling with anonymous dictionary.""" + assert handle_storage_options({'anon': True}) == {'anon': True} + + def test_block(self, setup_blocks): + block, _ = setup_blocks + expected_dict = {'key': 'test', 'secret': 'password'} + assert handle_storage_options(block) == expected_dict + + def test_anonymous_storage_options(self, setup_blocks): + _, storage_options = setup_blocks + storage_options.anon = True + assert handle_storage_options(storage_options) == {"anon": True} + + @patch('echodataflow.utils.filesystem_utils.load_block') + def test_dict_with_block_name(self, mock_load_block): + storage_dict = {'block_name': 'echoflow-aws-credentials', 'type': 'AWS'} + expected_dict = {'key': 'test', 'secret': 'password'} + block = AwsCredentials(aws_access_key_id="test", aws_secret_access_key='password') + mock_load_block.return_value = block + assert handle_storage_options(storage_dict) == expected_dict + mock_load_block.assert_called_with(name="echoflow-aws-credentials", type="AWS") + + @patch('echodataflow.utils.filesystem_utils.load_block') + def test_storage_options(self, mock_load_block, setup_blocks): + _, storage_options = setup_blocks + storage_options.anon = False + storage_options.block_name = "echoflow-aws-credentials" + storage_options.type = "AWS" + expected_dict = {'key': 'test', 'secret': 'password'} + block = AwsCredentials(aws_access_key_id="test", aws_secret_access_key='password') + mock_load_block.return_value = block + assert handle_storage_options(storage_options) == expected_dict + mock_load_block.assert_called_with(name="echoflow-aws-credentials", type="AWS") \ No newline at end of file From 5b97200bea8e639cbe5503d08abdf030e0a7ab7c Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Wed, 11 Sep 2024 14:34:17 -0700 Subject: [PATCH 10/13] Parameter name change --- echodataflow/extensions/file_monitor.py | 2 +- echodataflow/utils/filesystem_utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/echodataflow/extensions/file_monitor.py b/echodataflow/extensions/file_monitor.py index 19c5f38..5e383a4 100644 --- a/echodataflow/extensions/file_monitor.py +++ b/echodataflow/extensions/file_monitor.py @@ -115,7 +115,7 @@ def file_monitor( try: edfrun = load_block( name=block_name, - type=StorageType.EDFRUN, + stype=StorageType.EDFRUN, ) except Exception as e: print(e) diff --git a/echodataflow/utils/filesystem_utils.py b/echodataflow/utils/filesystem_utils.py index e6bc82d..0b8f10d 100644 --- a/echodataflow/utils/filesystem_utils.py +++ b/echodataflow/utils/filesystem_utils.py @@ -28,18 +28,18 @@ def _handle_block(block: Block) -> Dict: def _handle_dict_options(options: Dict[str, Any]) -> Dict: if "block_name" in options: - block = load_block(name=options["block_name"], type=options.get("type", None)) + block = load_block(name=options["block_name"], stype=options.get("type", None)) return get_storage_options(block) return options if options else {} def 
_handle_storage_options_class(options: StorageOptions) -> Dict: if not options.anon: - block = load_block(name=options.block_name, type=options.type) + block = load_block(name=options.block_name, stype=options.type) return get_storage_options(block) return {"anon": options.anon} def _handle_baseconfig_options_class(options: BaseConfig) -> Dict: - block = load_block(name=options.name, type=options.type) + block = load_block(name=options.name, stype=options.type) return dict(block) def _handle_default(options: Dict[str, Any]): @@ -78,7 +78,7 @@ def load_block(name: str, stype: StorageType): Parameters: name (str, optional): The name of the block to load. - type (StorageType, optional): The type of the block to load. + stype (StorageType, optional): The type of the block to load. Returns: block: The loaded block. @@ -87,7 +87,7 @@ def load_block(name: str, stype: StorageType): ValueError: If name or type is not provided. Example: - loaded_aws_credentials = load_block(name="my-aws-creds", type=StorageType.AWS) + loaded_aws_credentials = load_block(name="my-aws-creds", stype=StorageType.AWS) """ if name is None or stype is None: raise ValueError("Cannot load block without name or type") From d90590740dfad78608c192ea06a027ffe3706853 Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Fri, 13 Sep 2024 14:23:25 -0700 Subject: [PATCH 11/13] EchodataflowConfig Pydantic error fix --- echodataflow/models/echodataflow_config.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/echodataflow/models/echodataflow_config.py b/echodataflow/models/echodataflow_config.py index 1a3790c..9fb302a 100644 --- a/echodataflow/models/echodataflow_config.py +++ b/echodataflow/models/echodataflow_config.py @@ -18,11 +18,9 @@ Email: sbutala@uw.edu Date: August 22, 2023 """ -import json from typing import Any, Dict, List, Optional from prefect.blocks.core import Block -from pydantic import SecretStr from .datastore import StorageType @@ -43,9 +41,9 @@ class EchodataflowPrefectConfig(Block): class Config: arbitrary_types_allowed = True - prefect_account_id: str = None - prefect_api_key: str = None - prefect_workspace_id: str = None + prefect_account_id: Optional[str] = None + prefect_api_key: Optional[str] = None + prefect_workspace_id: Optional[str] = None profile_name: str = None From 22db94c16d7e2deb32d6aad85ec24eb55341909c Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Fri, 13 Sep 2024 14:36:18 -0700 Subject: [PATCH 12/13] Test non-dask flow --- echodataflow/tests/flow_tests/MVBS_pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/echodataflow/tests/flow_tests/MVBS_pipeline.yaml b/echodataflow/tests/flow_tests/MVBS_pipeline.yaml index ecd073e..4fc67f1 100644 --- a/echodataflow/tests/flow_tests/MVBS_pipeline.yaml +++ b/echodataflow/tests/flow_tests/MVBS_pipeline.yaml @@ -1,6 +1,4 @@ active_recipe: MVBS_pipeline -use_local_dask: true -n_workers: 2 pipeline: - recipe_name: MVBS_pipeline stages: From 9c8e67f8926317a3cea3727b49ef4956b875c1cc Mon Sep 17 00:00:00 2001 From: Soham Butala Date: Mon, 16 Sep 2024 09:58:15 -0700 Subject: [PATCH 13/13] File rename --- .../{TestHandleStorageOptions.py => test_filesystem_utils.py} | 0 pyproject.toml | 4 +++- 2 files changed, 3 insertions(+), 1 deletion(-) rename echodataflow/tests/unit/{TestHandleStorageOptions.py => test_filesystem_utils.py} (100%) diff --git a/echodataflow/tests/unit/TestHandleStorageOptions.py b/echodataflow/tests/unit/test_filesystem_utils.py similarity index 100% rename from 
echodataflow/tests/unit/TestHandleStorageOptions.py rename to echodataflow/tests/unit/test_filesystem_utils.py diff --git a/pyproject.toml b/pyproject.toml index fb42a10..f9468ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,5 +30,7 @@ include_namespace_packages = true omit = [ "*/tests/*", "*/__init__.py", - "*/docs/*" + "*/docs/*", + "*/deployment/*", + "*/setup.py", ] \ No newline at end of file
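
Usage note (not part of the patches above): a minimal sketch of how the refactored storage-option handling is intended to be called once this series is applied. It assumes the helpers live where the diffs place them, in echodataflow.utils.filesystem_utils, and reuses the block name from the unit test, "echoflow-aws-credentials", purely as an illustration; a block with that name must already be registered in Prefect for the last two calls to succeed.

    from echodataflow.models.datastore import StorageType
    from echodataflow.utils.filesystem_utils import handle_storage_options, load_block

    # No options, or an empty dict, resolve to an empty fsspec option set.
    assert handle_storage_options() == {}
    assert handle_storage_options({}) == {}

    # A plain dict without a block reference is passed through unchanged
    # (anonymous access in this example).
    opts = handle_storage_options({"anon": True})   # -> {"anon": True}

    # A dict naming a Prefect block is resolved via load_block(...) and
    # converted into fsspec-style credentials, e.g. {"key": ..., "secret": ...}.
    opts = handle_storage_options(
        {"block_name": "echoflow-aws-credentials", "type": "AWS"}
    )

    # load_block itself now takes the renamed keyword stype (PATCH 10/13).
    creds = load_block(name="echoflow-aws-credentials", stype=StorageType.AWS)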