Skip to content

Commit

Permalink
Merge pull request #97 from Sage-Bionetworks/bwmac/IBCDPE-527/syn_required
Browse files Browse the repository at this point in the history

[IBCDPE-527] Makes `syn` always required
  • Loading branch information
BWMac authored Nov 22, 2023
2 parents 47dc9c9 + db265af commit 8112d98
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 57 deletions.
13 changes: 7 additions & 6 deletions src/agoradatatools/etl/extract.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from . import utils
import pandas as pd

import synapseclient

def get_entity_as_df(syn_id: str, source: str, syn=None) -> pd.DataFrame:

def get_entity_as_df(
syn_id: str, source: str, syn: synapseclient.Synapse
) -> pd.DataFrame:
"""
1. Creates and logs into synapseclient session (if not provided)
2. Gets synapse entity from id string (and version number if provided)
Expand All @@ -11,13 +14,11 @@ def get_entity_as_df(syn_id: str, source: str, syn=None) -> pd.DataFrame:
Args:
syn_id (str): Synapse ID of entity to be loaded to df
source (str): the source of the data to be loaded to df
syn (synapseclient.Synapse, optional): synapseclient.Synapse session. Defaults to None.
syn (synapseclient.Synapse): synapseclient.Synapse session object.
Returns:
pd.DataFrame: data frame generated from data source provided
"""
if syn is None:
syn = utils._login_to_synapse()

syn_id_version = syn_id.split(".")
synapse_id = syn_id_version[0]
Expand Down Expand Up @@ -92,7 +93,7 @@ def read_tsv_into_df(tsv_path: str) -> pd.DataFrame:
return pd.read_csv(tsv_path, sep="\t")


def read_table_into_df(table_id: str, syn) -> pd.DataFrame:
def read_table_into_df(table_id: str, syn: synapseclient.Synapse) -> pd.DataFrame:
"""
Reads a Synapse table into a dataframe.
Expand Down
10 changes: 2 additions & 8 deletions src/agoradatatools/etl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import pandas as pd
from synapseclient import Activity, File, Synapse

from . import utils


class NumpyEncoder(json.JSONEncoder):
"""Special json encoder for numpy types"""
Expand Down Expand Up @@ -83,9 +81,7 @@ def remove_non_values(d: dict) -> dict:
return cleaned_dict


def load(
file_path: str, provenance: list, destination: str, syn: Synapse = None
) -> tuple:
def load(file_path: str, provenance: list, destination: str, syn: Synapse) -> tuple:
"""Reads file to be loaded into Synapse
:param syn: synapse object
:return: synapse id of the file loaded into Synapse. Returns None if it
Expand All @@ -95,14 +91,12 @@ def load(
file_path (str): Path of the file to be loaded into Synapse
provenance (list): Array of files that originate the one being loaded
destination (str): Location where the file should be loaded in Synapse
syn (synapseclient.Synapse, optional): synapseclient session. Defaults to None.
syn (synapseclient.Synapse): synapseclient session.
Returns:
tuple: Returns a tuple of the name fo the file and the version number.
"""

if syn is None:
syn = utils._login_to_synapse()
activity = Activity(used=provenance)
file = File(file_path, parent=destination)
file = syn.store(file, activity=activity, forceVersion=False)
Expand Down
23 changes: 11 additions & 12 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,14 @@ def process_dataset(
return syn_obj


def create_data_manifest(parent=None, syn=None) -> DataFrame:
def create_data_manifest(
syn: synapseclient.Synapse, parent: synapseclient.Folder = None
) -> DataFrame:
"""Creates data manifest (dataframe) that has the IDs and version numbers of child synapse folders
Args:
syn (synapseclient.Synapse): Synapse client session.
parent (synapseclient.Folder/str, optional): synapse folder or synapse id pointing to parent synapse folder. Defaults to None.
syn (synapseclient.Synapse, optional): Synapse client session. Defaults to None.
Returns:
DataFrame: Dataframe containing IDs and version numbers of folders within the parent directory
Expand All @@ -151,9 +153,6 @@ def create_data_manifest(parent=None, syn=None) -> DataFrame:
if not parent:
return None

if not syn:
syn = utils._login_to_synapse()

folders = syn.getChildren(parent)
folder = [folders]
folder = [
Expand All @@ -164,17 +163,17 @@ def create_data_manifest(parent=None, syn=None) -> DataFrame:


@log_time(func_name="process_all_files", logger=logger)
def process_all_files(config_path: str = None, syn=None):
def process_all_files(
syn: synapseclient.Synapse,
config_path: str = None,
):
"""This function will read through the entire configuration and process each file listed.
Args:
syn (synapseclient.Session): Synapse client session
config_path (str, optional): path to configuration file. Defaults to None.
syn (synapseclient.Session, optional): Synapse client session. Defaults to None.
"""

# if not syn:
# syn = utils._login_to_synapse()

if config_path:
config = utils._get_config(config_path=config_path)
else:
Expand Down Expand Up @@ -203,7 +202,7 @@ def process_all_files(config_path: str = None, syn=None):

if not error_list:
# create manifest if there are no errors
manifest_df = create_data_manifest(parent=destination, syn=syn)
manifest_df = create_data_manifest(syn=syn, parent=destination)
manifest_path = load.df_to_csv(
df=manifest_df, staging_path=staging_path, filename="data_manifest.csv"
)
Expand Down Expand Up @@ -240,7 +239,7 @@ def process(
auth_token: str = synapse_auth_opt,
):
syn = utils._login_to_synapse(token=auth_token)
process_all_files(config_path=config_path, syn=syn)
process_all_files(syn=syn, config_path=config_path)


if __name__ == "__main__":
Expand Down
12 changes: 1 addition & 11 deletions tests/load/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,7 @@ def setup_method(self, syn):
def teardown_method(self):
mock.patch.stopall()

def test_load_syn_is_none(self):
test_tuple = load.load(
file_path="fake/path/to/fake/file",
provenance=["syn1111111", "syn1111112"],
destination="syn1111113",
syn=None,
)
self.patch_syn_login.assert_called_once()
assert test_tuple == ("syn1111114", 1)

def test_load_syn_is_not_none(self, syn):
def test_load(self, syn):
test_tuple = load.load(
file_path="fake/path/to/fake/file",
provenance=["syn1111111", "syn1111112"],
Expand Down
11 changes: 1 addition & 10 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,6 @@ def test_read_json_into_df():
assert isinstance(df, pd.DataFrame)


# test if utils._login_to_synapse is called when syn=None
def test_get_entity_as_df_syn_is_none(syn):
with patch.object(
utils, "_login_to_synapse", return_value=syn
) as patch_login_to_synapse:
extract.get_entity_as_df(syn_id="syn11111111", source="table", syn=None)
patch_login_to_synapse.assert_called_once()


@pytest.mark.parametrize(
"syn_id, version", [("syn1111111", None), ("syn1111111.1", "1")]
)
Expand All @@ -113,7 +104,7 @@ def test_get_entity_as_df_with_version(syn, syn_id, version):


# test raise if is not supported
def test_get_entity_as_df__not_supported(syn):
def test_get_entity_as_df_not_supported(syn):
with pytest.raises(ValueError, match="File type not *"):
extract.get_entity_as_df(syn_id="syn1111111", source="abc", syn=syn)

Expand Down
16 changes: 6 additions & 10 deletions tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,11 @@ def teardown_method(self):
mock.patch.stopall()

def test_create_data_manifest_parent_none(self, syn: Any):
assert process.create_data_manifest(parent=None, syn=syn) is None
assert process.create_data_manifest(syn=syn, parent=None) is None
self.patch_syn_login.assert_not_called()

def test_create_data_manifest_syn_none(self):
process.create_data_manifest(parent="syn1111111", syn=None)
self.patch_syn_login.assert_called_once()

def test_create_data_manifest_no_none(self, syn: Any):
df = process.create_data_manifest(parent="syn1111111", syn=syn)
df = process.create_data_manifest(syn=syn, parent="syn1111111")
self.patch_get_children.assert_called_once_with("syn1111111")
self.patch_syn_login.assert_not_called()
assert isinstance(df, pd.DataFrame)
Expand Down Expand Up @@ -202,21 +198,21 @@ def teardown_method(self):
mock.patch.stopall()

def test_process_all_files_config_path(self, syn: Any):
process.process_all_files(config_path="path/to/config", syn=syn)
process.process_all_files(syn=syn, config_path="path/to/config")
self.patch_get_config.assert_called_once_with(config_path="path/to/config")

def test_process_all_files_no_config_path(self, syn: Any):
process.process_all_files(config_path=None, syn=syn)
process.process_all_files(syn=syn, config_path=None)
self.patch_get_config.assert_called_once_with()

def test_process_all_files_process_dataset_fails(self, syn: Any):
with pytest.raises(ADTDataProcessingError):
self.patch_process_dataset.side_effect = Exception
process.process_all_files(config_path="path/to/config", syn=syn)
process.process_all_files(syn=syn, config_path="path/to/config")
self.patch_create_data_manifest.assert_not_called()

def test_process_all_files_full(self, syn: Any):
process.process_all_files(config_path=None, syn=syn)
process.process_all_files(syn=syn, config_path=None)
self.patch_process_dataset.assert_any_call(
dataset_obj={"a": {"b": "c"}}, staging_path="./staging", syn=syn
)
Expand Down

0 comments on commit 8112d98

Please sign in to comment.