Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IBCDPE-527] Makes syn always required #97

Merged
merged 6 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions src/agoradatatools/etl/extract.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from . import utils
import pandas as pd

import synapseclient

def get_entity_as_df(syn_id: str, source: str, syn=None) -> pd.DataFrame:

def get_entity_as_df(
syn_id: str, source: str, syn: synapseclient.Synapse
) -> pd.DataFrame:
"""
1. Creates and logs into synapseclient session (if not provided)
2. Gets synapse entity from id string (and version number if provided)
Expand All @@ -11,13 +14,11 @@ def get_entity_as_df(syn_id: str, source: str, syn=None) -> pd.DataFrame:
Args:
syn_id (str): Synapse ID of entity to be loaded to df
source (str): the source of the data to be loaded to df
syn (synapseclient.Synapse, optional): synapseclient.Synapse session. Defaults to None.
syn (synapseclient.Synapse): synapseclient.Synapse session object.
Returns:
pd.DataFrame: data frame generated from data source provided
"""
if syn is None:
syn = utils._login_to_synapse()

syn_id_version = syn_id.split(".")
synapse_id = syn_id_version[0]
Expand Down Expand Up @@ -92,7 +93,7 @@ def read_tsv_into_df(tsv_path: str) -> pd.DataFrame:
return pd.read_csv(tsv_path, sep="\t")


def read_table_into_df(table_id: str, syn) -> pd.DataFrame:
def read_table_into_df(table_id: str, syn: synapseclient.Synapse) -> pd.DataFrame:
"""
Reads a Synapse table into a dataframe.
Expand Down
10 changes: 2 additions & 8 deletions src/agoradatatools/etl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import pandas as pd
from synapseclient import Activity, File, Synapse

from . import utils


class NumpyEncoder(json.JSONEncoder):
"""Special json encoder for numpy types"""
Expand Down Expand Up @@ -83,9 +81,7 @@ def remove_non_values(d: dict) -> dict:
return cleaned_dict


def load(
file_path: str, provenance: list, destination: str, syn: Synapse = None
) -> tuple:
def load(file_path: str, provenance: list, destination: str, syn: Synapse) -> tuple:
"""Reads file to be loaded into Synapse
:param syn: synapse object
:return: synapse id of the file loaded into Synapse. Returns None if it
Expand All @@ -95,14 +91,12 @@ def load(
file_path (str): Path of the file to be loaded into Synapse
provenance (list): Array of files that originate the one being loaded
destination (str): Location where the file should be loaded in Synapse
syn (synapseclient.Synapse, optional): synapseclient session. Defaults to None.
syn (synapseclient.Synapse): synapseclient session.
Returns:
tuple: Returns a tuple of the name of the file and the version number.
"""

if syn is None:
syn = utils._login_to_synapse()
activity = Activity(used=provenance)
file = File(file_path, parent=destination)
file = syn.store(file, activity=activity, forceVersion=False)
Expand Down
23 changes: 11 additions & 12 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,14 @@ def process_dataset(
return syn_obj


def create_data_manifest(parent=None, syn=None) -> DataFrame:
def create_data_manifest(
syn: synapseclient.Synapse, parent: synapseclient.Folder = None
) -> DataFrame:
"""Creates data manifest (dataframe) that has the IDs and version numbers of child synapse folders
Args:
syn (synapseclient.Synapse): Synapse client session.
parent (synapseclient.Folder/str, optional): synapse folder or synapse id pointing to parent synapse folder. Defaults to None.
syn (synapseclient.Synapse, optional): Synapse client session. Defaults to None.
Returns:
DataFrame: Dataframe containing IDs and version numbers of folders within the parent directory
Expand All @@ -151,9 +153,6 @@ def create_data_manifest(parent=None, syn=None) -> DataFrame:
if not parent:
return None

if not syn:
syn = utils._login_to_synapse()

folders = syn.getChildren(parent)
folder = [folders]
folder = [
Expand All @@ -164,17 +163,17 @@ def create_data_manifest(parent=None, syn=None) -> DataFrame:


@log_time(func_name="process_all_files", logger=logger)
def process_all_files(config_path: str = None, syn=None):
def process_all_files(
syn: synapseclient.Synapse,
config_path: str = None,
):
"""This function will read through the entire configuration and process each file listed.
Args:
syn (synapseclient.Session): Synapse client session
config_path (str, optional): path to configuration file. Defaults to None.
syn (synapseclient.Session, optional): Synapse client session. Defaults to None.
"""

# if not syn:
# syn = utils._login_to_synapse()

if config_path:
config = utils._get_config(config_path=config_path)
else:
Expand Down Expand Up @@ -203,7 +202,7 @@ def process_all_files(config_path: str = None, syn=None):

if not error_list:
# create manifest if there are no errors
manifest_df = create_data_manifest(parent=destination, syn=syn)
manifest_df = create_data_manifest(syn=syn, parent=destination)
manifest_path = load.df_to_csv(
df=manifest_df, staging_path=staging_path, filename="data_manifest.csv"
)
Expand Down Expand Up @@ -240,7 +239,7 @@ def process(
auth_token: str = synapse_auth_opt,
):
syn = utils._login_to_synapse(token=auth_token)
process_all_files(config_path=config_path, syn=syn)
process_all_files(syn=syn, config_path=config_path)


if __name__ == "__main__":
Expand Down
12 changes: 1 addition & 11 deletions tests/load/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,7 @@ def setup_method(self, syn):
def teardown_method(self):
mock.patch.stopall()

def test_load_syn_is_none(self):
test_tuple = load.load(
file_path="fake/path/to/fake/file",
provenance=["syn1111111", "syn1111112"],
destination="syn1111113",
syn=None,
)
self.patch_syn_login.assert_called_once()
assert test_tuple == ("syn1111114", 1)

def test_load_syn_is_not_none(self, syn):
def test_load(self, syn):
test_tuple = load.load(
file_path="fake/path/to/fake/file",
provenance=["syn1111111", "syn1111112"],
Expand Down
11 changes: 1 addition & 10 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,6 @@ def test_read_json_into_df():
assert isinstance(df, pd.DataFrame)


# test if utils._login_to_synapse is called when syn=None
def test_get_entity_as_df_syn_is_none(syn):
with patch.object(
utils, "_login_to_synapse", return_value=syn
) as patch_login_to_synapse:
extract.get_entity_as_df(syn_id="syn11111111", source="table", syn=None)
patch_login_to_synapse.assert_called_once()


@pytest.mark.parametrize(
"syn_id, version", [("syn1111111", None), ("syn1111111.1", "1")]
)
Expand All @@ -113,7 +104,7 @@ def test_get_entity_as_df_with_version(syn, syn_id, version):


# test raise if is not supported
def test_get_entity_as_df__not_supported(syn):
def test_get_entity_as_df_not_supported(syn):
with pytest.raises(ValueError, match="File type not *"):
extract.get_entity_as_df(syn_id="syn1111111", source="abc", syn=syn)

Expand Down
16 changes: 6 additions & 10 deletions tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,11 @@ def teardown_method(self):
mock.patch.stopall()

def test_create_data_manifest_parent_none(self, syn: Any):
assert process.create_data_manifest(parent=None, syn=syn) is None
assert process.create_data_manifest(syn=syn, parent=None) is None
self.patch_syn_login.assert_not_called()

def test_create_data_manifest_syn_none(self):
process.create_data_manifest(parent="syn1111111", syn=None)
self.patch_syn_login.assert_called_once()

def test_create_data_manifest_no_none(self, syn: Any):
df = process.create_data_manifest(parent="syn1111111", syn=syn)
df = process.create_data_manifest(syn=syn, parent="syn1111111")
self.patch_get_children.assert_called_once_with("syn1111111")
self.patch_syn_login.assert_not_called()
assert isinstance(df, pd.DataFrame)
Expand Down Expand Up @@ -202,21 +198,21 @@ def teardown_method(self):
mock.patch.stopall()

def test_process_all_files_config_path(self, syn: Any):
process.process_all_files(config_path="path/to/config", syn=syn)
process.process_all_files(syn=syn, config_path="path/to/config")
self.patch_get_config.assert_called_once_with(config_path="path/to/config")

def test_process_all_files_no_config_path(self, syn: Any):
process.process_all_files(config_path=None, syn=syn)
process.process_all_files(syn=syn, config_path=None)
self.patch_get_config.assert_called_once_with()

def test_process_all_files_process_dataset_fails(self, syn: Any):
with pytest.raises(ADTDataProcessingError):
self.patch_process_dataset.side_effect = Exception
process.process_all_files(config_path="path/to/config", syn=syn)
process.process_all_files(syn=syn, config_path="path/to/config")
self.patch_create_data_manifest.assert_not_called()

def test_process_all_files_full(self, syn: Any):
process.process_all_files(config_path=None, syn=syn)
process.process_all_files(syn=syn, config_path=None)
self.patch_process_dataset.assert_any_call(
dataset_obj={"a": {"b": "c"}}, staging_path="./staging", syn=syn
)
Expand Down