Merge pull request #1283 from Sage-Bionetworks/develop
Schematic release 23.9.1
andrewelamb authored Sep 5, 2023
2 parents d861295 + 1105499 commit 12d7707
Showing 7 changed files with 286 additions and 63 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/api_test.yml
@@ -76,7 +76,7 @@ jobs:
if: ${{ inputs.perform_benchmarking }}
run: >
source .venv/bin/activate;
pytest -m "schematic_api"
pytest -m "schematic_api and not submission"
- name: Run API tests + Exclude Benchmarks
env:
@@ -85,4 +85,4 @@ jobs:
if: ${{ false == inputs.perform_benchmarking }}
run: >
source .venv/bin/activate;
pytest -m "schematic_api and not rule_benchmark"
pytest -m "schematic_api and not submission and not rule_benchmark"
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -126,6 +126,9 @@ markers = [
    Google credentials (skipped on GitHub CI) \
    """,
+    """\
+    submission: tests that involve submitting manifests
+    """,
    """\
    not_windows: tests that don't work on Windows machines
    """,
"""\
61 changes: 28 additions & 33 deletions schematic/manifest/generator.py
@@ -439,7 +439,7 @@ def _gather_dependency_requirements(self, json_schema, required_metadata_fields)

return required_metadata_fields

-    def _add_root_to_component(self, required_metadata_fields):
+    def _add_root_to_component(self, required_metadata_fields: Dict[str, List]):
"""If 'Component' is in the column set, add root node as a
metadata component entry in the first row of that column.
Args:
@@ -462,7 +462,6 @@ def _add_root_to_component(self, required_metadata_fields):
)
else:
            self.additional_metadata["Component"] = [self.root]
-
        return

def _get_additional_metadata(self, required_metadata_fields: dict) -> dict:
@@ -886,7 +885,7 @@ def _request_note_valid_values(self, i, req, validation_rules, valid_values):
else:
return

-    def _request_notes_comments(self, i, req, json_schema):
+    def _set_required_columns_color(self, i, req, json_schema):
"""Update background colors so that columns that are required are highlighted
Args:
i (int): column index
@@ -1126,10 +1125,10 @@ def _create_requests_body(
if get_row_formatting:
requests_body["requests"].append(get_row_formatting)

-        # Add notes to headers to provide descriptions of the attribute
-        header_notes = self._request_notes_comments(i, req, json_schema)
-        if header_notes:
-            requests_body["requests"].append(header_notes)
+        # set color of required columns to blue
+        required_columns_color = self._set_required_columns_color(i, req, json_schema)
+        if required_columns_color:
+            requests_body["requests"].append(required_columns_color)
# Add note on how to use multi-select, when appropriate
note_vv = self._request_note_valid_values(
i, req, validation_rules, valid_values
@@ -1365,19 +1364,16 @@ def map_annotation_names_to_display_names(
return annotations.rename(columns=label_map)

def get_manifest_with_annotations(
-        self, annotations: pd.DataFrame, sheet_url: bool = None, strict: Optional[bool] = None,
+        self, annotations: pd.DataFrame, strict: Optional[bool] = None
) -> Tuple[ps.Spreadsheet, pd.DataFrame]:
"""Generate manifest, optionally with annotations (if requested).
Args:
annotations (pd.DataFrame): Annotations table (can be empty).
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
-            sheet_url (Will be deprecated): a boolean ; determine if a pandas dataframe or a google sheet url gets return
Returns:
Tuple[ps.Spreadsheet, pd.DataFrame]: Both the Google Sheet
            URL and the corresponding data frame are returned.
"""

# Map annotation labels to display names to match manifest columns
annotations = self.map_annotation_names_to_display_names(annotations)

@@ -1391,19 +1387,19 @@ def get_manifest_with_annotations(
self.additional_metadata = annotations_dict

-        # Generate empty manifest using `additional_metadata`
-        manifest_url = self.get_empty_manifest(sheet_url=sheet_url, strict=strict)
+        # With annotations added, regenerate empty manifest
+        manifest_url = self.get_empty_manifest(sheet_url=True, strict=strict)
manifest_df = self.get_dataframe_by_url(manifest_url=manifest_url)

# Annotations clashing with manifest attributes are skipped
# during empty manifest generation. For more info, search
# for `additional_metadata` in `self.get_empty_manifest`.
# Hence, the shared columns need to be updated separately.
-        if self.is_file_based and self.use_annotations:
-            # This approach assumes that `update_df` returns
-            # a data frame whose columns are in the same order
-            manifest_df = update_df(manifest_df, annotations)
-            manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
-            manifest_url = manifest_sh.url
+        # This approach assumes that `update_df` returns
+        # a data frame whose columns are in the same order
+        manifest_df = update_df(manifest_df, annotations)
+        manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
+        manifest_url = manifest_sh.url

return manifest_url, manifest_df
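For readers unfamiliar with `update_df`, the merge step above overwrites the columns the manifest shares with the annotations table. A rough stand-in for that behavior, keyed on `entityId`, is sketched below; this is an illustration of the idea, not schematic's actual implementation:

```python
import pandas as pd

def update_df_sketch(manifest: pd.DataFrame, annotations: pd.DataFrame) -> pd.DataFrame:
    # columns present in both frames, excluding the join key
    shared = [c for c in annotations.columns if c in manifest.columns and c != "entityId"]
    updated = manifest.set_index("entityId")
    # DataFrame.update aligns on the index and copies non-NA values in place
    updated.update(annotations.set_index("entityId")[shared])
    return updated.reset_index()
```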

@@ -1527,7 +1523,7 @@ def get_manifest(
manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False)

# get URL of an empty manifest file created based on schema component
-        empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=sheet_url)
+        empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)

# Populate empty template with existing manifest
if manifest_record:
@@ -1547,25 +1543,24 @@ def get_manifest(
return result

        # Generate empty template and optionally fill in with annotations
+        # if there is no existing manifest and use_annotations is set to True,
+        # pull annotations (in reality, annotations should be empty when there is no existing manifest)
        else:
-            # Using getDatasetAnnotations() to retrieve file names and subset
-            # entities to files and folders (ignoring tables/views)

            annotations = pd.DataFrame()
-            if self.is_file_based:
-                annotations = store.getDatasetAnnotations(dataset_id)
-
-            # if there are no files with annotations just generate an empty manifest
-            if annotations.empty:
-                manifest_url = self.get_empty_manifest(strict=strict)
-                manifest_df = self.get_dataframe_by_url(manifest_url)
+            if self.use_annotations:
+                if self.is_file_based:
+                    annotations = store.getDatasetAnnotations(dataset_id)
+                # Update `additional_metadata` and generate manifest
+                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, strict=strict)
            else:
-                # Subset columns if not interested in user-defined annotations and there are files present
-                if self.is_file_based and not self.use_annotations:
-                    annotations = annotations[["Filename", "eTag", "entityId"]]
-
-                # Update `additional_metadata` and generate manifest
-                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, sheet_url=sheet_url, strict=strict)
+                empty_manifest_df = self.get_dataframe_by_url(empty_manifest_url)
+                if self.is_file_based:
+                    # for a file-based manifest, make sure the entityId and Filename columns still get filled even though use_annotations is set to False
+                    manifest_df = store.add_entity_id_and_filename(dataset_id, empty_manifest_df)
+                else:
+                    manifest_df = empty_manifest_df

# Update df with existing manifest. Agnostic to output format
updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_df)
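The net effect of the restructured `get_manifest` branch is that, when no manifest exists yet, the code now decides on `use_annotations` first instead of on whether annotations were found. A condensed paraphrase of the new control flow (plain illustration, not library code):

```python
def choose_population_strategy(use_annotations: bool, is_file_based: bool) -> str:
    """Summarizes which path get_manifest now takes when no manifest exists."""
    if use_annotations:
        # annotations are pulled only for file-based components, then merged
        # into a fresh Google Sheet via get_manifest_with_annotations
        return "populate manifest from annotations"
    if is_file_based:
        # even with annotations disabled, Filename and entityId are filled in
        return "add_entity_id_and_filename on the empty template"
    return "return the empty template as-is"
```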
110 changes: 93 additions & 17 deletions schematic/store/synapse.py
@@ -10,7 +10,7 @@
import shutil

# allows specifying explicit variable types
-from typing import Dict, List, Tuple, Sequence, Union
+from typing import Dict, List, Tuple, Sequence, Union, Optional
from collections import OrderedDict
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed, retry_if_exception_type

@@ -35,7 +35,7 @@
from synapseclient.table import CsvFileTable, build_table, Schema
from synapseclient.annotations import from_synapse_annotations
from synapseclient.core.exceptions import SynapseHTTPError, SynapseAuthenticationError, SynapseUnmetAccessRestrictions
-from synapseutils import walk
+import synapseutils
from synapseutils.copy_functions import changeFileMetaData

import uuid
@@ -413,7 +413,7 @@ def getFilesInStorageDataset(
"""

# select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset')
-        walked_path = walk(self.syn, datasetId)
+        walked_path = synapseutils.walk(self.syn, datasetId, includeTypes=["folder", "file"])

file_list = []
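Replacing the bare `walk` import with the qualified `synapseutils.walk` call and an explicit `includeTypes=["folder", "file"]` makes the traversal scope explicit. The generator mirrors `os.walk`, yielding `(dirpath, dirnames, filenames)` tuples whose entries are `(name, synapse_id)` pairs. A small consumption sketch, assuming a logged-in client and a placeholder folder ID:

```python
import synapseclient
import synapseutils

syn = synapseclient.login()  # assumes cached Synapse credentials

# "syn123" is a placeholder dataset/folder ID
for dirpath, dirnames, filenames in synapseutils.walk(
    syn, "syn123", includeTypes=["folder", "file"]
):
    folder_name, folder_id = dirpath
    for file_name, file_id in filenames:
        print(f"{folder_name}/{file_name} -> {file_id}")
```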

@@ -538,6 +538,94 @@ def getDataTypeFromManifest(self, manifestId:str):

return result_dict

+    def _get_files_metadata_from_dataset(self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None) -> Optional[dict]:
+        """Retrieve file names and entity IDs under a particular datasetId
+        Args:
+            datasetId (str): a dataset id
+            only_new_files (bool): if True, only include files that do not already exist in the manifest
+            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
+        Returns:
+            a dictionary of filenames and entity IDs under the given datasetId, or None if there are no files under it
+        """
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+        if dataset_files:
+            dataset_file_names_id_dict = self._get_file_entityIds(dataset_files, only_new_files=only_new_files, manifest=manifest)
+            return dataset_file_names_id_dict
+        else:
+            return None

+    def add_entity_id_and_filename(self, datasetId: str, manifest: pd.DataFrame) -> pd.DataFrame:
+        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and the Filename column is present but completely empty
+        Returns:
+            pd.DataFrame: updated manifest dataframe
+        """
+        # get file names and entity ids of a given dataset
+        dataset_files_dict = self._get_files_metadata_from_dataset(datasetId, only_new_files=False)
+
+        if dataset_files_dict:
+            # turn manifest dataframe back into a dictionary for these operations
+            manifest_dict = manifest.to_dict('list')
+
+            # update Filename column
+            # add entityId column to the end
+            manifest_dict.update(dataset_files_dict)
+
+            # if the Component column exists in the existing manifest, fill up that column
+            if "Component" in manifest_dict.keys():
+                manifest_dict["Component"] = manifest_dict["Component"] * max(1, len(manifest_dict["Filename"]))
+
+            # turn dictionary back into a dataframe
+            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient='index')
+            manifest_df_updated = manifest_df_index.transpose()
+
+            # fill NaN with empty string
+            manifest_df_updated = manifest_df_updated.fillna("")
+
+            # drop index
+            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
+
+            return manifest_df_updated
+        else:
+            return manifest

+    def fill_in_entity_id_filename(self, datasetId: str, manifest: pd.DataFrame) -> Tuple[List, pd.DataFrame]:
+        """Fill in the Filename and entityId columns; both will be created if not already present.
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe.
+        Returns:
+            Tuple[List, pd.DataFrame]: a list of synIds under the given datasetId folder, and the updated manifest dataframe
+        """
+        # get dataset file names and entity ids as a list of tuples
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+
+        # update manifest with additional filenames, if any
+        # note that if there is an existing manifest and there are files in the dataset
+        # the columns Filename and entityId are assumed to be present in the manifest schema
+        # TODO: use idiomatic pandas syntax
+        if dataset_files:
+            new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
+
+            # update manifest so that it contains the new dataset files
+            new_files = pd.DataFrame(new_files)
+            manifest = (
+                pd.concat([manifest, new_files], sort=False)
+                .reset_index()
+                .drop("index", axis=1)
+            )
+
+        manifest = manifest.fillna("")
+        return dataset_files, manifest

def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:bool = True) -> Union[Tuple[str, pd.DataFrame], None]:
"""Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
@@ -562,32 +650,20 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:
manifest_filepath = self.syn.get(manifest_id).path
manifest = load_df(manifest_filepath)

-        # get current list of files
-        dataset_files = self.getFilesInStorageDataset(datasetId)
-
-        # update manifest with additional filenames, if any
-        # note that if there is an existing manifest and there are files in the dataset
-        # the columns Filename and entityId are assumed to be present in manifest schema
-        # TODO: use idiomatic pandas syntax
-        if dataset_files:
-            new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
-
-            # update manifest so that it contains the new files
-            new_files = pd.DataFrame(new_files)
-            manifest = (
-                pd.concat([manifest, new_files], sort=False)
-                .reset_index()
-                .drop("index", axis=1)
-            )
-
+        dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest)
+        if dataset_files:
# update the manifest file, so that it contains the relevant entity IDs
if store:
manifest.to_csv(manifest_filepath, index=False)

# store manifest and update associated metadata with manifest on Synapse
manifest_id = self.associateMetadataWithFiles(sg, manifest_filepath, datasetId)

manifest = manifest.fillna("")

return manifest_id, manifest

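One detail in the new `add_entity_id_and_filename` worth noting: after `manifest_dict.update(dataset_files_dict)`, the column lists can have unequal lengths, which the plain `pd.DataFrame(manifest_dict)` constructor rejects. Building the frame with `from_dict(orient='index')` and transposing pads short columns with NaN instead, which the subsequent `fillna("")` blanks out. A self-contained illustration with made-up values:

```python
import pandas as pd

# column lists of unequal length, as can happen after merging in dataset files
manifest_dict = {
    "Filename": ["a.txt", "b.txt", "c.txt"],  # filled from dataset files
    "entityId": ["syn1", "syn2", "syn3"],
    "FileFormat": [],  # manifest column with no values yet
}

# pd.DataFrame(manifest_dict) would raise "All arrays must be of the same length";
# from_dict(orient='index') pads rows to the longest length with NaN instead
df = pd.DataFrame.from_dict(manifest_dict, orient="index").transpose()
df = df.fillna("")  # NaN padding becomes empty strings
print(df)
```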
(Diffs for the remaining 3 changed files were not loaded.)
