Merge pull request #1283 from Sage-Bionetworks/develop
Schematic release 23.9.1
andrewelamb authored Sep 5, 2023
2 parents d861295 + 1105499 commit 12d7707
Showing 7 changed files with 286 additions and 63 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/api_test.yml
@@ -76,7 +76,7 @@ jobs:
if: ${{ inputs.perform_benchmarking }}
run: >
source .venv/bin/activate;
pytest -m "schematic_api"
pytest -m "schematic_api and not submission"
- name: Run API tests + Exclude Benchmarks
env:
@@ -85,4 +85,4 @@ jobs:
if: ${{ false == inputs.perform_benchmarking }}
run: >
source .venv/bin/activate;
pytest -m "schematic_api and not rule_benchmark"
pytest -m "schematic_api and not submission and not rule_benchmark"
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -126,6 +126,9 @@ markers = [
    Google credentials (skipped on GitHub CI) \
    """,
+    """\
+    submission: tests that involve submitting manifests
+    """,
    """\
    not_windows: tests that don't work on Windows machines
    """,
"""\
61 changes: 28 additions & 33 deletions schematic/manifest/generator.py
@@ -439,7 +439,7 @@ def _gather_dependency_requirements(self, json_schema, required_metadata_fields)

return required_metadata_fields

-    def _add_root_to_component(self, required_metadata_fields):
+    def _add_root_to_component(self, required_metadata_fields: Dict[str, List]):
"""If 'Component' is in the column set, add root node as a
metadata component entry in the first row of that column.
Args:
@@ -462,7 +462,6 @@ def _add_root_to_component(self, required_metadata_fields):
)
else:
            self.additional_metadata["Component"] = [self.root]
-
        return

def _get_additional_metadata(self, required_metadata_fields: dict) -> dict:
@@ -886,7 +885,7 @@ def _request_note_valid_values(self, i, req, validation_rules, valid_values):
else:
return

-    def _request_notes_comments(self, i, req, json_schema):
+    def _set_required_columns_color(self, i, req, json_schema):
"""Update background colors so that columns that are required are highlighted
Args:
i (int): column index
@@ -1126,10 +1125,10 @@ def _create_requests_body(
if get_row_formatting:
requests_body["requests"].append(get_row_formatting)

-        # Add notes to headers to provide descriptions of the attribute
-        header_notes = self._request_notes_comments(i, req, json_schema)
-        if header_notes:
-            requests_body["requests"].append(header_notes)
+        # set color of required columns to blue
+        required_columns_color = self._set_required_columns_color(i, req, json_schema)
+        if required_columns_color:
+            requests_body["requests"].append(required_columns_color)
# Add note on how to use multi-select, when appropriate
note_vv = self._request_note_valid_values(
i, req, validation_rules, valid_values
@@ -1365,19 +1364,16 @@ def map_annotation_names_to_display_names(
return annotations.rename(columns=label_map)

def get_manifest_with_annotations(
-        self, annotations: pd.DataFrame, sheet_url: bool = None, strict: Optional[bool] = None,
+        self, annotations: pd.DataFrame, strict: Optional[bool] = None
) -> Tuple[ps.Spreadsheet, pd.DataFrame]:
"""Generate manifest, optionally with annotations (if requested).
Args:
annotations (pd.DataFrame): Annotations table (can be empty).
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
-            sheet_url (Will be deprecated): a boolean ; determine if a pandas dataframe or a google sheet url gets return
Returns:
Tuple[ps.Spreadsheet, pd.DataFrame]: Both the Google Sheet
            URL and the corresponding data frame are returned.
"""

# Map annotation labels to display names to match manifest columns
annotations = self.map_annotation_names_to_display_names(annotations)

@@ -1391,19 +1387,19 @@ def get_manifest_with_annotations(
self.additional_metadata = annotations_dict

-        # Generate empty manifest using `additional_metadata`
-        manifest_url = self.get_empty_manifest(sheet_url=sheet_url, strict=strict)
+        # With annotations added, regenerate empty manifest
+        manifest_url = self.get_empty_manifest(sheet_url=True, strict=strict)
manifest_df = self.get_dataframe_by_url(manifest_url=manifest_url)

# Annotations clashing with manifest attributes are skipped
# during empty manifest generation. For more info, search
# for `additional_metadata` in `self.get_empty_manifest`.
# Hence, the shared columns need to be updated separately.
-        if self.is_file_based and self.use_annotations:
-            # This approach assumes that `update_df` returns
-            # a data frame whose columns are in the same order
-            manifest_df = update_df(manifest_df, annotations)
-            manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
-            manifest_url = manifest_sh.url
+        # This approach assumes that `update_df` returns
+        # a data frame whose columns are in the same order
+        manifest_df = update_df(manifest_df, annotations)
+        manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
+        manifest_url = manifest_sh.url

return manifest_url, manifest_df
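For readers unfamiliar with `update_df`, the merge step above overwrites the columns the manifest shares with the annotations table. A rough stand-in for that behavior, keyed on `entityId`, is sketched below; this is an illustration of the idea, not schematic's actual implementation:

```python
import pandas as pd

def update_df_sketch(manifest: pd.DataFrame, annotations: pd.DataFrame) -> pd.DataFrame:
    # columns present in both frames, excluding the join key
    shared = [c for c in annotations.columns if c in manifest.columns and c != "entityId"]
    updated = manifest.set_index("entityId")
    # DataFrame.update aligns on the index and copies non-NA values in place
    updated.update(annotations.set_index("entityId")[shared])
    return updated.reset_index()
```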

@@ -1527,7 +1523,7 @@ def get_manifest(
manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False)

# get URL of an empty manifest file created based on schema component
-        empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=sheet_url)
+        empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)

# Populate empty template with existing manifest
if manifest_record:
@@ -1547,25 +1543,24 @@ def get_manifest(
return result

        # Generate empty template and optionally fill in with annotations
+        # if there is no existing manifest and use_annotations is set to True,
+        # pull annotations (in reality, annotations should be empty when there is no existing manifest)
        else:
-            # Using getDatasetAnnotations() to retrieve file names and subset
-            # entities to files and folders (ignoring tables/views)

            annotations = pd.DataFrame()
-            if self.is_file_based:
-                annotations = store.getDatasetAnnotations(dataset_id)
-
-            # if there are no files with annotations just generate an empty manifest
-            if annotations.empty:
-                manifest_url = self.get_empty_manifest(strict=strict)
-                manifest_df = self.get_dataframe_by_url(manifest_url)
+            if self.use_annotations:
+                if self.is_file_based:
+                    annotations = store.getDatasetAnnotations(dataset_id)
+                # Update `additional_metadata` and generate manifest
+                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, strict=strict)
            else:
-                # Subset columns if not interested in user-defined annotations and there are files present
-                if self.is_file_based and not self.use_annotations:
-                    annotations = annotations[["Filename", "eTag", "entityId"]]
-
-                # Update `additional_metadata` and generate manifest
-                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, sheet_url=sheet_url, strict=strict)
+                empty_manifest_df = self.get_dataframe_by_url(empty_manifest_url)
+                if self.is_file_based:
+                    # for a file-based manifest, make sure the entityId and Filename columns still get filled even though use_annotations is set to False
+                    manifest_df = store.add_entity_id_and_filename(dataset_id, empty_manifest_df)
+                else:
+                    manifest_df = empty_manifest_df

# Update df with existing manifest. Agnostic to output format
updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_df)
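The net effect of the restructured `get_manifest` branch is that, when no manifest exists yet, the code now decides on `use_annotations` first instead of on whether annotations were found. A condensed paraphrase of the new control flow (plain illustration, not library code):

```python
def choose_population_strategy(use_annotations: bool, is_file_based: bool) -> str:
    """Summarizes which path get_manifest now takes when no manifest exists."""
    if use_annotations:
        # annotations are pulled only for file-based components, then merged
        # into a fresh Google Sheet via get_manifest_with_annotations
        return "populate manifest from annotations"
    if is_file_based:
        # even with annotations disabled, Filename and entityId are filled in
        return "add_entity_id_and_filename on the empty template"
    return "return the empty template as-is"
```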
110 changes: 93 additions & 17 deletions schematic/store/synapse.py
@@ -10,7 +10,7 @@
import shutil

# allows specifying explicit variable types
-from typing import Dict, List, Tuple, Sequence, Union
+from typing import Dict, List, Tuple, Sequence, Union, Optional
from collections import OrderedDict
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed, retry_if_exception_type

@@ -35,7 +35,7 @@
from synapseclient.table import CsvFileTable, build_table, Schema
from synapseclient.annotations import from_synapse_annotations
from synapseclient.core.exceptions import SynapseHTTPError, SynapseAuthenticationError, SynapseUnmetAccessRestrictions
-from synapseutils import walk
+import synapseutils
from synapseutils.copy_functions import changeFileMetaData

import uuid
@@ -413,7 +413,7 @@ def getFilesInStorageDataset(
"""

# select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset')
-        walked_path = walk(self.syn, datasetId)
+        walked_path = synapseutils.walk(self.syn, datasetId, includeTypes=["folder", "file"])

file_list = []
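Replacing the bare `walk` import with the qualified `synapseutils.walk` call and an explicit `includeTypes=["folder", "file"]` makes the traversal scope explicit. The generator mirrors `os.walk`, yielding `(dirpath, dirnames, filenames)` tuples whose entries are `(name, synapse_id)` pairs. A small consumption sketch, assuming a logged-in client and a placeholder folder ID:

```python
import synapseclient
import synapseutils

syn = synapseclient.login()  # assumes cached Synapse credentials

# "syn123" is a placeholder dataset/folder ID
for dirpath, dirnames, filenames in synapseutils.walk(
    syn, "syn123", includeTypes=["folder", "file"]
):
    folder_name, folder_id = dirpath
    for file_name, file_id in filenames:
        print(f"{folder_name}/{file_name} -> {file_id}")
```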

@@ -538,6 +538,94 @@ def getDataTypeFromManifest(self, manifestId:str):

return result_dict

+    def _get_files_metadata_from_dataset(self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None) -> Optional[dict]:
+        """Retrieve file names and entity IDs under a particular datasetId
+        Args:
+            datasetId (str): a dataset id
+            only_new_files (bool): if True, only include files that do not already exist in the manifest
+            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
+        Returns:
+            a dictionary of filenames and entity IDs under the given datasetId, or None if there are no files under it
+        """
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+        if dataset_files:
+            dataset_file_names_id_dict = self._get_file_entityIds(dataset_files, only_new_files=only_new_files, manifest=manifest)
+            return dataset_file_names_id_dict
+        else:
+            return None

+    def add_entity_id_and_filename(self, datasetId: str, manifest: pd.DataFrame) -> pd.DataFrame:
+        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and the Filename column is present but completely empty
+        Returns:
+            pd.DataFrame: updated manifest dataframe
+        """
+        # get file names and entity ids of a given dataset
+        dataset_files_dict = self._get_files_metadata_from_dataset(datasetId, only_new_files=False)
+
+        if dataset_files_dict:
+            # turn manifest dataframe back into a dictionary for these operations
+            manifest_dict = manifest.to_dict('list')
+
+            # update Filename column
+            # add entityId column to the end
+            manifest_dict.update(dataset_files_dict)
+
+            # if the Component column exists in the existing manifest, fill up that column
+            if "Component" in manifest_dict.keys():
+                manifest_dict["Component"] = manifest_dict["Component"] * max(1, len(manifest_dict["Filename"]))
+
+            # turn dictionary back into a dataframe
+            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient='index')
+            manifest_df_updated = manifest_df_index.transpose()
+
+            # fill NaN with empty string
+            manifest_df_updated = manifest_df_updated.fillna("")
+
+            # drop index
+            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
+
+            return manifest_df_updated
+        else:
+            return manifest

+    def fill_in_entity_id_filename(self, datasetId: str, manifest: pd.DataFrame) -> Tuple[List, pd.DataFrame]:
+        """Fill in the Filename and entityId columns; both will be created if not already present.
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe.
+        Returns:
+            Tuple[List, pd.DataFrame]: a list of synIds under the given datasetId folder, and the updated manifest dataframe
+        """
+        # get dataset file names and entity ids as a list of tuples
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+
+        # update manifest with additional filenames, if any
+        # note that if there is an existing manifest and there are files in the dataset
+        # the columns Filename and entityId are assumed to be present in the manifest schema
+        # TODO: use idiomatic pandas syntax
+        if dataset_files:
+            new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
+
+            # update manifest so that it contains the new dataset files
+            new_files = pd.DataFrame(new_files)
+            manifest = (
+                pd.concat([manifest, new_files], sort=False)
+                .reset_index()
+                .drop("index", axis=1)
+            )
+
+        manifest = manifest.fillna("")
+        return dataset_files, manifest

def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:bool = True) -> Union[Tuple[str, pd.DataFrame], None]:
"""Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
@@ -562,32 +650,20 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:
manifest_filepath = self.syn.get(manifest_id).path
manifest = load_df(manifest_filepath)

-        # get current list of files
-        dataset_files = self.getFilesInStorageDataset(datasetId)
-
-        # update manifest with additional filenames, if any
-        # note that if there is an existing manifest and there are files in the dataset
-        # the columns Filename and entityId are assumed to be present in manifest schema
-        # TODO: use idiomatic pandas syntax
-        if dataset_files:
-            new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
-
-            # update manifest so that it contains the new files
-            new_files = pd.DataFrame(new_files)
-            manifest = (
-                pd.concat([manifest, new_files], sort=False)
-                .reset_index()
-                .drop("index", axis=1)
-            )
-
+        dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest)
+        if dataset_files:
# update the manifest file, so that it contains the relevant entity IDs
if store:
manifest.to_csv(manifest_filepath, index=False)

# store manifest and update associated metadata with manifest on Synapse
manifest_id = self.associateMetadataWithFiles(sg, manifest_filepath, datasetId)

manifest = manifest.fillna("")

return manifest_id, manifest

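One detail in the new `add_entity_id_and_filename` worth noting: after `manifest_dict.update(dataset_files_dict)`, the column lists can have unequal lengths, which the plain `pd.DataFrame(manifest_dict)` constructor rejects. Building the frame with `from_dict(orient='index')` and transposing pads short columns with NaN instead, which the subsequent `fillna("")` blanks out. A self-contained illustration with made-up values:

```python
import pandas as pd

# column lists of unequal length, as can happen after merging in dataset files
manifest_dict = {
    "Filename": ["a.txt", "b.txt", "c.txt"],  # filled from dataset files
    "entityId": ["syn1", "syn2", "syn3"],
    "FileFormat": [],  # manifest column with no values yet
}

# pd.DataFrame(manifest_dict) would raise "All arrays must be of the same length";
# from_dict(orient='index') pads rows to the longest length with NaN instead
df = pd.DataFrame.from_dict(manifest_dict, orient="index").transpose()
df = df.fillna("")  # NaN padding becomes empty strings
print(df)
```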
(Diffs for the remaining 3 changed files were not loaded.)
