diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 0abe07323..ec53a0197 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1,5 +1,5 @@ import os -import uuid # used to generate unique names for entities +import uuid # used to generate unique names for entities import json import atexit import logging @@ -14,9 +14,8 @@ import synapseclient import synapseutils -from synapseclient import ( - Synapse, File, Folder, EntityViewSchema, EntityViewType, Column -) +from synapseclient import (Synapse, File, Folder, EntityViewSchema, + EntityViewType, Column) from synapseclient.table import CsvFileTable from synapseclient.annotations import from_synapse_annotations from synapseclient.core.exceptions import SynapseHTTPError, SynapseAuthenticationError @@ -34,32 +33,26 @@ class SynapseStorage(BaseStorage): """Implementation of Storage interface for datasets/files stored on Synapse. - Provides utilities to list files in a specific project; update file annotations, create fileviews, etc. TODO: Need to define the interface and rename and/or refactor some of the methods below. """ - - def __init__(self, - token: str = None, # optional parameter retrieved from browser cookie - access_token: str = None, - ) -> None: - + def __init__( + self, + token: str = None, # optional parameter retrieved from browser cookie + access_token: str = None, + ) -> None: """Initializes a SynapseStorage object. - Args: syn: an object of type synapseclient. token: optional token parameter (typically a 'str') as found in browser cookie upon login to synapse. access_token: optional access token (personal or oauth) - TODO: move away from specific project setup and work with an interface that Synapse specifies (e.g. based on schemas). - Exceptions: KeyError: when the 'storage' config object is missing values for essential keys. AttributeError: when the 'storageFileview' attribute (of class SynapseStorage) does not have a value associated with it. synapseclient.core.exceptions.SynapseHTTPError: check if the current user has permission to access the Synapse entity. ValueError: when Admin fileview cannot be found (describe further).
- Typical usage example: syn_store = SynapseStorage() """ @@ -73,22 +66,26 @@ def __init__(self, self.syn = synapseclient.Synapse() try: - self.syn.login(sessionToken = token, silent = True) + self.syn.login(sessionToken=token, silent=True) except synapseclient.core.exceptions.SynapseHTTPError: - raise ValueError("Please make sure you are logged into synapse.org.") + raise ValueError( + "Please make sure you are logged into synapse.org.") elif access_token: self.syn = synapseclient.Synapse() - self.syn.default_headers["Authorization"] = f"Bearer {access_token}" + self.syn.default_headers[ + "Authorization"] = f"Bearer {access_token}" else: # login using synapse credentials provided by user in .synapseConfig (default) file - self.syn = synapseclient.Synapse(configPath=CONFIG.SYNAPSE_CONFIG_PATH) - self.syn.login(silent = True) + self.syn = synapseclient.Synapse( + configPath=CONFIG.SYNAPSE_CONFIG_PATH) + self.syn.login(silent=True) try: self.storageFileview = CONFIG["synapse"]["master_fileview"] # get data in administrative fileview for this pipeline - self.storageFileviewTable = self.syn.tableQuery("SELECT * FROM " + self.storageFileview).asDataFrame() + self.storageFileviewTable = self.syn.tableQuery( + "SELECT * FROM " + self.storageFileview).asDataFrame() self.manifest = CONFIG["synapse"]["manifest_filename"] except KeyError: @@ -101,32 +98,36 @@ def __init__(self, raise MissingConfigValueError(("synapse", "master_fileview")) - def getPaginatedRestResults(self, currentUserId : str) -> Dict[str, str]: + def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. Args: currentUserId: synapse id for the user whose projects we want to get. - + Returns: A dictionary with a next page token and the results. """ - all_results = self.syn.restGET('/projects/user/{principalId}'.format(principalId=currentUserId)) - - while 'nextPageToken' in all_results: # iterate over next page token in results while there is any - results_token = self.syn.restGET('/projects/user/{principalId}?nextPageToken={nextPageToken}'.format(principalId=currentUserId, nextPageToken = all_results['nextPageToken'])) + all_results = self.syn.restGET( + '/projects/user/{principalId}'.format(principalId=currentUserId)) + + while 'nextPageToken' in all_results: # iterate over next page token in results while there is any + results_token = self.syn.restGET( + '/projects/user/{principalId}?nextPageToken={nextPageToken}'. + format(principalId=currentUserId, + nextPageToken=all_results['nextPageToken'])) all_results['results'].extend(results_token['results']) if 'nextPageToken' in results_token: all_results['nextPageToken'] = results_token['nextPageToken'] else: - del(all_results['nextPageToken']) + del (all_results['nextPageToken']) return all_results def getStorageProjects(self) -> List[str]: """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. - + Returns: A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 
""" @@ -145,7 +146,10 @@ def getStorageProjects(self) -> List[str]: currentUserProjects = self.getPaginatedRestResults(currentUserId) # prune results json filtering project id - currentUserProjects = [currentUserProject.get('id') for currentUserProject in currentUserProjects["results"]] + currentUserProjects = [ + currentUserProject.get('id') + for currentUserProject in currentUserProjects["results"] + ] # find set of user projects that are also in this pipeline's storage projects set storageProjects = list(set(storageProjects) & set(currentUserProjects)) @@ -153,7 +157,7 @@ def getStorageProjects(self) -> List[str]: # prepare a return list of project IDs and names projects = [] for projectId in storageProjects: - projectName = self.syn.get(projectId, downloadFile = False).name + projectName = self.syn.get(projectId, downloadFile=False).name projects.append((projectId, projectName)) sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) @@ -163,10 +167,10 @@ def getStorageProjects(self) -> List[str]: def getStorageDatasetsInProject(self, projectId: str) -> List[str]: """Gets all datasets in folder under a given storage project that the current user has access to. - + Args: projectId: synapse ID of a storage project. - + Returns: A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse. @@ -176,10 +180,14 @@ def getStorageDatasetsInProject(self, projectId: str) -> List[str]: # if folder content type is defined, only select folders that contain datasets areDatasets = False if "contentType" in self.storageFileviewTable.columns: - foldersTable = self.storageFileviewTable[(self.storageFileviewTable["contentType"] == "dataset") & (self.storageFileviewTable["projectId"] == projectId)] + foldersTable = self.storageFileviewTable[ + (self.storageFileviewTable["contentType"] == "dataset") + & (self.storageFileviewTable["projectId"] == projectId)] areDatasets = True else: - foldersTable = self.storageFileviewTable[(self.storageFileviewTable["type"] == "folder") & (self.storageFileviewTable["parentId"] == projectId)] + foldersTable = self.storageFileviewTable[ + (self.storageFileviewTable["type"] == "folder") + & (self.storageFileviewTable["parentId"] == projectId)] # get an array of tuples (folderId, folderName) # some folders are part of datasets; others contain datasets @@ -190,7 +198,8 @@ def getStorageDatasetsInProject(self, projectId: str) -> List[str]: datasetList = [] folderProperties = ["id", "name"] - for folder in list(foldersTable[folderProperties].itertuples(index = False, name = None)): + for folder in list(foldersTable[folderProperties].itertuples( + index=False, name=None)): datasetList.append(folder) sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) @@ -198,7 +207,11 @@ def getStorageDatasetsInProject(self, projectId: str) -> List[str]: return sorted_dataset_list - def getFilesInStorageDataset(self, datasetId: str, fileNames: List = None, fullpath:bool = True) -> List[Tuple[str, str]]: + def getFilesInStorageDataset( + self, + datasetId: str, + fileNames: List = None, + fullpath: bool = True) -> List[Tuple[str, str]]: """Gets all files in a given dataset folder. Args: @@ -207,8 +220,9 @@ def getFilesInStorageDataset(self, datasetId: str, fileNames: List = None, fullp metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 
fullpath: if True return the full path as part of this filename; otherwise return just base filename - Returns: a list of files; the list consists of tuples (fileId, fileName). - + Returns: + A list of files; the list consists of tuples (fileId, fileName). + Raises: ValueError: Dataset ID not found. """ @@ -224,16 +238,18 @@ def getFilesInStorageDataset(self, datasetId: str, fileNames: List = None, fullp # iterate over all files in a folder for filename in filenames: - if (not "manifest" in filename[0] and not fileNames) or (not fileNames == None and filename[0] in fileNames): + if (not "manifest" in filename[0] + and not fileNames) or (not fileNames == None + and filename[0] in fileNames): # don't add manifest to list of files unless it is specified in the list of specified fileNames; return all found files # except the manifest if no fileNames have been specified # TODO: refactor for clarity/maintainability - if fullpath: # append directory path to filename - filename = (dirpath[0] + "/" + filename[0], filename[1]) + filename = (dirpath[0] + "/" + filename[0], + filename[1]) # add file name file id tuple, rearranged so that id is first and name follows file_list.append(filename[::-1]) @@ -241,7 +257,9 @@ def getFilesInStorageDataset(self, datasetId: str, fileNames: List = None, fullp return file_list - def getDatasetManifest(self, datasetId: str, downloadFile: bool = False) -> List[str]: + def getDatasetManifest(self, + datasetId: str, + downloadFile: bool = False) -> List[str]: """Gets the manifest associated with a given dataset. Args: @@ -249,18 +267,22 @@ def getDatasetManifest(self, datasetId: str, downloadFile: bool = False) -> List downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. Returns: - A tuple of manifest file ID and manifest name -- (fileId, fileName); returns empty list if no manifest is found. - (or) - synapseclient.entity.File: A new Synapse Entity object of the appropriate type, if downloadFile is set to True + manifest_syn_id (String): Synapse ID of existing manifest file. + manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. + "" (String): No pre-existing manifest in dataset.
""" # get a list of files containing the manifest for this dataset (if any) all_files = self.storageFileviewTable - manifest = all_files[(all_files["name"] == os.path.basename(self.manifest)) & (all_files["parentId"] == datasetId)] + manifest = all_files[ + (all_files["name"] == os.path.basename(self.manifest)) + & (all_files["parentId"] == datasetId)] manifest = manifest[['id', 'name']] + # if there is no pre-exisiting manifest in the specified dataset if manifest.empty: - return [] + return "" + # if there is an exisiting manifest else: # if the downloadFile option is set to True if downloadFile: @@ -268,11 +290,17 @@ def getDatasetManifest(self, datasetId: str, downloadFile: bool = False) -> List manifest_syn_id = manifest['id'][0] # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location - manifest_data = self.syn.get(manifest_syn_id, downloadLocation=CONFIG["synapse"]["manifest_folder"], ifcollision="overwrite.local") + manifest_data = self.syn.get( + manifest_syn_id, + downloadLocation=CONFIG["synapse"]["manifest_folder"], + ifcollision="overwrite.local") return manifest_data - return list(manifest.to_records(index=False))[0] # extract manifest tuple from list + # extract synapse ID of exisiting dataset manifest + manifest_syn_id = manifest.to_records(index=False)[0][0] + + return manifest_syn_id def updateDatasetManifestFiles(self, datasetId: str) -> str: @@ -281,17 +309,18 @@ def updateDatasetManifestFiles(self, datasetId: str) -> str: Args: datasetId: synapse ID of a storage dataset. - Returns: synapse ID of updated manifest. + Returns: + Synapse ID of updated manifest. """ # get existing manifest Synapse ID - manifest_id_name = self.getDatasetManifest(datasetId) - if not manifest_id_name: + manifest_id = self.getDatasetManifest(datasetId) + if not manifest_id: # no manifest exists yet: abort - raise FileNotFoundError(f"Manifest file {CONFIG['synapse']['manifest_filename']} " - f"cannot be found in {datasetId} dataset folder.") + raise FileNotFoundError( + f"Manifest file {CONFIG['synapse']['manifest_filename']} " + f"cannot be found in {datasetId} dataset folder.") - manifest_id = manifest_id_name[0] manifest_filepath = self.syn.get(manifest_id).path manifest = pd.read_csv(manifest_filepath) @@ -303,10 +332,7 @@ def updateDatasetManifestFiles(self, datasetId: str) -> str: # the columns Filename and entityId are assumed to be present in manifest schema # TODO: use idiomatic panda syntax if dataset_files: - new_files = { - "Filename": [], - "entityId": [] - } + new_files = {"Filename": [], "entityId": []} # find new files if any for file_id, file_name in dataset_files: @@ -317,12 +343,15 @@ def updateDatasetManifestFiles(self, datasetId: str) -> str: # update manifest so that it contain new files #manifest = pd.DataFrame(new_files) new_files = pd.DataFrame(new_files) - manifest = pd.concat([new_files, manifest], sort = False).reset_index().drop("index", axis = 1) + manifest = pd.concat([new_files, manifest], + sort=False).reset_index().drop("index", + axis=1) # update the manifest file, so that it contains the relevant entity IDs - manifest.to_csv(manifest_filepath, index = False) + manifest.to_csv(manifest_filepath, index=False) # store manifest and update associated metadata with manifest on Synapse - manifest_id = self.associateMetadataWithFiles(manifest_filepath, datasetId) + manifest_id = self.associateMetadataWithFiles( + manifest_filepath, datasetId) return manifest_id @@ -330,7 +359,7 @@ def updateDatasetManifestFiles(self, datasetId: 
str) -> str: def getAllManifests(self) -> List[str]: """Gets all metadata manifest files across all datasets in projects a user has access to. - Returns: a list of projects, datasets per project and metadata manifest Synapse ID for each dataset + Returns: A list of projects, datasets per project and metadata manifest Synapse ID for each dataset as a list of tuples, one for each manifest: [ ( @@ -341,8 +370,10 @@ def getAllManifests(self) -> List[str]: ... ] - TODO: return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface - TODO: use fileview instead of iterating through projects and datasets + TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface + TODO: Use fileview instead of iterating through projects and datasets + TODO: getDatasetManifest() return type has changed to return only the manifest Synapse ID. Fetch + manifestName from config.yml and create the tuple. """ projects = self.getStorageProjects() @@ -356,32 +387,30 @@ def getAllManifests(self) -> List[str]: # encode information about the manifest in a simple list (so that R clients can unpack it) # eventually can serialize differently - manifest = ( - (projectId, projectName), - (datasetId, datasetName), - self.getDatasetManifest(datasetId) - ) + manifest = ((projectId, projectName), (datasetId, datasetName), + self.getDatasetManifest(datasetId)) manifests.append(manifest) return manifests - def get_synapse_table(self, synapse_id:str) -> Tuple[pd.DataFrame, CsvFileTable]: - """ - Download synapse table as a pd dataframe; return table schema and etags as results too - + def get_synapse_table( + self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: + """Download synapse table as a pd dataframe; return table schema and etags as results too + Args: synapse_id: synapse ID of the table to query """ results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) - df = results.asDataFrame(rowIdAndVersionInIndex = False) + df = results.asDataFrame(rowIdAndVersionInIndex=False) return df, results - def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) -> str: + def associateMetadataWithFiles(self, metadataManifestPath: str, + datasetId: str) -> str: """Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
@@ -408,19 +437,9 @@ def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) try: manifest = pd.read_csv(metadataManifestPath) except FileNotFoundError as err: - raise FileNotFoundError(f"No manifest file was found at this path: {metadataManifestPath}") from err - - # check if there is an existing manifest - existingManifest = self.getDatasetManifest(datasetId) - - if existingManifest: - - # update the existing manifest, so that existing entities get updated metadata and new entities are preserved; - # note that an existing manifest always contains an entityId column, which is assumed to be the index key - # if updating an existing manifest the new manifest should also contain an entityId column - # (it is ok if the entities ID in the new manifest are blank) - manifest['entityId'].fillna('', inplace = True) - manifest = update_df(manifest, existingManifest, "entityId") + raise FileNotFoundError( + f"No manifest file was found at this path: {metadataManifestPath}" + ) from err # if this is a new manifest there could be no Synapse entities associated with the rows of this manifest # this may be due to data type (e.g. clinical data) being tabular @@ -432,6 +451,8 @@ def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) if not "entityId" in manifest.columns: manifest["entityId"] = "" + else: + manifest['entityId'].fillna('', inplace=True) # get a schema explorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations se = SchemaExplorer() @@ -440,22 +461,30 @@ def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) # also set metadata for each synapse entity as Synapse annotations for idx, row in manifest.iterrows(): if not row["entityId"]: - # no entity exists for this row - # so create one - rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) - rowEntity = self.syn.store(rowEntity) - entityId = rowEntity["id"] - row["entityId"] = entityId - manifest.loc[idx, "entityId"] = entityId + # no entity exists for this row + # so create one + rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) + rowEntity = self.syn.store(rowEntity) + entityId = rowEntity["id"] + row["entityId"] = entityId + manifest.loc[idx, "entityId"] = entityId else: - # get the entity id corresponding to this row - entityId = row["entityId"] + # get the entity id corresponding to this row + entityId = row["entityId"] # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g. no spaces)) metadataSyn = {} for k, v in row.to_dict().items(): keySyn = se.get_class_label_from_display_name(str(k)) + # truncate annotation values if the size of the value is + # greater than or equal to 500 characters; append an explicit + # [truncatedByDataCuratorApp] message at the end of every + # truncated value to indicate that the cell value has been + # truncated + if isinstance(v, str) and len(v) >= 500: + v = v[0:472] + "[truncatedByDataCuratorApp]" + metadataSyn[keySyn] = v # set annotation(s) for the various objects/items in a dataset on Synapse @@ -468,10 +497,13 @@ def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) #self.syn.set_annotations(metadataSyn) #-- deprecated code # update the manifest file, so that it contains the relevant entity IDs - manifest.to_csv(metadataManifestPath, index = False) + manifest.to_csv(metadataManifestPath, index=False) # store manifest to Synapse - manifestSynapseFile =
File(metadataManifestPath, description = "Manifest for dataset " + datasetId, parent = datasetId) + manifestSynapseFile = File(metadataManifestPath, + description="Manifest for dataset " + + datasetId, + parent=datasetId) logger.info("Associated manifest file with dataset on Synapse.") manifestSynapseFileId = self.syn.store(manifestSynapseFile).id @@ -481,14 +513,13 @@ def associateMetadataWithFiles(self, metadataManifestPath: str, datasetId: str) def getFileAnnotations(self, fileId: str) -> Dict[str, str]: """Generate dictionary of annotations for the given Synapse file. - Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ". - + Args: fileId (str): Synapse ID for dataset file. - + Returns: dict: Annotations as comma-separated strings. """ @@ -521,22 +552,19 @@ def getFileAnnotations(self, fileId: str) -> Dict[str, str]: # Add the file entity ID and eTag, which weren't lists assert fileId == entity.id, ( "For some reason, the Synapse ID in the response doesn't match" - "the Synapse ID sent in the request (via synapseclient)." - ) + " the Synapse ID sent in the request (via synapseclient).") annotations["entityId"] = fileId annotations["eTag"] = entity.etag return annotations - def getDatasetAnnotations( - self, - datasetId: str, - fill_na: bool=True, - force_batch: bool=False - ) -> pd.DataFrame: + def getDatasetAnnotations(self, + datasetId: str, + fill_na: bool = True, + force_batch: bool = False) -> pd.DataFrame: """Generate table for annotations across all files in given dataset. - + Args: datasetId (str): Synapse ID for dataset folder. fill_na (bool): Whether to replace missing values with @@ -545,7 +573,7 @@ def getDatasetAnnotations( the batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False unless there are more than 50 files in the dataset. - + Returns: pd.DataFrame: Table of annotations. """ @@ -559,15 +587,17 @@ def getDatasetAnnotations( try_batch = len(dataset_files) >= 50 or force_batch if try_batch: try: - logger.info("Trying batch mode for retrieving Synapse annotations") - table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) + logger.info( + "Trying batch mode for retrieving Synapse annotations") + table = self.getDatasetAnnotationsBatch( + datasetId, dataset_file_ids) except (SynapseAuthenticationError, SynapseHTTPError): logger.info( f"Unable to create a temporary file view bound to {datasetId}. " - "Defaulting to slower iterative retrieval of annotations." - ) + "Defaulting to slower iterative retrieval of annotations.") # Default to the slower non-batch method - logger.info("Batch mode failed (probably due to permission error)") + logger.info( + "Batch mode failed (probably due to permission error)") try_batch = False # Non-batch mode @@ -598,14 +628,14 @@ def getDatasetAnnotations( def getDatasetProject(self, datasetId: str) -> str: """Get parent project for a given dataset ID. - + Args: datasetId (str): Synapse entity ID (folder or project). - + Raises: ValueError: Raised if Synapse ID cannot be retrieved by the user or if it doesn't appear in the file view. - + Returns: str: The Synapse ID for the parent project. """ @@ -633,27 +663,24 @@ def getDatasetProject(self, datasetId: str) -> str: raise ValueError( f"The given dataset ({datasetId}) doesn't appear in the " f"configured file view ({self.storageFileview}). This might " - "mean that the file view's scope needs to be updated."
- ) + "mean that the file view's scope needs to be updated.") def getDatasetAnnotationsBatch( - self, - datasetId: str, - dataset_file_ids: Sequence[str]=None - ) -> pd.DataFrame: + self, + datasetId: str, + dataset_file_ids: Sequence[str] = None) -> pd.DataFrame: """Generate table for annotations across all files in given dataset. - This function uses a temporary file view to generate a table instead of iteratively querying for individual entity annotations. This function is expected to run much faster than `self.getDatasetAnnotations` on large datasets. - + Args: datasetId (str): Synapse ID for dataset folder. dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table. - + Returns: pd.DataFrame: Table of annotations. """ @@ -671,23 +698,18 @@ def getDatasetAnnotationsBatch( class DatasetFileView: """Helper class to create temporary dataset file views. - This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. - See SynapseStorage.getDatasetAnnotationsBatch for example usage. """ - - def __init__( - self, - datasetId: str, - synapse: Synapse, - name: str=None, - temporary: bool=True, - parentId: str=None - ) -> None: + def __init__(self, + datasetId: str, + synapse: Synapse, + name: str = None, + temporary: bool = True, + parentId: str = None) -> None: """Create a file view scoped to a dataset folder. - + Args: datasetId (str): Synapse ID for a dataset folder/project. synapse (Synapse): Used for Synapse requests. @@ -720,8 +742,7 @@ def __init__( scopes=self.datasetId, includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], addDefaultViewColumns=False, - addAnnotationColumns=True - ) + addAnnotationColumns=True) # TODO: Handle failure due to insufficient permissions by # creating a temporary new project to store view @@ -734,37 +755,38 @@ def __init__( # Ensure deletion of the file view (last resort) if self.is_temporary: atexit.register(self.delete) - - + + def __enter__(self): """Return file view when entering 'with' statement.""" return self - - + + def __exit__(self, exc_type, exc_value, traceback): """Delete file view when exiting 'with' statement.""" if self.is_temporary: self.delete() - - + + def delete(self): """Delete the file view on Synapse without deleting local table.""" if self.view_schema is not None: self.synapse.delete(self.view_schema) self.view_schema = None - - + + def query(self, tidy=True, force=False): """Retrieve file view as a data frame (raw format sans index).""" if self.table is None or force: fileview_id = self.view_schema["id"] - self.results = self.synapse.tableQuery(f"select * from {fileview_id}") + self.results = self.synapse.tableQuery( + f"select * from {fileview_id}") self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False) if tidy: self.tidy_table() return self.table - - + + def tidy_table(self): """Convert raw file view data frame into more usable format.""" assert self.table is not None, "Must call `self.query()` first."
@@ -772,8 +794,8 @@ def tidy_table(self): self._fix_list_columns() self._fix_int_columns() return self.table - - + + def _fix_default_columns(self): """Rename default columns to match schematic expectations.""" @@ -793,8 +815,8 @@ def _fix_default_columns(self): self.table.insert(len(self.table.columns), "eTag", row_etags) return self.table - - + + def _get_columns_of_type(self, types): """Helper function to get list of columns of a given type(s).""" matching_columns = [] @@ -802,8 +824,8 @@ def _get_columns_of_type(self, types): if header.columnType in types: matching_columns.append(header.name) return matching_columns - - + + def _fix_list_columns(self): """Fix formatting of list-columns.""" list_types = {'STRING_LIST', 'INTEGER_LIST', 'BOOLEAN_LIST'} @@ -811,8 +833,8 @@ def _fix_list_columns(self): for col in list_columns: self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) return self.table - - + + def _fix_int_columns(self): """Ensure that integer-columns are actually integers.""" int_columns = self._get_columns_of_type({"INTEGER"})
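A note on the `getDatasetManifest` change above: the method now returns the manifest's Synapse ID as a string, or `""` when the dataset has no pre-existing manifest, instead of the old `(fileId, fileName)` tuple; with `downloadFile=True` it still returns a `synapseclient` File entity. A minimal sketch of calling code under the new contract (the dataset ID is a placeholder):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # logs in via .synapseConfig by default

manifest_syn_id = store.getDatasetManifest("syn00000000")  # placeholder ID
if manifest_syn_id == "":
    print("No pre-existing manifest in this dataset")
else:
    # With downloadFile=True the same call returns a synapseclient File
    # entity, downloaded to the configured manifest folder
    manifest_data = store.getDatasetManifest("syn00000000", downloadFile=True)
    print(manifest_data.path)
```

Note that the declared return type is still `List[str]`, and per the new TODO in `getAllManifests`, callers that relied on the old tuple need updating.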
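The truncation added in `associateMetadataWithFiles` slices values to 472 characters because the appended marker `[truncatedByDataCuratorApp]` is 27 characters long, so every truncated value ends up at 499 characters, just under the 500-character threshold the guard checks. A standalone sketch of that rule (the helper name is illustrative, not part of the diff):

```python
TRUNCATION_MARKER = "[truncatedByDataCuratorApp]"  # 27 characters

def truncate_annotation_value(value):
    """Truncate string values of 500+ characters to 472 + 27 = 499."""
    if isinstance(value, str) and len(value) >= 500:
        return value[0:472] + TRUNCATION_MARKER
    return value

# A 600-character value is cut down to 499 characters, marker included
assert len(truncate_annotation_value("x" * 600)) == 499
```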
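Finally, a minimal usage sketch for the `DatasetFileView` helper as reformatted above, per its docstring: the `with` statement guarantees the temporary file view is deleted on exit, with the `atexit` hook only as a last resort. This assumes credentials in `.synapseConfig`, permission to create a view, and a placeholder dataset ID:

```python
import synapseclient

from schematic.store.synapse import DatasetFileView

syn = synapseclient.Synapse()
syn.login(silent=True)

# The temporary file view is created on entry and deleted on exit
with DatasetFileView("syn00000000", syn) as view:  # placeholder dataset ID
    annotations_table = view.query(tidy=True)

print(annotations_table.head())
```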