"""
Development script for fetching data from a schema with the Pyclient.

``Client.get`` uses the CSV API when ``as_df=True`` and the GraphQL API
otherwise, so call ``get_data(as_df=False)`` to exercise the GraphQL path.
"""
from pprint import pprint

import pandas as pd

from tools.pyclient.src.molgenis_emx2_pyclient import Client

URL = "https://emx2.dev.molgenis.org"
SCHEMA = "catalogue"


def get_data(as_df: bool = True) -> list | pd.DataFrame:
    """Fetch the 'name' and 'external identifiers' columns of 'Resources'.

    :param as_df: passed through to ``Client.get``; ``True`` (the default,
        as before) returns a pandas DataFrame, ``False`` returns the records
        as a list.
    :returns: the retrieved records, in the form selected by ``as_df``.
    """
    with Client(url=URL, schema=SCHEMA) as client:
        resources = client.get(table='Resources',
                               columns=['name', 'external identifiers'],
                               as_df=as_df)

    return resources


if __name__ == '__main__':
    data = get_data()
    # A list of records reads best pretty-printed; a DataFrame already
    # formats itself when printed.
    if isinstance(data, list):
        pprint(data)
    else:
        print(data)
import utils +from .constants import HEADING, LOGO, NONREFS from .exceptions import (NoSuchSchemaException, ServiceUnavailableError, SigninError, ServerNotFoundError, PyclientException, NoSuchTableException, NoContextManagerException, GraphQLException, InvalidTokenException, PermissionDeniedException, TokenSigninException, NonExistentTemplateException) -from .metadata import Schema +from .metadata import Schema, Table +from .utils import parse_nested_pkeys logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) log = logging.getLogger("Molgenis EMX2 Pyclient") @@ -372,7 +374,12 @@ def delete_records(self, table: str, schema: str = None, file: str = None, data: errors = '\n'.join([err['message'] for err in response.json().get('errors')]) log.error("Failed to delete data from %s::%s\n%s.", current_schema, table, errors) - def get(self, table: str, query_filter: str = None, schema: str = None, as_df: bool = False) -> list | pd.DataFrame: + def get(self, + table: str, + columns: list = None, + query_filter: str = None, + schema: str = None, + as_df: bool = False) -> list | pd.DataFrame: """Retrieves data from a schema and returns as a list of dictionaries or as a pandas DataFrame (as pandas is used to parse the response). 
@@ -403,17 +410,34 @@ def get(self, table: str, query_filter: str = None, schema: str = None, as_df: b table_id = schema_metadata.get_table(by='name', value=table).id filter_part = self._prepare_filter(query_filter, table, schema) - query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" - response = self.session.get(url=query_url) - self._validate_graphql_response(response=response, - fallback_error_message=f"Failed to retrieve data from {current_schema}::" - f"{table!r}.\nStatus code: {response.status_code}.") + if as_df: + query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" + response = self.session.get(url=query_url) + self._validate_graphql_response(response=response, + fallback_error_message=f"Failed to retrieve data from {current_schema}::" + f"{table!r}.\nStatus code: {response.status_code}.") - response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + if columns: + try: + response_data = response_data[columns] + except KeyError as e: + if "not in index" in e.args[0]: + raise NoSuchColumnException(f"Columns {e.args[0]}") + else: + raise NoSuchColumnException(f"Columns {e.args[0].split('Index(')[1].split(', dtype')}" + f" not in index.") + else: + query_url = f"{self.url}/{current_schema}/graphql" + query = self._parse_get_table_query(table_id, columns) + response = self.session.post(url=query_url, + json={"query": query}) + self._validate_graphql_response(response=response, + fallback_error_message=f"Failed to retrieve data from {current_schema}::" + f"{table!r}.\nStatus code: {response.status_code}.") + response_data = response.json().get('data').get(table_id) - if not as_df: - return response_data.to_dict('records') return response_data async def export(self, schema: str = None, table: str = None, @@ -1040,3 +1064,33 @@ def _validate_url(self): except requests.exceptions.MissingSchema: raise 
def _parse_get_table_query(self, table_id: str, columns: list = None) -> str:
    """Build the GraphQL query that selects the contents of a table.

    The selection set is derived from the table's column metadata:

    * HEADING and LOGO columns are never selected;
    * columns whose type is in NONREFS are selected by their id;
    * ONTOLOGY* columns select the ``name`` of the referenced term;
    * REF* columns select the primary keys of the referenced table;
    * any other column type is logged and left out of the query.

    :param table_id: id of the table to query
    :param columns: ids or names of the columns to include; when omitted
        (``None``) or empty, every queryable column is included
    :returns: the GraphQL query string
    """
    # NOTE(review): the metadata is requested without a schema argument, so a
    # ``schema`` passed to ``get`` is not forwarded here - confirm whether
    # that is intended.
    schema_metadata: Schema = self.get_schema_metadata()
    table_metadata: Table = schema_metadata.get_table('id', table_id)

    selection = []
    for col in table_metadata.columns:
        # Filter only when a selection was requested. Testing membership on
        # the default ``None`` raised a TypeError; an empty list is treated
        # as "no filter", like the DataFrame branch of ``get`` does.
        if columns and col.id not in columns and col.name not in columns:
            continue

        column_type = col.get('columnType')
        column_id = col.get('id')
        if column_type in (HEADING, LOGO):
            continue
        if column_type in NONREFS:
            selection.append(f"    {column_id}")
        elif column_type.startswith('ONTOLOGY'):
            selection.append(f"    {column_id} {{name}}")
        elif column_type.startswith('REF'):
            pkeys = schema_metadata.get_pkeys(col.get('refTableId'))
            nested = parse_nested_pkeys(pkeys)
            selection.append(f"    {column_id} {{{nested}}}")
        else:
            log.warning("Caught column type %r.", column_type)

    return "\n".join(["{", f"  {table_id} {{", *selection, "  }", "}"])
def get_pkeys(self, table_id: str) -> list:
    """Collects the primary keys of the table with the given id.

    Each column of the table with ``key == 1`` contributes one entry:
    a plain column contributes its id, an ontology column contributes
    ``{id: 'name'}`` and a reference column contributes ``{id: [...]}``
    holding the primary keys of the referenced table, resolved recursively.
    """
    key_columns = self.get_table('id', table_id).get_columns(by='key', value=1)

    def _entry_for(column):
        # Classify on the column type prefix: ontology, reference, or plain.
        column_type = column.get('columnType')
        if column_type.startswith('ONT'):
            return {column.id: 'name'}
        if column_type.startswith('REF'):
            return {column.id: self.get_pkeys(column.get('refTableId'))}
        return column.id

    return [_entry_for(column) for column in key_columns]
def parse_nested_pkeys(pkeys: list) -> str:
    """Convert (nested) primary keys to a GraphQL selection string.

    Each element of ``pkeys`` is either a column id (``str``) or a dict that
    maps a column id to its nested selection. A nested selection is either a
    single field name (``str``) or another list in this same format.

    Example: ``['id', {'resource': ['id', {'type': 'name'}]}]`` becomes
    ``'id resource { id type { name } }'``.

    Elements of any other type are logged as a warning and left out.

    :param pkeys: the primary keys to convert
    :returns: the space-separated selection, without surrounding braces
    """
    # Use a module-named logger rather than the root logger so the records
    # can be filtered and attributed like those of the rest of the package.
    log = logging.getLogger(__name__)

    parts = []
    for pk in pkeys:
        if isinstance(pk, str):
            parts.append(pk)
        elif isinstance(pk, dict):
            for nested_key, nested_values in pk.items():
                if isinstance(nested_values, str):
                    inner = nested_values
                else:
                    inner = parse_nested_pkeys(nested_values).strip()
                parts.extend((nested_key, "{", inner, "}"))
        else:
            # Lazy %-arguments: the message is only formatted when emitted.
            log.warning("Unexpected data type encountered: %r.", type(pk))

    return " ".join(parts)