Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat(pyclient): add support for GraphQL API to 'get' method #4558

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions tools/pyclient/dev/graphql_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Development script for fetching data from a schema using the GraphQL API.
"""
from pprint import pprint

import pandas as pd

from tools.pyclient.src.molgenis_emx2_pyclient import Client

URL = "https://emx2.dev.molgenis.org"
SCHEMA = "catalogue"


def get_data() -> list | pd.DataFrame:
    """Fetches two columns of the 'Resources' table from the configured schema.

    A client is opened against ``URL``/``SCHEMA`` for the duration of the call
    and the 'name' and 'external identifiers' columns are requested.

    Returns:
        Whatever ``Client.get`` returns. Because ``as_df=True`` is passed here,
        that is a pandas DataFrame; the union annotation keeps the script valid
        if the flag is flipped to obtain a list of records instead.
    """
    with Client(url=URL, schema=SCHEMA) as client:
        # Returning inside the ``with`` still runs the client's exit handler.
        return client.get(table='Resources',
                          columns=['name', 'external identifiers'],
                          as_df=True)



if __name__ == '__main__':
    data = get_data()
    # Lists are pretty-printed; anything else (e.g. a DataFrame) is printed
    # with its own string representation.
    show = pprint if isinstance(data, list) else print
    show(data)
78 changes: 66 additions & 12 deletions tools/pyclient/src/molgenis_emx2_pyclient/client.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
import csv
import json
import logging
import sys
import pathlib
import sys
import time
from functools import cache
from io import BytesIO
from typing import Literal

import pandas as pd
import requests
from molgenis_emx2_pyclient.exceptions import NoSuchColumnException
from requests import Response

from . import graphql_queries as queries
from . import utils
from .constants import HEADING, LOGO, NONREFS
from .exceptions import (NoSuchSchemaException, ServiceUnavailableError, SigninError,
ServerNotFoundError, PyclientException, NoSuchTableException,
NoContextManagerException, GraphQLException, InvalidTokenException,
PermissionDeniedException, TokenSigninException, NonExistentTemplateException)
from .metadata import Schema
from .metadata import Schema, Table
from .utils import parse_nested_pkeys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
log = logging.getLogger("Molgenis EMX2 Pyclient")
Expand Down Expand Up @@ -372,7 +374,12 @@ def delete_records(self, table: str, schema: str = None, file: str = None, data:
errors = '\n'.join([err['message'] for err in response.json().get('errors')])
log.error("Failed to delete data from %s::%s\n%s.", current_schema, table, errors)

def get(self, table: str, query_filter: str = None, schema: str = None, as_df: bool = False) -> list | pd.DataFrame:
def get(self,
table: str,
columns: list = None,
query_filter: str = None,
schema: str = None,
as_df: bool = False) -> list | pd.DataFrame:
"""Retrieves data from a schema and returns as a list of dictionaries or as
a pandas DataFrame (as pandas is used to parse the response).

Expand Down Expand Up @@ -403,17 +410,34 @@ def get(self, table: str, query_filter: str = None, schema: str = None, as_df: b
table_id = schema_metadata.get_table(by='name', value=table).id

filter_part = self._prepare_filter(query_filter, table, schema)
query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}"
response = self.session.get(url=query_url)

self._validate_graphql_response(response=response,
fallback_error_message=f"Failed to retrieve data from {current_schema}::"
f"{table!r}.\nStatus code: {response.status_code}.")
if as_df:
query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}"
response = self.session.get(url=query_url)
self._validate_graphql_response(response=response,
fallback_error_message=f"Failed to retrieve data from {current_schema}::"
f"{table!r}.\nStatus code: {response.status_code}.")

response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False)
response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False)
if columns:
try:
response_data = response_data[columns]
except KeyError as e:
if "not in index" in e.args[0]:
raise NoSuchColumnException(f"Columns {e.args[0]}")
else:
raise NoSuchColumnException(f"Columns {e.args[0].split('Index(')[1].split(', dtype')}"
f" not in index.")
else:
query_url = f"{self.url}/{current_schema}/graphql"
query = self._parse_get_table_query(table_id, columns)
response = self.session.post(url=query_url,
json={"query": query})
self._validate_graphql_response(response=response,
fallback_error_message=f"Failed to retrieve data from {current_schema}::"
f"{table!r}.\nStatus code: {response.status_code}.")
response_data = response.json().get('data').get(table_id)

if not as_df:
return response_data.to_dict('records')
return response_data

async def export(self, schema: str = None, table: str = None,
Expand Down Expand Up @@ -1040,3 +1064,33 @@ def _validate_url(self):
except requests.exceptions.MissingSchema:
raise ServerNotFoundError(f"Invalid URL {self.url!r}. "
f"Perhaps you meant 'https://{self.url}'?")

def _parse_get_table_query(self, table_id: str, columns: list | None = None) -> str:
    """Builds the GraphQL query that selects the contents of a table.

    The table's metadata is looked up in the current schema and every
    selected column is translated to a GraphQL field:

    * HEADING and LOGO columns are left out of the query,
    * columns whose type is listed in ``NONREFS`` are selected by id,
    * ontology columns select their ``name`` sub-field,
    * reference columns select the (possibly nested) primary keys of the
      referenced table.

    Columns of any other type are skipped and a warning is logged.

    :param table_id: id of the table to query
    :param columns: ids or names of the columns to select; ``None`` or an
        empty list selects all columns of the table
    :returns: the GraphQL query string
    """
    schema_metadata: Schema = self.get_schema_metadata()
    table_metadata: Table = schema_metadata.get_table('id', table_id)

    # ``columns`` is optional: without this guard the membership tests below
    # raise TypeError on None, and an empty list would yield an empty
    # selection set.
    select_all = not columns

    parts = [f"{{\n {table_id} {{\n"]
    for col in table_metadata.columns:
        if not select_all and col.id not in columns and col.name not in columns:
            continue

        column_type = col.get('columnType')
        column_id = col.get('id')
        if column_type in (HEADING, LOGO):
            continue

        if column_type in NONREFS:
            parts.append(f" {column_id}\n")
        elif column_type.startswith('ONTOLOGY'):
            parts.append(f" {column_id} {{name}}\n")
        elif column_type.startswith('REF'):
            # A reference cannot be selected as a bare field; select the
            # primary keys of the table it points to.
            pkeys = schema_metadata.get_pkeys(col.get('refTableId'))
            parts.append(f" {column_id} {{{parse_nested_pkeys(pkeys)}}}\n")
        else:
            log.warning("Caught column type %r.", column_type)

    parts.append(" }\n}")
    return "".join(parts)

27 changes: 27 additions & 0 deletions tools/pyclient/src/molgenis_emx2_pyclient/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Constant column type metadata.
"""

# Suffix appended to a scalar column type to form its array variant.
_ARRAY = "_ARRAY"

# Column types that have no array variant in this module.
HEADING = "HEADING"
LOGO = "LOGO"

# Scalar column types.
STRING = "STRING"
TEXT = "TEXT"
INT = "INT"
FLOAT = "FLOAT"
BOOL = "BOOL"
DATE = "DATE"
DATETIME = "DATETIME"
HYPERLINK = "HYPERLINK"

# Array variants of the scalar column types.
STRING_ARRAY = f"{STRING}{_ARRAY}"
TEXT_ARRAY = f"{TEXT}{_ARRAY}"
INT_ARRAY = f"{INT}{_ARRAY}"
FLOAT_ARRAY = f"{FLOAT}{_ARRAY}"
BOOL_ARRAY = f"{BOOL}{_ARRAY}"
DATE_ARRAY = f"{DATE}{_ARRAY}"
DATETIME_ARRAY = f"{DATETIME}{_ARRAY}"
HYPERLINK_ARRAY = f"{HYPERLINK}{_ARRAY}"

# All non-reference column types: the scalars followed by their array
# variants, in the same order.
_SCALAR_TYPES = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME, HYPERLINK]
NONREFS = _SCALAR_TYPES + [f"{scalar}{_ARRAY}" for scalar in _SCALAR_TYPES]
18 changes: 18 additions & 0 deletions tools/pyclient/src/molgenis_emx2_pyclient/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from itertools import starmap
from typing import Literal

from .constants import STRING
from .exceptions import NoSuchColumnException, NoSuchTableException


Expand Down Expand Up @@ -160,6 +161,23 @@ def get(self, attr: str, default: object = None):
return self.__getattribute__(attr)
return default

def get_pkeys(self, table_id: str) -> list:
    """Collects the primary keys of the table with the given id.

    Each key column (``key`` equal to 1) contributes one entry:

    * a column whose type starts with 'ONT' becomes ``{column id: 'name'}``,
    * a column whose type starts with 'REF' becomes
      ``{column id: <primary keys of the referenced table>}``, resolved
      recursively,
    * any other column contributes its plain id.
    """
    key_columns = self.get_table('id', table_id).get_columns(by='key', value=1)

    def _as_key(column):
        # Translate a single key column to its entry in the result list.
        column_type = column.get('columnType')
        if column_type.startswith('ONT'):
            return {column.id: 'name'}
        if column_type.startswith('REF'):
            return {column.id: self.get_pkeys(column.get('refTableId'))}
        return column.id

    return [_as_key(column) for column in key_columns]


def get_table(self, by: Literal['id', 'name'], value: str) -> Table:
"""Gets the unique table by either id or name value.
Raises NoSuchTableException if the table could not be retrieved from the schema.
Expand Down
25 changes: 24 additions & 1 deletion tools/pyclient/src/molgenis_emx2_pyclient/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""
Utility functions for the Molgenis EMX2 Pyclient package
"""
import logging



def read_file(file_path: str) -> str:
"""Reads and imports data from a file.

Expand All @@ -15,3 +16,25 @@ def read_file(file_path: str) -> str:
data = stream.read()
stream.close()
return data

def parse_nested_pkeys(pkeys: list) -> str:
    """Renders a (possibly nested) list of primary keys as GraphQL fields.

    Plain strings are emitted as-is. A dictionary entry maps a field name to
    either a single sub-field name or a nested list of primary keys and is
    emitted as ``field { ... }``. Entries of any other type are skipped and
    a warning is logged. All tokens are separated by single spaces.
    """
    tokens = []
    for entry in pkeys:
        if isinstance(entry, str):
            tokens.append(entry)
            continue
        if not isinstance(entry, dict):
            logging.warning(f"Unexpected data type encountered: {type(entry)!r}.")
            continue

        for field, selection in entry.items():
            if isinstance(selection, str):
                inner = selection
            else:
                # Nested keys are rendered recursively.
                inner = parse_nested_pkeys(selection).strip()
            tokens.extend((field, "{", inner, "}"))

    return " ".join(tokens)