From 74c5cd47ab625743d3bf176ca95bb32144f62018 Mon Sep 17 00:00:00 2001 From: cl117 Date: Tue, 13 Aug 2024 22:52:27 -0600 Subject: [PATCH] optimize search.py --- flask/explorer.py | 56 +++- flask/index.py | 1 - flask/requirements.txt | 5 +- flask/search.py | 600 ++++++++++++++--------------------------- 4 files changed, 262 insertions(+), 400 deletions(-) diff --git a/flask/explorer.py b/flask/explorer.py index 7401d7e..ee30121 100644 --- a/flask/explorer.py +++ b/flask/explorer.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -from flask import Flask, request, jsonify, abort +from flask import Flask, request, jsonify, abort, render_template from werkzeug.exceptions import HTTPException import os @@ -15,11 +15,44 @@ import threading import time +from flask_debugtoolbar import DebugToolbarExtension +from flask_debugtoolbar_lineprofilerpanel.profile import line_profile + +def profile_flask_app(): + app.run(debug=True) + +if __name__ == "__main__": + #profiler = profile.Profile() + #profiler.enable() + profile_flask_app() + #profiler.disable() + #profiler.print_stats(sort='time') + log = logging.getLogger('werkzeug') log.setLevel(logging.ERROR) app = Flask(__name__) +app.config['SECRET_KEY'] = 'your-secret-key' # Required for the debug toolbar +app.config['DEBUG'] = True +app.config['DEBUG_TB_INTERCEPT_REDIRECTS'] = False +# Profiler configuration +app.config['DEBUG_TB_PROFILER_ENABLED'] = True +app.config['DEBUG_TB_PANELS'] = [ + 'flask_debugtoolbar.panels.versions.VersionDebugPanel', + 'flask_debugtoolbar.panels.timer.TimerDebugPanel', + 'flask_debugtoolbar.panels.headers.HeaderDebugPanel', + 'flask_debugtoolbar.panels.request_vars.RequestVarsDebugPanel', + 'flask_debugtoolbar.panels.config_vars.ConfigVarsDebugPanel', + 'flask_debugtoolbar.panels.template.TemplateDebugPanel', + 'flask_debugtoolbar.panels.logger.LoggingPanel', + 'flask_debugtoolbar.panels.profiler.ProfilerDebugPanel', + 'flask_debugtoolbar_lineprofilerpanel.panels.LineProfilerPanel' +] + +# Initialize the debug toolbar +toolbar = DebugToolbarExtension(app) + @app.errorhandler(Exception) def handle_error(e): @@ -36,9 +69,9 @@ def startup(): def auto_update_index(): while True: time.sleep(int(utils.get_config()['updateTimeInDays']) * 86400) - if utils.get_config()['autoUpdateIndex'] and utils.get_config()['updateTimeInDays'] > 0: - utils.log('Updating index automatically. To disable, set the \"autoUpdateIndex\" property in config.json to false.') - update() + # if utils.get_config()['autoUpdateIndex'] and utils.get_config()['updateTimeInDays'] > 0: + # utils.log('Updating index automatically. To disable, set the \"autoUpdateIndex\" property in config.json to false.') + # update() # Thread for automatically updaing the index periodically update_thread = threading.Thread(target=auto_update_index, daemon=True) @@ -64,7 +97,6 @@ def handle_error(e): utils.log('[ERROR] Returning error ' + str(e) + "\n Traceback:\n" + traceback.format_exc()) return jsonify(error=str(e)), 500 - @app.route('/info', methods=['GET']) def info(): utils.log('Explorer up!!! Virtutoso ' + str(query.memoized_query_sparql.cache_info())) @@ -161,8 +193,13 @@ def incremental_remove_collection(): except: raise +@app.route('/test', methods=['GET']) +@line_profile +def SBOLExplore_test_endpoint(): + return render_template('index.html') @app.route('/', methods=['GET']) +@line_profile def sparql_search_endpoint(): try: # make sure index is built, or throw exception @@ -173,7 +210,13 @@ def sparql_search_endpoint(): if sparql_query is not None: default_graph_uri = request.args.get('default-graph-uri') - response = jsonify(search.search(sparql_query, utils.get_uri2rank(), utils.get_clusters(), default_graph_uri)) + response = jsonify( + search.search( + sparql_query, + utils.get_uri2rank(), + utils.get_clusters(), + default_graph_uri + )) return response else: return "

Welcome to SBOLExplorer!

The available indices in Elasticsearch are shown below:


"\ @@ -183,6 +226,7 @@ def sparql_search_endpoint(): + "



Visit our GitHub repository!"\ + "

Any issues can be reported to our issue tracker."\ + "

Used by SynBioHub." + #return render_template('index.html') except: raise diff --git a/flask/index.py b/flask/index.py index 070ee7b..cb38309 100644 --- a/flask/index.py +++ b/flask/index.py @@ -3,7 +3,6 @@ import query import json - def add_pagerank(parts_response, uri2rank): """ Adds the pagerank score for each part diff --git a/flask/requirements.txt b/flask/requirements.txt index 7b90b71..0ebff0e 100644 --- a/flask/requirements.txt +++ b/flask/requirements.txt @@ -9,9 +9,10 @@ ipaddress==1.0.22 itsdangerous==0.24 Jinja2 MarkupSafe==2.0.1 -numpy==1.26.4 +numpy python-dateutil==2.7.3 requests==2.19.1 six==1.11.0 urllib3==1.23 -Werkzeug==2.1.2 +Werkzeug +apscheduler==3.10.4 diff --git a/flask/search.py b/flask/search.py index dd6a272..4d316db 100644 --- a/flask/search.py +++ b/flask/search.py @@ -1,18 +1,27 @@ import re +from typing import List, Dict, Tuple, Optional import utils import query import sequencesearch +# Compile regex patterns +FROM_COUNT_PATTERN = re.compile(r'SELECT \(count\(distinct \?subject\) as \?tempcount\)\s*(.*)\s*WHERE {') +FROM_NORMAL_PATTERN = re.compile(r'\?type\n(.*)\s*WHERE {') +CRITERIA_PATTERN = re.compile(r'WHERE {\s*(.*)\s*\?subject a \?type \.') +OFFSET_PATTERN = re.compile(r'OFFSET (\d+)') +LIMIT_PATTERN = re.compile(r'LIMIT (\d+)') +SEQUENCE_PATTERN = re.compile(r'\s*\?subject sbol2:sequence \?seq \.\s*\?seq sbol2:elements \"([a-zA-Z]*)\"') +FLAG_PATTERN = re.compile(r'# flag_([a-zA-Z0-9._]*): ([a-zA-Z0-9./-_]*)') +KEYWORD_PATTERN = re.compile(r"CONTAINS\(lcase\(\?displayId\), lcase\('([^']*)'\)\)") -def search_es(es_query): + +def extract_offset(sparql_query): + offset_match = OFFSET_PATTERN.search(sparql_query) + return int(offset_match.group(1)) if offset_match else 0 + +def search_es(es_query: str) -> Dict: """ - String query for ES searches - - Arguments: - es_query {string} -- String to search for - - Returns: - List -- List of all search results + String query for ES searches. """ body = { 'query': { @@ -45,26 +54,15 @@ def search_es(es_query): } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def empty_search_es(offset, limit, allowed_graphs): +def empty_search_es(offset: int, limit: int, allowed_graphs: List[str]) -> Dict: """ - Empty string search based solely on pagerank - - Arguments: - offset {int} -- Offset for search results - limit {int} -- Size of search - allowed_graphs {List} -- List of allowed graphs to search on - - Returns: - List -- List of search results + Empty string search based solely on pagerank. """ - if len(allowed_graphs) == 1: - query = {'term': {'graph': allowed_graphs[0]}} - else: - query = {'terms': {'graph': allowed_graphs}} + query = {'term': {'graph': allowed_graphs[0]}} if len(allowed_graphs) == 1 else {'terms': {'graph': allowed_graphs}} body = { 'query': { @@ -82,20 +80,13 @@ def empty_search_es(offset, limit, allowed_graphs): } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def search_es_allowed_subjects(es_query, allowed_subjects): +def search_es_allowed_subjects(es_query: str, allowed_subjects: List[str]) -> Dict: """ - String query for ES searches limited to allowed parts - - Arguments: - es_query {string} -- String to search for - allowed_subjects {list} - list of allowed subjects from Virtuoso - - Returns: - List -- List of all search results + String query for ES searches limited to allowed parts. """ body = { 'query': { @@ -107,7 +98,7 @@ def search_es_allowed_subjects(es_query, allowed_subjects): 'query': es_query, 'fields': [ 'subject', - 'displayId^3', # caret indicates displayId is 3 times as important during search + 'displayId^3', 'version', 'name', 'description', @@ -117,37 +108,29 @@ def search_es_allowed_subjects(es_query, allowed_subjects): 'operator': 'or', 'fuzziness': 'AUTO' }}, - {'ids': {'values': list(allowed_subjects)}} + {'ids': {'values': allowed_subjects}} ] } }, 'script_score': { 'script': { - 'source': "_score * Math.log(doc['pagerank'].value + 1)" # Math.log is a natural log + 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - }, - - }, - + } + } }, 'from': 0, 'size': 10000 } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise - -def search_es_allowed_subjects_empty_string(allowed_subjects): +def search_es_allowed_subjects_empty_string(allowed_subjects: List[str]) -> Dict: """ - ES search purely limited to allowed parts - - Arguments: - allowed_subjects {list} - list of allowed subjects from Virtuoso - - Returns: - List -- List of all search results + ES search purely limited to allowed parts. """ body = { 'query': { @@ -155,27 +138,57 @@ def search_es_allowed_subjects_empty_string(allowed_subjects): 'query': { 'bool': { 'must': [ - {'ids': {'values': list(allowed_subjects)}} + {'ids': {'values': allowed_subjects}} ] } }, 'script_score': { 'script': { - 'source': "_score * Math.log(doc['pagerank'].value + 1)" # Math.log is a natural log + 'source': "_score * Math.log(doc['pagerank'].value + 1)" } - }, - - }, - + } + } }, 'from': 0, 'size': 10000 } try: return utils.get_es().search(index=utils.get_config()['elasticsearch_index_name'], body=body) - except: + except Exception as e: + utils.log(f"ES search failed: {e}") raise +def parse_sparql_query(sparql_query, is_count_query): + # Find FROM clause + _from_search = FROM_COUNT_PATTERN.search(sparql_query) if is_count_query else FROM_NORMAL_PATTERN.search(sparql_query) + _from = _from_search.group(1).strip() if _from_search else '' + + # Find criteria + criteria_search = CRITERIA_PATTERN.search(sparql_query) + criteria = criteria_search.group(1).strip() if criteria_search else '' + + # Find offset + offset_match = OFFSET_PATTERN.search(sparql_query) + offset = int(offset_match.group(1)) if offset_match else 0 + # Find limit + limit_match = LIMIT_PATTERN.search(sparql_query) + limit = int(limit_match.group(1)) if limit_match else 50 + + # Find sequence + sequence_match = SEQUENCE_PATTERN.search(sparql_query) + sequence = sequence_match.group(1) if sequence_match else '' + + # Extract flags + flags = {match.group(1): match.group(2) for match in FLAG_PATTERN.finditer(sparql_query)} + + # Extract keywords + keywords = KEYWORD_PATTERN.findall(criteria) + + # Construct es_query + es_query = ' '.join(keywords).strip() + print("Hello es_query: ", es_query) + + return es_query, _from, criteria, offset, limit, sequence, flags def extract_query(sparql_query): """ @@ -187,225 +200,75 @@ def extract_query(sparql_query): Returns: List -- List of information extracted """ - _from = '' - if is_count_query(sparql_query): - _from_search = re.search(r'''SELECT \(count\(distinct \?subject\) as \?tempcount\)\s*(.*)\s*WHERE {''', - sparql_query) - else: - _from_search = re.search(r'''\?type\n(.*)\s*WHERE {''', sparql_query) - if _from_search: - _from = _from_search.group(1).strip() - - criteria = '' - criteria_search = re.search(r'''WHERE {\s*(.*)\s*\?subject a \?type \.''', sparql_query) - if criteria_search: - criteria = criteria_search.group(1).strip() - - offset = 0 - offset_search = re.search(r'''OFFSET (\d*)''', sparql_query) - if offset_search: - offset = int(offset_search.group(1)) - - limit = 50 - limit_search = re.search(r'''LIMIT (\d*)''', sparql_query) - if limit_search: - limit = int(limit_search.group(1)) - - sequence = '' - sequence_search = re.search(r'''\s*\?subject sbol2:sequence \?seq \.\s*\?seq sbol2:elements \"([a-zA-Z]*)\"''', - sparql_query) - if sequence_search: - sequence = sequence_search.group(1) - - flags = {} - flag_search = re.finditer(r'''# flag_([a-zA-Z0-9._]*): ([a-zA-Z0-9./-_]*)''', sparql_query) - for flag in flag_search: - flags[flag.group(1)] = flag.group(2) - - extract_keyword_re = re.compile(r'''CONTAINS\(lcase\(\?displayId\), lcase\('([^']*)'\)\)''') - keywords = [] - for keyword in re.findall(extract_keyword_re, criteria): - keywords.append(keyword) - es_query = ' '.join(keywords).strip() - - return es_query, _from, criteria, offset, limit, sequence, flags - - -def extract_allowed_graphs(_from, default_graph_uri): - """ - Extracts the allowed graphs to search over + return parse_sparql_query(sparql_query, is_count_query(sparql_query)) - Arguments: - _from {string} -- Graph where search originated - default_graph_uri {string} -- The default graph URI pulled from SBH - Returns: - List -- List of allowed graphs +def extract_allowed_graphs(_from: str, default_graph_uri: str) -> List[str]: """ - allowed_graphs = [] - + Extracts the allowed graphs to search over. + """ + allowed_graphs = [default_graph_uri] if not _from else [graph.strip()[1:-1] for graph in _from.split('FROM') if graph.strip()] if utils.get_config()['distributed_search']: - instances = utils.get_wor() - for instance in instances: - allowed_graphs.append(instance['instanceUrl'] + '/public') + allowed_graphs.extend(instance['instanceUrl'] + '/public' for instance in utils.get_wor()) + return allowed_graphs - if _from == '': - allowed_graphs.append(default_graph_uri) - return allowed_graphs - else: - for graph in _from.split('FROM'): - graph = graph.strip() - graph = graph[1:len(graph) - 1] - - if graph != '': - allowed_graphs.append(graph) - - return allowed_graphs - - -def is_count_query(sparql_query): +def is_count_query(sparql_query: str) -> bool: return 'SELECT (count(distinct' in sparql_query - -def create_response(count, bindings, return_count): +def create_response(count: int, bindings: List[Dict], return_count: bool) -> Dict: """ - Creates response to be sent back to SBH - - Arguments: - count {int} -- ? - bindings {Dict} -- The bindings - return_count {int} -- ? - - Returns: - ? -- ? + Creates response to be sent back to SBH. """ if return_count: - response = {"head": - {"link": [], "vars": ["count"]}, - "results": {"distinct": False, "ordered": True, - "bindings": [{"count": { - "type": "typed-literal", - "datatype": "http://www.w3.org/2001/XMLSchema#integer", - "value": "10"}}]}} - response['results']['bindings'][0]['count']['value'] = str(count) - else: - response = {"head": {"link": [], - "vars": ["subject", "displayId", "version", "name", "description", "type", "percentMatch", - "strandAlignment", "CIGAR"]}, - "results": {"distinct": False, "ordered": True, "bindings": []}} - response['results']['bindings'] = bindings - - return response - + return { + "head": {"link": [], "vars": ["count"]}, + "results": { + "distinct": False, + "ordered": True, + "bindings": [{"count": { + "type": "typed-literal", + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "value": str(count) + } + }] + } + } + return { + "head": { + "link": [], + "vars": ["subject", "displayId", "version", "name", "description", "type", "percentMatch", "strandAlignment", "CIGAR"] + }, + "results": {"distinct": False, "ordered": True, "bindings": bindings} + } -def create_binding(subject, displayId, version, name, description, _type, role, sbol_type, order_by, percentMatch=-1, - strandAlignment='N/A', CIGAR='N/A'): +def create_binding(subject: str, displayId: Optional[str], version: Optional[int], name: Optional[str], description: Optional[str], + _type: Optional[str], role: Optional[str], sbol_type: Optional[str], order_by: Optional[float], + percentMatch: float = -1, strandAlignment: str = 'N/A', CIGAR: str = 'N/A') -> Dict: """ - Creates bindings to be sent to SBH - - Arguments: - subject {string} -- URI of part - displayId {string} -- DisplayId of part - version {int} -- Version of part - name {string} -- Name of part - description {string} -- Description of part - _type {string} -- SBOL type of part - role {string} -- S.O. role of part - order_by {?} -- ? - - Keyword Arguments: - percentMatch {number} -- Percent match of query part to the target part (default: {-1}) - strandAlignment {str} -- Strand alignment of the query part relatve to the target part (default: {'N/A'}) - CIGAR {str} -- Alignment of query part relative to the target part (default: {'N/A'}) - - Returns: - Dict -- Part and its information + Creates bindings to be sent to SBH. """ binding = {} - - if subject is not None: - binding["subject"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": subject - } - - if displayId is not None: - binding["displayId"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": displayId - } - - if version is not None: - binding["version"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": version - } - - if name is not None: - binding["name"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": name - } - - if description is not None: - binding["description"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": description - } - - if _type is not None: - binding["type"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": _type - } - - if role is not None: - binding["role"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": role - } - - if sbol_type is not None: - binding["sboltype"] = { - "type": "uri", - "datatype": "http://www.w3.org/2001/XMLSchema#uri", - "value": sbol_type - } - - if order_by is not None: - binding["order_by"] = order_by - - if percentMatch != -1: - binding["percentMatch"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": str(percentMatch) - } - - if strandAlignment != 'N/A': - binding["strandAlignment"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": strandAlignment - } - - if CIGAR != 'N/A': - binding["CIGAR"] = { - "type": "literal", - "datatype": "http://www.w3.org/2001/XMLSchema#string", - "value": CIGAR - } - + attributes = { + "subject": subject, + "displayId": displayId, + "version": str(version) if version is not None else None, + "name": name, + "description": description, + "type": _type, + "role": role, + "sboltype": sbol_type, + "order_by": order_by, + "percentMatch": str(percentMatch) if percentMatch != -1 else None, + "strandAlignment": strandAlignment if strandAlignment != 'N/A' else None, + "CIGAR": CIGAR if CIGAR != 'N/A' else None + } + for key, value in attributes.items(): + if value is not None: + datatype = "http://www.w3.org/2001/XMLSchema#uri" if key in ["subject", "type", "role", "sboltype"] else "http://www.w3.org/2001/XMLSchema#string" + ltype = "uri" if key in ["subject", "type", "role", "sboltype"] else "literal" + binding[key] = {"type": ltype, "value": str(value), "datatype": datatype} if not key=="order_by" else order_by return binding - def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None): """ Creates the mass binding consisting of all parts in the search @@ -424,40 +287,43 @@ def create_bindings(es_response, clusters, allowed_graphs, allowed_subjects=None bindings = [] cluster_duplicates = set() + allowed_subjects_set = set(allowed_subjects) if allowed_subjects else None + for hit in es_response['hits']['hits']: _source = hit['_source'] _score = hit['_score'] subject = _source['subject'] - if allowed_subjects is not None and subject not in allowed_subjects: + if allowed_subjects_set and subject not in allowed_subjects_set: continue - if _source.get('graph') not in allowed_graphs: + graph = _source.get('graph') + if graph not in allowed_graphs: continue if subject in cluster_duplicates: - _score = _score / 2.0 + _score /= 2.0 elif subject in clusters: cluster_duplicates.update(clusters[subject]) - if _source.get('type') is not None and 'http://sbols.org/v2#Sequence' in _source.get('type'): - _score = _score / 10.0 - - binding = create_binding(subject, - _source.get('displayId'), - _source.get('version'), - _source.get('name'), - _source.get('description'), - _source.get('type'), - _source.get('role'), - _source.get('sboltype'), - _score - ) + if 'http://sbols.org/v2#Sequence' in _source.get('type', ''): + _score /= 10.0 + + binding = create_binding( + subject, + _source.get('displayId'), + _source.get('version'), + _source.get('name'), + _source.get('description'), + _source.get('type'), + _source.get('role'), + _source.get('sboltype'), + _score + ) bindings.append(binding) return bindings - def create_criteria_bindings(criteria_response, uri2rank, sequence_search=False, ucTableName=''): """ Creates binding for all non-string or non-empty searches @@ -474,47 +340,45 @@ def create_criteria_bindings(criteria_response, uri2rank, sequence_search=False, Dict -- Binding of parts """ bindings = [] - parts = (p for p in criteria_response if p.get('role') is None or 'http://wiki.synbiohub.org' in p.get('role')) - for part in parts: + for part in criteria_response: subject = part.get('subject') + pagerank = uri2rank.get(subject, 1) - if subject not in uri2rank: - pagerank = 1 - else: - pagerank = uri2rank[subject] - - if part.get('type') is not None and 'http://sbols.org/v2#Sequence' in part.get('type'): - pagerank = pagerank / 10.0 + if 'http://sbols.org/v2#Sequence' in part.get('type', ''): + pagerank /= 10.0 if sequence_search: - pagerank = pagerank * (float(get_percent_match(part.get('subject'), ucTableName)) / 100) - binding = create_binding(part.get('subject'), - part.get('displayId'), - part.get('version'), - part.get('name'), - part.get('description'), - part.get('type'), - part.get('role'), - part.get('sboltype'), - pagerank, - get_percent_match(part.get('subject'), ucTableName), - get_strand_alignment(part.get('subject'), ucTableName), - get_cigar_data(part.get('subject'), ucTableName)) - + percent_match = float(get_percent_match(subject, ucTableName)) / 100 + binding = create_binding( + subject, + part.get('displayId'), + part.get('version'), + part.get('name'), + part.get('description'), + part.get('type'), + part.get('role'), + part.get('sboltype'), + pagerank * percent_match, + percent_match, + get_strand_alignment(subject, ucTableName), + get_cigar_data(subject, ucTableName) + ) else: - binding = create_binding(part.get('subject'), - part.get('displayId'), - part.get('version'), - part.get('name'), - part.get('description'), - part.get('type'), - part.get('role'), - part.get('sboltype'), - pagerank) + binding = create_binding( + subject, + part.get('displayId'), + part.get('version'), + part.get('name'), + part.get('description'), + part.get('type'), + part.get('role'), + part.get('sboltype'), + pagerank + ) bindings.append(binding) - return bindings + return bindings def get_allowed_subjects(criteria_response): """ @@ -525,15 +389,10 @@ def get_allowed_subjects(criteria_response): Returns: Parts the user is allowed to see """ - subjects = set() - - for part in criteria_response: - subjects.add(part['subject']) - - return subjects - + return {part['subject'] for part in criteria_response} def create_similar_criteria(criteria, clusters): + """ Adds filter to query to be sent to Virtuoso Args: @@ -548,7 +407,8 @@ def create_similar_criteria(criteria, clusters): if subject not in clusters or not clusters[subject]: return 'FILTER (?subject != ?subject)' - return 'FILTER (' + ' || '.join(['?subject = <' + duplicate + '>' for duplicate in clusters[subject]]) + ')' + filters = ' || '.join(f'?subject = <{duplicate}>' for duplicate in clusters[subject]) + return f'FILTER ({filters})' def create_sequence_criteria(criteria, uris): @@ -561,10 +421,10 @@ def create_sequence_criteria(criteria, uris): Returns: String containing a SPARQL filter """ - if len(uris) == 0: + if not uris: return '' - - return 'FILTER (' + ' || '.join(['?subject = <' + uri + '>' for uri in uris]) + ')' + filters = ' || '.join(f'?subject = <{uri}>' for uri in uris) + return f'FILTER ({filters})' def parse_allowed_graphs(allowed_graphs): @@ -576,12 +436,7 @@ def parse_allowed_graphs(allowed_graphs): Returns: List of allowed graphs """ - result = '' - for allowed_graph in allowed_graphs: - if allowed_graph is not None: - result += 'FROM <' + allowed_graph + '> ' - return result - + return ' '.join(f'FROM <{graph}>' for graph in allowed_graphs if graph) def search(sparql_query, uri2rank, clusters, default_graph_uri): """ @@ -596,7 +451,7 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): """ es_query, _from, criteria, offset, limit, sequence, flags = extract_query(sparql_query) - + if criteria.strip() == 'FILTER ()': criteria = '' @@ -610,17 +465,16 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): allowed_uris = filter_sequence_search_subjects(_from, results) criteria_response = query.query_parts(_from) # Filter searches by URI to hide private parts here instead of on Virtuoso - criteria_response_filtered = [c for c in criteria_response if any(f for f in allowed_uris if f in c.get('subject'))] + criteria_response_filtered = [c for c in criteria_response if any(f in c.get('subject', '') for f in allowed_uris)] bindings = create_criteria_bindings(criteria_response_filtered, uri2rank, True, filename[:-4] + '.uc') - elif len(sequence.strip()) > 0: + elif sequence.strip(): # send sequence search to search.py temp_filename = sequencesearch.write_to_temp(sequence) results = sequencesearch.sequence_search(flags, temp_filename) - allowed_uris = filter_sequence_search_subjects(_from, results) criteria_response = query.query_parts(_from) - criteria_response_filtered = [c for c in criteria_response if any(f for f in allowed_uris if f in c.get('subject'))] + criteria_response_filtered = [c for c in criteria_response if any(f in c.get('subject', '') for f in allowed_uris)] bindings = create_criteria_bindings(criteria_response_filtered, uri2rank, True, temp_filename[:-4] + '.uc') elif 'SIMILAR' in criteria: @@ -634,37 +488,40 @@ def search(sparql_query, uri2rank, clusters, default_graph_uri): criteria_response = query.query_parts(_from, criteria) bindings = create_criteria_bindings(criteria_response, uri2rank) - elif es_query == '' and filterless_criteria == '': + elif es_query == '' and not filterless_criteria: # empty search es_response = empty_search_es(offset, limit, allowed_graphs) bindings = create_bindings(es_response, clusters, allowed_graphs) - bindings.sort(key=lambda binding: binding['order_by'], reverse=True) + bindings.sort(key=lambda b: b['order_by'], reverse=True) return create_response(es_response['hits']['total'], bindings, is_count_query(sparql_query)) else: - - if filterless_criteria == '': + if not filterless_criteria: es_response = search_es(es_query) # pure string search bindings = create_bindings(es_response, clusters, allowed_graphs) - else: # advanced search and string search criteria_response = query.query_parts(_from, filterless_criteria) allowed_subjects = get_allowed_subjects(criteria_response) - if es_query == '': - es_allowed_subject = search_es_allowed_subjects_empty_string(allowed_subjects) - else: - es_allowed_subject = search_es_allowed_subjects(es_query, allowed_subjects) + es_allowed_subject = (search_es_allowed_subjects_empty_string(allowed_subjects) + if es_query == '' + else search_es_allowed_subjects(es_query, allowed_subjects)) bindings = create_bindings(es_allowed_subject, clusters, allowed_graphs, allowed_subjects) utils.log('Advanced string search complete.') - bindings.sort(key=lambda binding: binding['order_by'], reverse=True) - + bindings.sort(key=lambda b: b['order_by'], reverse=True) return create_response(len(bindings), bindings[offset:offset + limit], is_count_query(sparql_query)) +def get_info_from_uc_table(uri, ucTableName, column_index): + with open(ucTableName, 'r') as file: + for line in file: + parts = line.split() + if parts[9] == uri: + return parts[column_index] + return 'N/A' def get_percent_match(uri, ucTableName): """ @@ -676,16 +533,7 @@ def get_percent_match(uri, ucTableName): Returns: Percent match if available, else -1 """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[3] - - return -1 + return get_info_from_uc_table(uri, ucTableName, 3) def get_strand_alignment(uri, ucTableName): @@ -698,38 +546,10 @@ def get_strand_alignment(uri, ucTableName): Returns: + or - """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[4] - - return 'N/A' - + return get_info_from_uc_table(uri, ucTableName, 4) def get_cigar_data(uri, ucTableName): - """ - Gets the CIGAR data of a part (see https://genome.sph.umich.edu/wiki/SAM) - Args: - uri: URI of the part - ucTableName: UClust table - - Returns: CIGAR data if found, or N/A - - """ - with open(ucTableName, 'r') as read: - uc_reader = read.read() - lines = uc_reader.splitlines() - - for line in lines: - line = line.split() - if line[9] == uri: - return line[7] - - return 'N/A' + return get_info_from_uc_table(uri, ucTableName, 7) def filter_sequence_search_subjects(_from, uris): """ @@ -741,9 +561,7 @@ def filter_sequence_search_subjects(_from, uris): _from {list} -- List of allowed graphs uris {list} -- List of URI's from sequence search """ - from_uris = [] - result = re.findall(r"\<([A-Za-z0-9:\/.]+)\>*", _from) - for r in result: - from_uris.append(r) - - return [uri for uri in uris if any(f for f in from_uris if f in uri)] + from_uris = set(re.findall(r"\<([A-Za-z0-9:\/.]+)\>*", _from)) + return [uri for uri in uris if any(f in uri for f in from_uris)] + +