From e7a2c7d7a548af9f86e49863169ab38b456b9418 Mon Sep 17 00:00:00 2001
From: David Michaels 
Date: Wed, 20 Nov 2024 10:22:23 -0500
Subject: [PATCH 01/78] Adding /browse view

---
 CHANGELOG.rst                                 |   6 +
 pyproject.toml                                |   2 +-
 src/encoded/browse.py                         |  54 ++++
 .../static/components/browse/BrowseView.js    | 242 ++++++++++++++++++
 src/encoded/static/components/index.js        |   5 +
 5 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 src/encoded/browse.py
 create mode 100644 src/encoded/static/components/browse/BrowseView.js

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 94441a01d..4f8150a20 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,12 @@ smaht-portal
 Change Log
 ----------
 
+0.113.0
+=======
+* 2024-11-20/dmichaels
+* Added module browse.py for /browse; adapted from fourfront/.../search.py/browse.
+
+
 0.112.3
 =======
 * 2024-11-08/dmichaels
diff --git a/pyproject.toml b/pyproject.toml
index 40c210c40..cacaa4d34 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.112.3"
+version = "0.112.4"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
diff --git a/src/encoded/browse.py b/src/encoded/browse.py
new file mode 100644
index 000000000..d5bd028ea
--- /dev/null
+++ b/src/encoded/browse.py
@@ -0,0 +1,54 @@
+import structlog
+from pyramid.view import view_config
+from webob.multidict import MultiDict
+from pyramid.httpexceptions import HTTPFound
+from urllib.parse import urlencode
+from snovault.search.search import search
+from snovault.util import debug_log
+
+log = structlog.getLogger(__name__)
+
+# 2024-11-19/dmichaels: Adapted from fourfront for C4-1184.
+
+def includeme(config):
+    config.add_route('browse', '/browse{slash:/?}')
+    config.scan(__name__)
+
+
+# DEFAULT_BROWSE_TYPE = "FileSet"
+DEFAULT_BROWSE_TYPE = "OutputFile"
+DEFAULT_BROWSE_PARAM_LISTS = {
+    "type": [DEFAULT_BROWSE_TYPE],
+    "additional_facet": ["file_size"]
+}
+
+@view_config(route_name='browse', request_method='GET', permission='search')
+@debug_log
+def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=False):
+    """
+    Simply use search results for the browse view.
+    Redirect to the proper URL with the default params if needed.
+    """
+    orig_params = request.params
+    for k, vals in DEFAULT_BROWSE_PARAM_LISTS.items():
+        if k not in orig_params or orig_params[k] not in vals:
+            # Redirect to DEFAULT_BROWSE_PARAM_LISTS URL
+            next_qs = MultiDict()
+            for k2, v2list in DEFAULT_BROWSE_PARAM_LISTS.items():
+                for v2 in v2list:
+                    next_qs.add(k2, v2)
+            # Preserve other keys that aren't in DEFAULT_BROWSE_PARAM_LISTS
+            for k2, v2 in orig_params.items():
+                if k2 not in DEFAULT_BROWSE_PARAM_LISTS:
+                    next_qs.add(k2, v2)
+            # next_qs.add("redirected_from", str(request.path_qs))
+            return HTTPFound(
+                location=str(request.path) + '?' + urlencode(next_qs),
+                detail="Redirected from " + str(request.path_info)
+            )
+
+    # TODO
+    # Returning forced_type="Search" for now as there is not
+    # yet any "Browse" UI for /browse; only "Search" for /search.
+    # return search(context, request, search_type, return_generator, forced_type="Search")
+    return search(context, request, search_type, return_generator, forced_type="Browse")
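
A sketch of the resulting routing behavior, assuming the defaults above (hypothetical requests):

    # GET /browse?foo=bar
    #   -> 302 to /browse?type=OutputFile&additional_facet=file_size&foo=bar
    #      (keys not in DEFAULT_BROWSE_PARAM_LISTS are preserved)
    # GET /browse?additional_facet=assay
    #   -> 302 to /browse?type=OutputFile&additional_facet=file_size
    #      (additional_facet is a defaulted key, so the non-default value is dropped)
    # GET /browse?type=OutputFile&additional_facet=file_size
    #   -> no redirect; falls through to the search call above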
diff --git a/src/encoded/static/components/browse/BrowseView.js b/src/encoded/static/components/browse/BrowseView.js
new file mode 100644
index 000000000..08ce8ae84
--- /dev/null
+++ b/src/encoded/static/components/browse/BrowseView.js
@@ -0,0 +1,242 @@
+'use strict';
+
+import React from 'react';
+import memoize from 'memoize-one';
+import _ from 'underscore';
+import url from 'url';
+
+import {
+    memoizedUrlParse,
+    schemaTransforms,
+    analytics,
+} from '@hms-dbmi-bgm/shared-portal-components/es/components/util';
+import { SearchView as CommonSearchView } from '@hms-dbmi-bgm/shared-portal-components/es/components/browse/SearchView';
+import { DetailPaneStateCache } from '@hms-dbmi-bgm/shared-portal-components/es/components/browse/components/DetailPaneStateCache';
+import { columnExtensionMap } from './columnExtensionMap';
+import { Schemas } from './../util';
+import {
+    TitleAndSubtitleBeside,
+    PageTitleContainer,
+    TitleAndSubtitleUnder,
+    pageTitleViews,
+    EditingItemPageTitle,
+} from './../PageTitleSection';
+
+export default function BrowseView(props) {
+    const {
+        context: { '@type': searchPageType = ['ItemSearchResults'] },
+    } = props;
+    const isCaseSearch = searchPageType[0] === 'CaseSearchResults';
+
+    if (isCaseSearch) {
+        return (
+            <DetailPaneStateCache>
+                <BrowseViewBody {...props} {...{ isCaseSearch }} />
+            </DetailPaneStateCache>
+        );
+    }
+
+    return <BrowseViewBody {...props} />;
+}
+
+export class BrowseViewBody extends React.PureComponent {
+    /**
+     * Function which is passed into a `.filter()` call to
+     * filter context.facets down, usually in response to frontend-state.
+     *
+     * Currently is meant to filter out the type facet if we're in selection mode,
+     * as well as some fields from embedded 'experiment_set' which might
+     * give unexpected results.
+     *
+     * @todo Potentially get rid of this and do on backend.
+     *
+     * @param {{ field: string }} facet - Object representing a facet.
+     * @returns {boolean} Whether to keep or discard facet.
+     */
+    static filterFacet(facet, currentAction) {
+        // Set in backend or schema for facets which are under development or similar.
+        if (facet.hide_from_view) return false;
+
+        // Remove the @type facet while in selection mode.
+        if (facet.field === 'type' && currentAction === 'selection')
+            return false;
+
+        return true;
+    }
+
+    /** Filter the `@type` facet options down to abstract types only (if none selected) for Search. */
+    static transformedFacets(context, currentAction, schemas) {
+        // Clone/filter list of facets.
+        // We may filter out type facet completely at this step,
+        // in which case we can return out of func early.
+        const facets = context.facets.filter(function (facet) {
+            return BrowseViewBody.filterFacet(facet, currentAction);
+        });
+
+        // Find facet for '@type'
+        const searchItemTypes =
+            schemaTransforms.getAllSchemaTypesFromSearchContext(context); // "Item" is excluded
+
+        if (searchItemTypes.length > 0) {
+            console.info(
+                "A (non-'Item') type filter is present. Will skip filtering Item types in Facet."
+            );
+            // Keep all terms/leaf-types - backend should already filter down to only valid sub-types through
+            // nature of search itself.
+
+            if (searchItemTypes.length > 1) {
+                const errMsg =
+                    'More than one "type" filter is selected. This is not intended to occur, at least as a consequence of interacting with the UI. Perhaps multiple types were entered into the URL.';
+                analytics.exception('CGAP SearchView - ' + errMsg);
+                console.warn(errMsg);
+            }
+
+            return facets;
+        }
+
+        const typeFacetIndex = _.findIndex(facets, { field: 'type' });
+        if (typeFacetIndex === -1) {
+            console.error(
+                'Could not get type facet, though some filter for it is present.'
+            );
+            return facets; // Facet not present, return.
+        }
+
+        // Avoid modifying in place.
+        facets[typeFacetIndex] = _.clone(facets[typeFacetIndex]);
+
+        // Show only base types when searchItemTypes.length === 0 (aka 'type=Item').
+        facets[typeFacetIndex].terms = _.filter(
+            facets[typeFacetIndex].terms,
+            function (itemType) {
+                const parentType = schemaTransforms.getAbstractTypeForType(
+                    itemType.key,
+                    schemas
+                );
+                return !parentType || parentType === itemType.key;
+            }
+        );
+
+        return facets;
+    }
+
+    /** Not currently used. */
+    static filteredFilters(filters) {
+        const typeFilterCount = filters.reduce(function (m, { field }) {
+            if (field === 'type') return m + 1;
+            return m;
+        }, 0);
+        return filters.filter(function ({ field, term }) {
+            if (field === 'type') {
+                if (term === 'Item') {
+                    return false;
+                }
+                if (typeFilterCount === 1) {
+                    return false;
+                }
+            }
+            return true;
+        });
+    }
+
+    constructor(props) {
+        super(props);
+        this.memoized = {
+            transformedFacets: memoize(BrowseViewBody.transformedFacets),
+            filteredFilters: memoize(BrowseViewBody.filteredFilters),
+        };
+    }
+
+    render() {
+        const {
+            isCaseSearch = false,
+            context,
+            currentAction,
+            schemas,
+        } = this.props;
+
+        // We don't need full screen btn on CGAP as already full width.
+        const passProps = _.omit(
+            this.props,
+            'isFullscreen',
+            'toggleFullScreen',
+            'isCaseSearch'
+        );
+
+        // const filters = BrowseViewBody.filteredFilters(context.filters || []);
+        const facets = this.memoized.transformedFacets(
+            context,
+            currentAction,
+            schemas
+        );
+        const tableColumnClassName = 'results-column col';
+        const facetColumnClassName = 'facets-column col-auto';
+
+        return (
+            <div className="search-page-container">
+                <CommonSearchView
+                    {...passProps}
+                    {...{
+                        columnExtensionMap,
+                        facets,
+                        tableColumnClassName,
+                        facetColumnClassName,
+                    }}
+                    termTransformFxn={Schemas.Term.toName}
+                />
+                HELLO: THIS IS BROWSE-VIEW!
+            </div>
+        );
+    }
+}
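+
+// Page title for the Browse view. For now this intentionally mirrors the Search
+// page title (hence the literal "Search" title below), until a dedicated /browse
+// UI and title exist.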
+const BrowseViewPageTitle = React.memo(function BrowseViewPageTitle(props) {
+    const { context, schemas, currentAction, alerts } = props;
+
+    if (currentAction === 'add') {
+        // Fallback unless any custom PageTitles registered for @type=SearchResults & currentAction=add
+        return (
+            <EditingItemPageTitle
+                {...{ context, schemas, currentAction, alerts }}
+            />
+        );
+    }
+
+    if (currentAction === 'selection' || currentAction === 'multiselect') {
+        return (
+            <PageTitleContainer alerts={alerts}>
+                <TitleAndSubtitleUnder>
+                    Selecting
+                </TitleAndSubtitleUnder>
+            </PageTitleContainer>
+        );
+    }
+
+    const thisTypeTitle = schemaTransforms.getSchemaTypeFromSearchContext(
+        context,
+        schemas
+    );
+    const subtitle = thisTypeTitle ? (
+        <span>
+            for {thisTypeTitle}
+        </span>
+    ) : null;
+
+    return (
+        <PageTitleContainer alerts={alerts}>
+            <TitleAndSubtitleBeside subtitle={subtitle}>
+                Search
+            </TitleAndSubtitleBeside>
+        </PageTitleContainer>
+    );
+});
+
+pageTitleViews.register(BrowseViewPageTitle, 'Browse');
+pageTitleViews.register(BrowseViewPageTitle, 'Browse', 'selection');
+pageTitleViews.register(BrowseViewPageTitle, 'Browse', 'add');
diff --git a/src/encoded/static/components/index.js b/src/encoded/static/components/index.js
index 1227a21dd..e0d041ac9 100644
--- a/src/encoded/static/components/index.js
+++ b/src/encoded/static/components/index.js
@@ -27,6 +27,7 @@ import DocumentView from './item-pages/DocumentView';
 import StaticSectionView from './item-pages/StaticSectionView';
 import SMaHTSubmissionView from './forms/SMaHTSubmissionView';
 import SearchView from './browse/SearchView';
+import BrowseView from './browse/BrowseView';
 import FileView from './item-pages/FileView';
 
 /**
@@ -59,6 +60,10 @@ content_views.register(SearchView, 'Search');
 content_views.register(SearchView, 'Search', 'selection');
 content_views.register(SearchView, 'Search', 'multiselect');
 
+content_views.register(BrowseView, 'Browse');
+content_views.register(BrowseView, 'Browse', 'selection');
+content_views.register(BrowseView, 'Browse', 'multiselect');
+
 // Fallback for anything we haven't registered
 content_views.fallback = function () {
     return FallbackView;

From b7b6de6bea478bf7f8b21d52fb29b19b5f649e10 Mon Sep 17 00:00:00 2001
From: David Michaels 
Date: Wed, 20 Nov 2024 10:47:48 -0500
Subject: [PATCH 02/78] Adding /browse view

---
 src/encoded/__init__.py |  1 +
 src/encoded/browse.py   | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/encoded/__init__.py b/src/encoded/__init__.py
index 67a2a4fb6..9fa9f9453 100644
--- a/src/encoded/__init__.py
+++ b/src/encoded/__init__.py
@@ -313,6 +313,7 @@ def main(global_config, **local_config):
     if 'elasticsearch.server' in config.registry.settings:
         config.include('snovault.elasticsearch')
         config.include('snovault.search.search')
+        config.include('encoded.browse')
         config.include('snovault.search.compound_search')
 
     # this contains fall back url, so make sure it comes just before static_resoruces
diff --git a/src/encoded/browse.py b/src/encoded/browse.py
index d5bd028ea..b312d0ba7 100644
--- a/src/encoded/browse.py
+++ b/src/encoded/browse.py
@@ -16,10 +16,15 @@ def includeme(config):
 
 
 # DEFAULT_BROWSE_TYPE = "FileSet"
-DEFAULT_BROWSE_TYPE = "OutputFile"
+# DEFAULT_BROWSE_TYPE = "UnalignedReads"
+# DEFAULT_BROWSE_TYPE = "OutputFile"
+
+DEFAULT_BROWSE_TYPE = "File"
+DEFAULT_BROWSE_FACETS = ["file_size"]
+
 DEFAULT_BROWSE_PARAM_LISTS = {
     "type": [DEFAULT_BROWSE_TYPE],
-    "additional_facet": ["file_size"]
+    "additional_facet": DEFAULT_BROWSE_FACETS
 }
 
 @view_config(route_name='browse', request_method='GET', permission='search')
@@ -48,7 +53,5 @@ def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=F
     )
 
     # TODO
-    # Returning forced_type="Search" for now as there is not
-    # yet any "Browse" UI for /browse; only "Search" for /search.
-    # return search(context, request, search_type, return_generator, forced_type="Search")
+    # No real /browse-specific UI yet; initially just copied static/components/SearchView.js to BrowseView.js.
     return search(context, request, search_type, return_generator, forced_type="Browse")

From d39c93cb59d9622b96db4ad5cf66d72c25998f1a Mon Sep 17 00:00:00 2001
From: David Michaels 
Date: Wed, 20 Nov 2024 10:53:11 -0500
Subject: [PATCH 03/78] comments in CHANGELOG.rst

---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 4f8150a20..ad981bf1b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -11,6 +11,7 @@ Change Log
 =======
 * 2024-11-20/dmichaels
 * Added module browse.py for /browse; adapted from fourfront/.../search.py/browse.
+  This is for ticket: https://hms-dbmi.atlassian.net/browse/C4-1184
 
 
 0.112.3

From a39073be5cce0287458e1c0d0b61c2cc80abb1e1 Mon Sep 17 00:00:00 2001
From: David Michaels 
Date: Wed, 20 Nov 2024 10:57:13 -0500
Subject: [PATCH 04/78] update version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index cacaa4d34..57240bdaf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.112.4"
+version = "0.113.0"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team "]
 license = "MIT"

From 566ff2a0fa6f5a3d799eee2d5900d154a41c8d3f Mon Sep 17 00:00:00 2001
From: David Michaels 
Date: Fri, 6 Dec 2024 14:03:31 -0500
Subject: [PATCH 05/78] changes for new /recent_files_summary endpoint for C4-1192

---
 CHANGELOG.rst                       |  22 ++-
 pyproject.toml                      |   2 +-
 src/encoded/browse.py               |  14 +-
 src/encoded/elasticsearch_utils.py  | 212 ++++++++++++++++++++++++++++
 src/encoded/endpoint_utils.py       | 178 +++++++++++++++++++++++
 src/encoded/item_utils/file.py      |  23 ++-
 src/encoded/recent_files_summary.py | 210 +++++++++++++++++++++++++++
 src/encoded/types/file.py           |  54 +++++++
 8 files changed, 698 insertions(+), 17 deletions(-)
 create mode 100644 src/encoded/elasticsearch_utils.py
 create mode 100644 src/encoded/endpoint_utils.py
 create mode 100644 src/encoded/recent_files_summary.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 71e4becac..a8649660f 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,12 +7,30 @@ smaht-portal
 Change Log
 ----------
 
-0.115.0
+0.117.0
 =======
-* 2024-11-20/dmichaels
+* 2024-11-20/dmichaels - branch: dmichaels-20241119-browse-view (PR-295)
+
 * Added module browse.py for /browse; adapted from fourfront/.../search.py/browse.
   This is for ticket: https://hms-dbmi.atlassian.net/browse/C4-1184
 
+* New endpoint /recent_files_summary which, by default, returns info for files released
+  within the past three months grouped by release-date, cell-line or donor, and
+  file-description. The specific fields used for these groupings are:
+  - release-date: file_status_tracking.released
+  - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code
+  - donor: donors.display_title
+  - file-description: release_tracker_description
+  Note that release_tracker_description is a newer (2024-12) calcprop (PR-298/sn_file_release_tracker);
+  and included in this branch are these files from the branch sn_file_release_tracker:
+  - src/encoded/item_utils/file.py
+  - src/encoded/types/file.py
+  Added these new modules to support this new endpoint:
+  - src/encoded/recent_files_summary.py
+  - src/encoded/elasticsearch_utils.py (maybe move to dcicutils eventually)
+  - src/encoded/endpoint_utils.py (maybe move to dcicutils eventually)
+  This is for ticket: https://hms-dbmi.atlassian.net/browse/C4-1192
+
 
 0.114.0
 =======
diff --git a/pyproject.toml b/pyproject.toml
index 5b006392c..4907ddc16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.115.0"
+version = "0.117.0"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
diff --git a/src/encoded/browse.py b/src/encoded/browse.py
index b312d0ba7..3ad80270d 100644
--- a/src/encoded/browse.py
+++ b/src/encoded/browse.py
@@ -1,10 +1,12 @@
-import structlog
+from pyramid.httpexceptions import HTTPBadRequest, HTTPFound
+from pyramid.security import Authenticated
 from pyramid.view import view_config
+import structlog
 from webob.multidict import MultiDict
-from pyramid.httpexceptions import HTTPFound
 from urllib.parse import urlencode
 from snovault.search.search import search
 from snovault.util import debug_log
+from encoded.recent_files_summary import recent_files_summary
 
 log = structlog.getLogger(__name__)
 
@@ -12,6 +14,7 @@
 
 def includeme(config):
     config.add_route('browse', '/browse{slash:/?}')
+    config.add_route("recent_files_summary_endpoint", "/recent_files_summary")
     config.scan(__name__)
 
 
@@ -55,3 +58,10 @@ def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=F
 
     # TODO
     # No real /browse-specific UI yet; initially just copied static/components/SearchView.js to BrowseView.js.
return search(context, request, search_type, return_generator, forced_type="Browse") + + +@view_config(route_name="recent_files_summary_endpoint", request_method=["GET"], effective_principals=Authenticated) +@debug_log +def recent_files_summary_endpoint(context, request): + results = recent_files_summary(request) + return results diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py new file mode 100644 index 000000000..c45258d5f --- /dev/null +++ b/src/encoded/elasticsearch_utils.py @@ -0,0 +1,212 @@ +from copy import deepcopy +from typing import Any, Callable, List, Optional, Tuple + + +def create_elasticsearch_aggregation_query(fields: List[str], + aggregation_property_name: Optional[str] = None, + max_buckets: Optional[int] = None, + missing_value: Optional[str] = None, + create_field_aggregation: Optional[Callable] = None) -> dict: + + global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE + + if not (isinstance(fields, list) and fields and isinstance(field := fields[0], str) and field): + return {} + if not isinstance(missing_value, str): + missing_value = AGGREGATION_NO_VALUE + if not (isinstance(max_buckets, int) and (max_buckets > 0)): + max_buckets = AGGREGATION_MAX_BUCKETS + + if not (callable(create_field_aggregation) and + isinstance(field_aggregation := create_field_aggregation(field), dict)): + field_aggregation = { + "terms": { + "field": f"embedded.{field}.raw", + "missing": missing_value, + "size": max_buckets + } + } + + if not (isinstance(aggregation_property_name, str) and aggregation_property_name): + aggregation_property_name = field + aggregation = {aggregation_property_name: field_aggregation} + aggregation[aggregation_property_name]["meta"] = {"field_name": field} + + if nested_aggregation := create_elasticsearch_aggregation_query( + fields[1:], max_buckets=max_buckets, + missing_value=missing_value, + create_field_aggregation=create_field_aggregation): + aggregation[aggregation_property_name]["aggs"] = nested_aggregation + + return aggregation + + +def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bool = False) -> Optional[dict]: + + def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: + if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list): + if isinstance(field_name := aggregation.get("meta", {}).get("field_name"), str) and field_name: + if isinstance(aggregation_key, str) and aggregation_key: + if field_name != aggregation_key: + return None + return field_name + return None + + def get_nested_aggregation(aggregation: dict) -> Optional[dict]: + if isinstance(aggregation, dict): + for key in aggregation: + if get_aggregation_key(aggregation[key], key): + return aggregation[key] + return None + + def get_aggregation_bucket_value(aggregation_bucket: dict) -> Optional[Any]: + if isinstance(aggregation_bucket, dict): + return aggregation_bucket.get("key_as_string", aggregation_bucket.get("key")) + return None + + def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: + if isinstance(aggregation_bucket, dict): + if isinstance(doc_count := aggregation_bucket.get("doc_count"), int): + return doc_count + return None + + def get_aggregation_buckets_doc_count(aggregation: dict): + buckets_doc_count = 0 + if get_aggregation_key(aggregation): + for aggregation_bucket in aggregation["buckets"]: + if (doc_count := get_aggregation_bucket_doc_count(aggregation_bucket)) is not None: + buckets_doc_count += doc_count + 
return buckets_doc_count
+
+    def find_aggregation_bucket(aggregation: dict, value: str) -> Optional[dict]:
+        if get_aggregation_key(aggregation):
+            for aggregation_bucket in aggregation["buckets"]:
+                if get_aggregation_bucket_value(aggregation_bucket) == value:
+                    return aggregation_bucket
+        return None
+
+    def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[int]]:
+        merged_item_count = 0
+        if not ((aggregation_key := get_aggregation_key(source)) and (get_aggregation_key(target) == aggregation_key)):
+            return None, None
+        for source_bucket in source["buckets"]:
+            if (((source_bucket_value := get_aggregation_bucket_value(source_bucket)) is None) or
+                ((source_bucket_item_count := get_aggregation_bucket_doc_count(source_bucket)) is None)):  # noqa
+                continue
+            if (target_bucket := find_aggregation_bucket(target, source_bucket_value)):
+                if source_nested_aggregation := get_nested_aggregation(source_bucket):
+                    if target_nested_aggregation := get_nested_aggregation(target_bucket):
+                        merged_item_count, _ = merge_results(target_nested_aggregation, source_nested_aggregation)
+                        if merged_item_count is None:
+                            if source_nested_aggregation_key := get_aggregation_key(source_nested_aggregation):
+                                target_bucket[source_nested_aggregation_key] = \
+                                    source_bucket[source_nested_aggregation_key]
+                                target_bucket["doc_count"] += \
+                                    get_aggregation_buckets_doc_count(source_bucket[source_nested_aggregation_key])
+                        elif merged_item_count > 0:
+                            target_bucket["doc_count"] += merged_item_count
+                elif get_aggregation_bucket_value(target_bucket) is not None:
+                    if get_aggregation_bucket_doc_count(target_bucket) is not None:
+                        target_bucket["doc_count"] += source_bucket_item_count
+                        merged_item_count += source_bucket_item_count
+            continue
+        return merged_item_count, target
+
+    if copy is True:
+        target = deepcopy(target)
+    return merge_results(target, source)[1]
+
+
+def normalize_elasticsearch_aggregation_results(aggregation: dict,
+                                                sort: bool = False,
+                                                additional_properties: Optional[dict] = None,
+                                                remove_empty_items: bool = True) -> dict:
+
+    def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]:
+        # TODO: same as in merge_elasticsearch_aggregation_results function
+        if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list):
+            if isinstance(field_name := aggregation.get("meta", {}).get("field_name"), str) and field_name:
+                if isinstance(aggregation_key, str) and aggregation_key:
+                    if field_name != aggregation_key:
+                        return None
+                return field_name
+        return None
+
+    def get_aggregation_bucket_value(aggregation_bucket: dict) -> Optional[Any]:
+        # TODO: same as in merge_elasticsearch_aggregation_results function
+        if isinstance(aggregation_bucket, dict):
+            return aggregation_bucket.get("key_as_string", aggregation_bucket.get("key"))
+        return None
+
+    def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]:
+        # TODO: same as in merge_elasticsearch_aggregation_results function
+        if isinstance(aggregation_bucket, dict):
+            if isinstance(doc_count := aggregation_bucket.get("doc_count"), int):
+                return doc_count
+        return None
+
+    def get_nested_aggregations(data: dict) -> List[dict]:
+        results = []
+        if isinstance(data, dict):
+            for key in data:
+                if get_aggregation_key(data[key]):
+                    results.append(data[key])
+            if (not results) and data.get("buckets", list):
+                results.append(data)
+        return results
+
+    def find_group_item(group_items: List[dict], value: Any) -> Optional[dict]:
+        if isinstance(group_items, list):
+            for group_item in group_items:
+                if isinstance(group_item, dict) and (value == group_item.get("value")):
+                    return group_item
+        return None
+
+    def normalize_results(aggregation: dict,
+                          key: Optional[str] = None, value: Optional[str] = None,
+                          additional_properties: Optional[dict] = None) -> dict:
+        nonlocal remove_empty_items
+        if not (aggregation_key := get_aggregation_key(aggregation)):
+            return {}
+        group_items = [] ; item_count = 0  # noqa
+        for bucket in aggregation["buckets"]:
+            if (((bucket_value := get_aggregation_bucket_value(bucket)) is None) or
+                ((bucket_item_count := get_aggregation_bucket_doc_count(bucket)) is None)):  # noqa
+                continue
+            item_count += bucket_item_count
+            if nested_aggregations := get_nested_aggregations(bucket):
+                for nested_aggregation in nested_aggregations:
+                    if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value):
+                        if group_item := find_group_item(group_items, bucket_value):
+                            # group_item["items"].extend(normalized_aggregation["items"])
+                            for normalized_aggregation_item in normalized_aggregation["items"]:
+                                group_item["items"].append(normalized_aggregation_item)
+                                group_item["count"] += normalized_aggregation_item["count"]
+                        else:
+                            group_item = normalized_aggregation
+                            group_items.append(group_item)
+            else:
+                if (remove_empty_items is False) or (bucket_item_count > 0):
+                    group_item = {"name": aggregation_key, "value": bucket_value, "count": bucket_item_count}
+                    group_items.append(group_item)
+        if (remove_empty_items is not False) and (not group_items):
+            return {}
+        results = {"name": key, "value": value, "count": item_count, "items": group_items}
+        if isinstance(additional_properties, dict) and additional_properties:
+            results = {**additional_properties, **results}
+        if key is None:
+            del results["name"]
+        if value is None:
+            del results["value"]
+        return results
+
+    def sort_results(data: dict) -> None:
+        if isinstance(data, dict) and isinstance(items := data.get("items"), list):
+            items.sort(key=lambda item: (-item.get("count", 0), item.get("value", "")))
+            for item in items:
+                sort_results(item)
+
+    results = normalize_results(aggregation, additional_properties=additional_properties)
+    if sort is True:
+        sort_results(results)
+    return results
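
A sketch of how these helpers compose (field names as used by recent_files_summary.py below; shapes are illustrative):

    # create_elasticsearch_aggregation_query(
    #     ["donors.display_title", "release_tracker_description"],
    #     max_buckets=100, missing_value="No value")
    # produces, roughly:
    #   {"donors.display_title": {
    #       "terms": {"field": "embedded.donors.display_title.raw",
    #                 "missing": "No value", "size": 100},
    #       "meta": {"field_name": "donors.display_title"},
    #       "aggs": {"release_tracker_description": {
    #           "terms": {"field": "embedded.release_tracker_description.raw",
    #                     "missing": "No value", "size": 100},
    #           "meta": {"field_name": "release_tracker_description"}}}}}
    # merge_elasticsearch_aggregation_results then folds two such result trees
    # together bucket by bucket, and normalize_elasticsearch_aggregation_results
    # flattens the merged tree into nested {"name", "value", "count", "items"} records.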
diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py
new file mode 100644
index 000000000..f03d09328
--- /dev/null
+++ b/src/encoded/endpoint_utils.py
@@ -0,0 +1,178 @@
+import calendar
+from datetime import date, datetime
+from dateutil.relativedelta import relativedelta
+import pyramid
+from typing import Any, List, Optional, Tuple, Union
+from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string
+
+
+def request_arg(request: pyramid.request.Request, name: str, fallback: Optional[str] = None) -> Optional[str]:
+    return str(value).strip() if (value := request.params.get(name, None)) is not None else fallback
+
+
+def request_arg_int(request: pyramid.request.Request, name: str, fallback: Optional[int] = 0) -> Optional[Any]:
+    if (value := request_arg(request, name)) is not None:
+        try:
+            return int(value)
+        except Exception:
+            pass
+    return fallback
+
+
+def request_arg_bool(request: pyramid.request.Request, name: str, fallback: Optional[bool] = False) -> Optional[bool]:
+    return fallback if (value := request_arg(request, name)) is None else (value.lower() == "true")
+
+
+def request_args(request: pyramid.request.Request,
+                 name: str, fallback: Optional[str] = None, duplicates: bool = False) -> List[str]:
+    args = []
+    if isinstance(value := request.params.getall(name), list):
+        # Note that request.params.getall always returns a list,
+        # even if the named query parameter is not specified at all.
+        if value == []:
+            if request.params.get(name) is None:
+                # Only return the fallback if the named query parameter was not specified at all.
+                return fallback
+        for item in value:
+            if isinstance(item, str) and (item := item.strip()):
+                if (item not in args) or (duplicates is True):
+                    args.append(item)
+    return args
+
+
+def parse_date_range_related_arguments(
+        from_date: Optional[Union[str, datetime, date]],
+        thru_date: Optional[Union[str, datetime, date]],
+        nmonths: Optional[Union[str, int]] = None,
+        include_current_month: bool = True,
+        strings: bool = False) -> Tuple[Optional[Union[str, datetime]], Optional[Union[str, datetime]]]:
+
+    """
+    Returns from/thru dates based on the given from/thru date arguments and optional nmonths argument.
+    Given dates may be datetime or date objects or strings. Returned dates are datetime objects, or,
+    if the given strings argument is True, strings (formatted as YYYY-MM-DD).
+
+    If both of the given from/thru dates are specified/valid then those are returned
+    and the given nmonths argument is not used.
+
+    If only the given from date is specified then a None thru date is returned, UNLESS the given nmonths
+    argument represents a positive integer, in which case the returned thru date will be nmonths months
+    subsequent to the given from date; or if the given nmonths represents zero, in which case the
+    returned thru date will be the last date of the month of the given from date.
+
+    If only the given thru date is specified then a None from date is returned, UNLESS the given nmonths
+    argument represents a negative integer, in which case the returned from date will be nmonths months
+    previous to the given thru date; or if the given nmonths represents zero, in which case
+    the returned from date will be the first date of the month of the given thru date.
+
+    If neither of the given from/thru dates is specified then None is returned for both, UNLESS the given
+    nmonths argument represents a non-zero integer, in which case the returned from/thru dates will represent
+    the past (absolute value) nmonths months starting with the month previous to the month of "today"; however,
+    if include_current_month is True, it is rather the past nmonths starting with the month of "today".
+    """
+    from_date = parse_datetime_string(from_date, notz=True)
+    thru_date = parse_datetime_string(thru_date, last_day_of_month_if_no_day=True, notz=True)
+    if not isinstance(nmonths, int):
+        if isinstance(nmonths, str) and (nmonths := nmonths.strip()):
+            try:
+                nmonths = int(nmonths)
+            except Exception:
+                nmonths = 0
+        else:
+            nmonths = 0
+    if from_date:
+        if (not thru_date) and isinstance(nmonths, int):
+            if nmonths > 0:
+                thru_date = _add_months(from_date, nmonths)
+            elif nmonths == 0:
+                thru_date = _get_last_date_of_month(from_date)
+    elif thru_date:
+        if isinstance(nmonths, int):
+            if nmonths < 0:
+                from_date = _add_months(thru_date, nmonths)
+            elif nmonths == 0:
+                from_date = _get_first_date_of_month(thru_date)
+    elif isinstance(nmonths, int) and ((nmonths := abs(nmonths)) != 0):
+        # If no (valid) from/thru dates are given, but the absolute value of nmonths is a non-zero integer,
+        # then return from/thru dates for the last nmonths months, ending with the current month (or with
+        # the month previous to the current month if include_current_month is not True).
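+        # For example, for a hypothetical "today" of 2024-12-06 and nmonths=3:
+        #   include_current_month=True:  from_date=2024-09-30, thru_date=2024-12-31
+        #   include_current_month=False: from_date=2024-08-30, thru_date=2024-11-30
+        # (relativedelta month arithmetic lands on valid month-end days.)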
+        # thru_date = _add_months(_get_last_date_of_month(), -1)
+        thru_date = _get_last_date_of_month()
+        if include_current_month is not True:
+            thru_date = _add_months(thru_date, -1)
+        from_date = _add_months(thru_date, -nmonths)
+    if strings is True:
+        return (from_date.strftime("%Y-%m-%d") if from_date else None,
+                thru_date.strftime("%Y-%m-%d") if thru_date else None)
+    return from_date, thru_date
+
+
+def parse_datetime_string(value: Union[str, datetime, date],
+                          last_day_of_month_if_no_day: bool = False,
+                          notz: bool = False) -> Optional[datetime]:
+    """
+    Wrapper around dcicutils.datetime_utils.parse_datetime_string to handle a few special cases for convenience.
+    """
+    last_day_of_month = False
+    if not isinstance(value, datetime):
+        if isinstance(value, date):
+            value = datetime.combine(value, datetime.min.time())
+        elif isinstance(value, str):
+            if (len(value) == 8) and value.isdigit():
+                # Special case to accept for example "20241206" to mean "2024-12-06".
+                value = f"{value[0:4]}-{value[4:6]}-{value[6:8]}"
+            elif (len(value) == 7) and (value[4] == "-") and value[0:4].isdigit() and value[5:].isdigit():
+                # Special case to accept for example "2024-10" to mean "2024-10-01".
+                value = f"{value}-01"
+                last_day_of_month = last_day_of_month_if_no_day
+            elif (len(value) == 7) and (value[2] == "/") and value[0:2].isdigit() and value[3:].isdigit():
+                # Special case to accept for example "11/2024" to mean "2024-11-01".
+                value = f"{value[3:]}-{value[0:2]}-01"
+                last_day_of_month = last_day_of_month_if_no_day
+            elif (len(value) == 6) and (value[1] == "/") and value[0:1].isdigit() and value[2:].isdigit():
+                # Special case to accept for example "9/2024" to mean "2024-09-01".
+                value = f"{value[2:]}-0{value[0:1]}-01"
+                last_day_of_month = last_day_of_month_if_no_day
+            if not (value := dcicutils_parse_datetime_string(value)):
+                return None
+        else:
+            return None
+    value = value.replace(tzinfo=None) if notz is True else value
+    if last_day_of_month:
+        value = _get_last_date_of_month(value)
+    return value
+
+
+def _get_first_date_of_month(day: Optional[Union[datetime, date, str]] = None) -> datetime:
+    """
+    Returns a datetime object representing the first day of the month of the given date;
+    this given date may be a datetime or date object, or string representing a date or
+    datetime; if the given argument is unspecified or incorrect then assumes "today".
+    """
+    if not (day := parse_datetime_string(day, notz=True)):
+        day = datetime.today().replace(tzinfo=None)
+    return day.replace(day=1)
+
+
+def _get_last_date_of_month(day: Optional[Union[datetime, date, str]] = None) -> datetime:
+    """
+    Returns a datetime object representing the last day of the month of the given date;
+    this given date may be a datetime or date object, or string representing a date or
+    datetime; if the given argument is unspecified or incorrect then assumes "today".
+    """
+    if not (day := parse_datetime_string(day)):
+        day = datetime.today().replace(tzinfo=None)
+    return datetime(day.year, day.month, calendar.monthrange(day.year, day.month)[1])
+
+
+def _add_months(day: Optional[Union[datetime, date, str]] = None, nmonths: int = 0) -> datetime:
+    """
+    Returns a datetime object representing the given date with the given nmonths number of months
+    added (or subtracted if negative) to (or from) that given date; this given date may be a
+    datetime or date object, or a string representing a date or datetime; if the given argument
+    is unspecified or incorrect then assumes "today".
+    """
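+    # Illustrative examples (dateutil.relativedelta clamps to valid month-end days):
+    #   _add_months("2024-12-31", -1) -> datetime(2024, 11, 30)
+    #   _add_months(datetime(2024, 1, 31), 1) -> datetime(2024, 2, 29)  (leap year)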
+ """ + if not (day := parse_datetime_string(day, notz=True)): + day = datetime.today().replace(tzinfo=None) + if isinstance(nmonths, int) and (nmonths != 0): + return day + relativedelta(months=nmonths) + return day diff --git a/src/encoded/item_utils/file.py b/src/encoded/item_utils/file.py index 595bae82f..2305adec9 100644 --- a/src/encoded/item_utils/file.py +++ b/src/encoded/item_utils/file.py @@ -76,6 +76,11 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any return properties.get("reference_genome", "") +def get_gene_annotation(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]: + """Get gene annotation from properties.""" + return properties.get("gene_annotation", "") + + def get_file_sets(properties: Dict[str, Any]) -> List[Union[str, Dict[str, Any]]]: """Get file sets from properties.""" return properties.get("file_sets", []) @@ -412,17 +417,11 @@ def has_mobile_element_insertions(file: Dict[str, Any]) -> bool: return "MEI" in get_data_type(file) -def get_associated_files_status( - file: Dict[str, Any], request_handler: RequestHandler, at_id: str -) -> List[str]: - """Get associated files status from the FileSet.files_status calcprop""" - return get_property_values_from_identifiers( - request_handler, - get_file_sets(file), - partial(file_set.get_associated_files_status, request_handler, at_id) - ) - - def get_override_group_coverage(file: Dict[str, Any]) -> str: """Get override group coverage from properties.""" - return file.get("override_group_coverage","") \ No newline at end of file + return file.get("override_group_coverage","") + + +def get_release_tracker_description(file: Dict[str, Any]) -> str: + """Get release tracker description from properties.""" + return file.get("release_tracker_description","") diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py new file mode 100644 index 000000000..a51319692 --- /dev/null +++ b/src/encoded/recent_files_summary.py @@ -0,0 +1,210 @@ +import pyramid +from typing import List, Optional +from urllib.parse import urlencode +from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query +from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results +from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results +from encoded.endpoint_utils import parse_date_range_related_arguments +from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int +from snovault.search.search import search as snovault_search +from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq + +QUERY_FILE_TYPES = ["OutputFile"] +QUERY_FILE_STATUSES = ["released"] +QUERY_FILE_CATEGORIES = ["!Quality Control"] +QUERY_RECENT_MONTHS = 3 +QUERY_INCLUDE_CURRENT_MONTH = True + +AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released" +AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" +AGGREGATION_FIELD_DONOR = "donors.display_title" +AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description" + +AGGREGATION_MAX_BUCKETS = 100 +AGGREGATION_NO_VALUE = "No value" + + +def recent_files_summary(request: pyramid.request.Request) -> dict: + """ + This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, + by default, info for files released withing the past three months grouped by release-date, + cell-line or donor, and file-description. 
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
new file mode 100644
index 000000000..a51319692
--- /dev/null
+++ b/src/encoded/recent_files_summary.py
@@ -0,0 +1,210 @@
+import pyramid
+from typing import List, Optional
+from urllib.parse import urlencode
+from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query
+from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results
+from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results
+from encoded.endpoint_utils import parse_date_range_related_arguments
+from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int
+from snovault.search.search import search as snovault_search
+from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq
+
+QUERY_FILE_TYPES = ["OutputFile"]
+QUERY_FILE_STATUSES = ["released"]
+QUERY_FILE_CATEGORIES = ["!Quality Control"]
+QUERY_RECENT_MONTHS = 3
+QUERY_INCLUDE_CURRENT_MONTH = True
+
+AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released"
+AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"
+AGGREGATION_FIELD_DONOR = "donors.display_title"
+AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description"
+
+AGGREGATION_MAX_BUCKETS = 100
+AGGREGATION_NO_VALUE = "No value"
+
+
+def recent_files_summary(request: pyramid.request.Request) -> dict:
+    """
+    This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return,
+    by default, info for files released within the past three months grouped by release-date,
+    cell-line or donor, and file-description. The specific fields used for these groupings are:
+
+    - release-date: file_status_tracking.released
+    - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code
+    - donor: donors.display_title
+    - file-description: release_tracker_description
+
+    Note that release_tracker_description is a newer (2024-12)
+    calculated property - see PR-298 (branch: sn_file_release_tracker).
+
+    By default the current (assuming partial) month IS included, so we really return info for
+    the past FULL three months plus whatever time has currently elapsed for the current month.
+    Pass the include_current_month=false query argument to NOT include the current month.
+
+    The number of months of data can be controlled using the nmonths query argument, e.g. nmonths=6.
+
+    A specific date range can also be passed in e.g. using from_date=2024-08-01 and thru_date=2024-10-31.
+
+    For testing purposes, a date field other than the default file_status_tracking.released can
+    also be specified using the date_property_name query argument. And file statuses other than
+    released can be queried for using one or more status query arguments, e.g. status=uploaded.
+    """
+
+    date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE)
+    max_buckets = request_arg_int(request, "max_buckets", AGGREGATION_MAX_BUCKETS)
+    nosort = request_arg_bool(request, "nosort")
+    debug = request_arg_bool(request, "debug")
+    debug_query = request_arg_bool(request, "debug_query")
+    raw = request_arg_bool(request, "raw")
+
+    def create_query(request: pyramid.request.Request) -> str:
+
+        global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES
+        nonlocal date_property_name
+
+        types = request_args(request, "type", QUERY_FILE_TYPES)
+        statuses = request_args(request, "status", QUERY_FILE_STATUSES)
+        categories = request_args(request, "category", QUERY_FILE_CATEGORIES)
+        recent_months = request_arg_int(request, "nmonths", request_arg_int(request, "months", QUERY_RECENT_MONTHS))
+        from_date = request_arg(request, "from_date")
+        thru_date = request_arg(request, "thru_date")
+        include_current_month = request_arg_bool(request, "include_current_month", QUERY_INCLUDE_CURRENT_MONTH)
+
+        from_date, thru_date = parse_date_range_related_arguments(from_date, thru_date, nmonths=recent_months,
+                                                                  include_current_month=include_current_month,
+                                                                  strings=True)
+        query_parameters = {
+            "type": types if types else None,
+            "status": statuses if statuses else None,
+            "data_category": categories if categories else None,
+            f"{date_property_name}.from": from_date if from_date else None,
+            f"{date_property_name}.to": thru_date if thru_date else None,
+            "from": 0,
+            "limit": 0
+        }
+        query_parameters = {key: value for key, value in query_parameters.items() if value is not None}
+        query_string = urlencode(query_parameters, True)
+        # Hackish: change "=%21" to "%21=" in the URL-encoded query string, e.g. to turn the
+        # "data_category": ["!Quality Control"] entry in query_parameters above into: data_category%21=Quality+Control
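+        # For example: "...&data_category=%21Quality+Control&..." becomes
+        # "...&data_category%21=Quality+Control&..." (i.e. data_category != Quality Control).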
+        query_string = query_string.replace("=%21", "%21=")
+        return f"/search/?{query_string}"
+
+    def create_aggregations_query(aggregation_fields: List[str]) -> dict:
+        global AGGREGATION_NO_VALUE
+        nonlocal date_property_name, max_buckets
+        aggregations = []
+        if not isinstance(aggregation_fields, list):
+            aggregation_fields = [aggregation_fields]
+        for item in aggregation_fields:
+            if isinstance(item, str) and (item := item.strip()) and (item not in aggregations):
+                aggregations.append(item)
+        if not aggregations:
+            return {}
+        def create_field_aggregation(field: str) -> Optional[dict]:  # noqa
+            nonlocal date_property_name
+            if field == date_property_name:
+                return {
+                    "date_histogram": {
+                        "field": f"embedded.{field}",
+                        "calendar_interval": "month",
+                        "format": "yyyy-MM",
+                        "missing": "1970-01",
+                        "order": {"_key": "desc"}
+                    }
+                }
+        aggregation_query = create_elasticsearch_aggregation_query(
+            aggregations,
+            max_buckets=max_buckets,
+            missing_value=AGGREGATION_NO_VALUE,
+            create_field_aggregation=create_field_aggregation)
+        return aggregation_query[date_property_name]
+
+    def execute_query(request: pyramid.request.Request, query: str, aggregations_query: dict) -> dict:
+        request = snovault_make_search_subreq(request, path=query, method="GET")
+        results = snovault_search(None, request, custom_aggregations=aggregations_query)
+        return results
+
+    query = create_query(request)
+
+    aggregations_by_cell_line = [
+        date_property_name,
+        AGGREGATION_FIELD_CELL_LINE,
+        AGGREGATION_FIELD_FILE_DESCRIPTOR
+    ]
+
+    aggregations_by_donor = [
+        date_property_name,
+        AGGREGATION_FIELD_DONOR,
+        AGGREGATION_FIELD_FILE_DESCRIPTOR
+    ]
+
+    aggregations_query = {
+        "group_by_cell_line": create_aggregations_query(aggregations_by_cell_line),
+        "group_by_donor": create_aggregations_query(aggregations_by_donor)
+    }
+
+    if debug_query:
+        return {"query": query, "aggregations_query": aggregations_query}
+
+    raw_results = execute_query(request, query, aggregations_query)
+
+    # Note that the doc_count values returned by ElasticSearch do actually seem to be for unique items,
+    # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3
+    # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005),
+    # then its doc_count will not count it twice. This creates a situation where it might look like the counts
+    # are wrong in this returned merged/normalized result set, where the outer item count is less than the sum of
+    # the individual counts within each sub-group. For example, the below result shows a top-level doc_count of 1
+    # even though there are 2 documents, 1 in the HG00438 group and the other in the HG005 group; this would be
+    # because the same unique file has a cell_line.code of both HG00438 and HG005.
+ # { + # "meta": { "field_name": "file_status_tracking.released" }, + # "buckets": [ + # { + # "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 1, + # "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + # "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + # "buckets": [ + # { "key": "HG00438", "doc_count": 1, + # "release_tracker_description": { + # "meta": { "field_name": "release_tracker_description" }, + # "buckets": [ + # { "key": "WGS Illumina NovaSeq X bam", "doc_count": 1 }, + # ] + # } + # }, + # { "key": "HG005", "doc_count": 1, + # "release_tracker_description": { + # "meta": { "field_name": "release_tracker_description" }, + # "buckets": [ + # { "key": "Fiber-seq PacBio Revio bam", "doc_count": 1 } + # ] + # } + # } + # ] + # } + # } + # ] + # } + + if raw: + # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. + if debug: + raw_results = {"query": query, "aggregations_query": aggregations_query, "raw_results": raw_results} + elif "@id" in raw_results: + # Unless we do this we get redirect to the URL in this field, for example + # to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control + # &file_status_tracking.released.from=2024-09-30 + # &file_status_tracking.released.to=2024-12-31&from=0&limit=0' + del raw_results["@id"] + return raw_results + + if not (raw_results := raw_results.get("aggregations")): + return {} + + raw_results_by_cell_line = raw_results.get("group_by_cell_line") + raw_results_by_donor = raw_results.get("group_by_donor") + merged_results = merge_elasticsearch_aggregation_results(raw_results_by_cell_line, raw_results_by_donor) + additional_properties = {"query": query, "aggregations_query": aggregations_query} if debug else None + return normalize_elasticsearch_aggregation_results(merged_results, sort=not nosort, + additional_properties=additional_properties) diff --git a/src/encoded/types/file.py b/src/encoded/types/file.py index 1c7f456b8..bb0f50245 100644 --- a/src/encoded/types/file.py +++ b/src/encoded/types/file.py @@ -62,6 +62,7 @@ from ..item_utils.utils import ( get_property_value_from_identifier, get_property_values_from_identifiers, + get_unique_values, RequestHandler, ) @@ -252,6 +253,10 @@ class CalcPropConstants: } }, } + RELEASE_TRACKER_DESCRIPTION = { + "title": "Release Tracker Description", + "type": "string", + } SAMPLE_SUMMARY_DONOR_IDS = "donor_ids" SAMPLE_SUMMARY_TISSUES = "tissues" SAMPLE_SUMMARY_SAMPLE_NAMES = "sample_names" @@ -695,6 +700,22 @@ def analysis_summary( reference_genome=reference_genome, ) + @calculated_property(schema=CalcPropConstants.RELEASE_TRACKER_DESCRIPTION) + def release_tracker_description( + self, + request: Request, + file_sets: Optional[List[str]] = None + ) -> Union[str, None]: + """Get file release tracker description for display on home page.""" + result = None + if file_sets: + request_handler = RequestHandler(request=request) + result = self._get_release_tracker_description( + request_handler, + file_properties=self.properties + ) + return result + def _get_libraries( self, request: Request, file_sets: Optional[List[str]] = None ) -> List[str]: @@ -979,6 +1000,39 @@ def _get_analysis_summary_fields( ), } return {key: value for key, value in to_include.items() if value} + + def _get_release_tracker_description( + self, + request_handler: RequestHandler, + file_properties: Dict[str, Any], + ) -> Union[str, None]: + """Get release tracker description for display on the 
home page.""" + assay_title= get_unique_values( + request_handler.get_items(file_utils.get_assays(file_properties, request_handler)), + item_utils.get_display_title, + ) + sequencer_title = get_unique_values( + request_handler.get_items( + file_utils.get_sequencers(file_properties, request_handler)), + item_utils.get_display_title, + ) + file_format_title = get_property_value_from_identifier( + request_handler, + file_utils.get_file_format(file_properties), + item_utils.get_display_title, + ) + if len(assay_title) > 1 or len(sequencer_title) > 1: + # More than one unique assay or sequencer + return "" + elif len(assay_title) == 0 or len(sequencer_title) == 0: + # No assay or sequencer + return "" + to_include = [ + assay_title[0], + sequencer_title[0], + file_format_title + ] + return " ".join(to_include) @view_config(name='drs', context=File, request_method='GET', From 6148956e3a4079175de1ce94a87306cc264dc908 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 7 Dec 2024 00:38:26 -0500 Subject: [PATCH 06/78] debugging --- src/encoded/elasticsearch_utils.py | 25 ++++++++---- src/encoded/recent_files_summary.py | 61 ++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index c45258d5f..09a0884ea 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -109,7 +109,12 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ if get_aggregation_bucket_doc_count(target_bucket) is not None: target_bucket["doc_count"] += source_bucket_item_count merged_item_count += source_bucket_item_count - continue + else: + target["buckets"].append(source_bucket) + if isinstance(target.get("doc_count"), int): + target["doc_count"] += source_bucket_item_count + else: + target["doc_count"] = source_bucket_item_count return merged_item_count, target if copy is True: @@ -149,10 +154,12 @@ def get_nested_aggregations(data: dict) -> List[dict]: results = [] if isinstance(data, dict): for key in data: - if get_aggregation_key(data[key]): + if get_aggregation_key(data[key]) and data[key]["buckets"]: results.append(data[key]) - if (not results) and data.get("buckets", list): - results.append(data) + if not results: + if ((isinstance(data.get("buckets"), list) and data["buckets"]) or + (isinstance(data.get("key"), str) and isinstance(data.get("doc_count"), int))): + results.append(data) return results def find_group_item(group_items: List[dict], value: Any) -> Optional[dict]: @@ -178,7 +185,6 @@ def normalize_results(aggregation: dict, for nested_aggregation in nested_aggregations: if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value): if group_item := find_group_item(group_items, bucket_value): - # group_item["items"].extend(normalized_aggregation["items"]) for normalized_aggregation_item in normalized_aggregation["items"]: group_item["items"].append(normalized_aggregation_item) group_item["count"] += normalized_aggregation_item["count"] @@ -200,11 +206,14 @@ def normalize_results(aggregation: dict, del results["value"] return results - def sort_results(data: dict) -> None: + def sort_results(data: dict, _level: int = 0) -> None: if isinstance(data, dict) and isinstance(items := data.get("items"), list): - items.sort(key=lambda item: (-item.get("count", 0), item.get("value", ""))) + if _level == 0: # TODO: hack/parameterize + items.sort(key=lambda item: item.get("value", ""), reverse=True) + else: + 
items.sort(key=lambda item: (-item.get("count", 0), item.get("value", ""))) for item in items: - sort_results(item) + sort_results(item, _level=_level + 1) results = normalize_results(aggregation, additional_properties=additional_properties) if sort is True: diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index a51319692..b0aeee24e 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,4 +1,5 @@ import pyramid +from copy import deepcopy from typing import List, Optional from urllib.parse import urlencode from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query @@ -144,6 +145,42 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que "group_by_donor": create_aggregations_query(aggregations_by_donor) } + if False: + aggregations_query["group_by_cell_line"]["filter"] = { + "bool": { + "must": [{ + "exists": { + "field": f"embedded.{AGGREGATION_FIELD_CELL_LINE}.raw" + } + }] + } + } + aggregations_query["group_by_donor"]["filter"] = { + "bool": { + "must": [{ + "exists": { + "field": f"embedded.{AGGREGATION_FIELD_DONOR}.raw" + } + }] + } + } + # aggregations_query["group_by_cell_line"]["aggs"] = {"date_histogram": aggregations_query["group_by_cell_line"]["aggs"]} + # aggregations_query["group_by_donor"]["aggs"] = {"date_histogram": aggregations_query["group_by_donor"]["aggs"]} + aggregations_query["group_by_cell_line"]["aggs"] = { + "date_histogram": { + "date_histogram": aggregations_query["group_by_cell_line"]["date_histogram"], + "aggs": aggregations_query["group_by_cell_line"]["aggs"] + } + } + del aggregations_query["group_by_cell_line"]["date_histogram"] + aggregations_query["group_by_donor"]["aggs"] = { + "date_histogram": { + "date_histogram": aggregations_query["group_by_donor"]["date_histogram"], + "aggs": aggregations_query["group_by_donor"]["aggs"] + } + } + del aggregations_query["group_by_donor"]["date_histogram"] + if debug_query: return {"query": query, "aggregations_query": aggregations_query} @@ -202,9 +239,31 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que if not (raw_results := raw_results.get("aggregations")): return {} + if debug: + raw_results_original = deepcopy(raw_results) + raw_results_by_cell_line = raw_results.get("group_by_cell_line") raw_results_by_donor = raw_results.get("group_by_donor") + + if False: + raw_results_by_cell_line["buckets"] = raw_results_by_cell_line["date_histogram"]["buckets"] + del raw_results_by_cell_line["date_histogram"] + raw_results_by_donor["buckets"] = raw_results_by_donor["date_histogram"]["buckets"] + del raw_results_by_donor["date_histogram"] + pass + merged_results = merge_elasticsearch_aggregation_results(raw_results_by_cell_line, raw_results_by_donor) - additional_properties = {"query": query, "aggregations_query": aggregations_query} if debug else None + additional_properties = None + if debug: + additional_properties = { + "debug": { + "query": query, + "aggregations_query": aggregations_query, + "raw_results": raw_results_original, + "raw_results_by_cell_line": deepcopy(raw_results_by_cell_line), + "raw_results_by_donor": deepcopy(raw_results_by_donor), + "merged_results": deepcopy(merged_results) + } + } return normalize_elasticsearch_aggregation_results(merged_results, sort=not nosort, additional_properties=additional_properties) From fdcca3d25b7631e15a064fc786c691a8d7cc2de5 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 7 Dec 2024 09:17:56 -0500 
Subject: [PATCH 07/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 75 ++++++++++++++++++++++------- src/encoded/recent_files_summary.py | 46 +++++++++++------- 2 files changed, 86 insertions(+), 35 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 09a0884ea..37bd4e2df 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -1,12 +1,15 @@ from copy import deepcopy -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple, Union + +AGGREGATION_MAX_BUCKETS = 100 +AGGREGATION_NO_VALUE = "No value" def create_elasticsearch_aggregation_query(fields: List[str], - aggregation_property_name: Optional[str] = None, - max_buckets: Optional[int] = None, - missing_value: Optional[str] = None, - create_field_aggregation: Optional[Callable] = None) -> dict: + aggregation_property_name: Optional[str] = None, + max_buckets: Optional[int] = None, + missing_value: Optional[str] = None, + create_field_aggregation: Optional[Callable] = None) -> dict: global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE @@ -122,10 +125,11 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ return merge_results(target, source)[1] -def normalize_elasticsearch_aggregation_results(aggregation: dict, - sort: bool = False, - additional_properties: Optional[dict] = None, - remove_empty_items: bool = True) -> dict: +def normalize_elasticsearch_aggregation_results( + aggregation: dict, + sort: Union[bool, str, Callable, List[Union[bool, str, Callable]]] = False, + additional_properties: Optional[dict] = None, + remove_empty_items: bool = True) -> dict: def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: # TODO: same as in merge_elasticsearch_aggregation_results function @@ -158,7 +162,7 @@ def get_nested_aggregations(data: dict) -> List[dict]: results.append(data[key]) if not results: if ((isinstance(data.get("buckets"), list) and data["buckets"]) or - (isinstance(data.get("key"), str) and isinstance(data.get("doc_count"), int))): + (isinstance(data.get("key"), str) and isinstance(data.get("doc_count"), int))): # noqa results.append(data) return results @@ -172,9 +176,12 @@ def find_group_item(group_items: List[dict], value: Any) -> Optional[dict]: def normalize_results(aggregation: dict, key: Optional[str] = None, value: Optional[str] = None, additional_properties: Optional[dict] = None) -> dict: + nonlocal remove_empty_items + if not (aggregation_key := get_aggregation_key(aggregation)): return {} + group_items = [] ; item_count = 0 # noqa for bucket in aggregation["buckets"]: if (((bucket_value := get_aggregation_bucket_value(bucket)) is None) or @@ -195,27 +202,59 @@ def normalize_results(aggregation: dict, if (remove_empty_items is False) or (bucket_item_count > 0): group_item = {"name": aggregation_key, "value": bucket_value, "count": bucket_item_count} group_items.append(group_item) + if (remove_empty_items is not False) and (not group_items): return {} results = {"name": key, "value": value, "count": item_count, "items": group_items} + if isinstance(additional_properties, dict) and additional_properties: results = {**additional_properties, **results} + if key is None: del results["name"] if value is None: del results["value"] + return results - def sort_results(data: dict, _level: int = 0) -> None: - if isinstance(data, dict) and isinstance(items := data.get("items"), 
list): - if _level == 0: # TODO: hack/parameterize - items.sort(key=lambda item: item.get("value", ""), reverse=True) + def sort_results(data: dict) -> None: + + nonlocal sort + + def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: + sort_function_default = lambda item: (-item.get("count", 0), item.get("value", "")) # noqa + if (sort is True) or (isinstance(sort, str) and (sort.strip().lower() == "default")): + items.sort(key=sort_function_default) + elif isinstance(sort, str) and (sort := sort.strip().lower()): + if sort.startswith("-"): + sort_reverse = True + sort = sort[1:] + else: + sort_reverse = False + if (sort in ["default"]): + items.sort(key=sort_function_default, reverse=sort_reverse) + elif (sort in ["key", "value"]): + items.sort(key=lambda item: item.get("value", ""), reverse=sort_reverse) + elif callable(sort): + items.sort(key=lambda item: sort(item)) + + def sort_results_nested(data: dict, level: int = 0) -> None: + nonlocal sort + if isinstance(sort, list) and sort: + if level < len(sort): + sort_level = sort[level] + else: + sort_level = sort[len(sort) - 1] else: - items.sort(key=lambda item: (-item.get("count", 0), item.get("value", ""))) - for item in items: - sort_results(item, _level=_level + 1) + sort_level = sort + if isinstance(data, dict) and isinstance(items := data.get("items"), list): + sort_items(items, sort=sort_level) + for item in items: + sort_results_nested(item, level=level + 1) + + sort_results_nested(data) results = normalize_results(aggregation, additional_properties=additional_properties) - if sort is True: + if sort: sort_results(results) return results diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index b0aeee24e..81af07fa1 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -52,6 +52,8 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: released can be queried for using one or more status query arguments, e.g. status=uploaded. """ + hack_filter_date_histogram = True + date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) nosort = request_arg_bool(request, "nosort") @@ -145,7 +147,11 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que "group_by_donor": create_aggregations_query(aggregations_by_donor) } - if False: + if hack_filter_date_histogram: + # TODO + # Late-breaking hack with addition of per-aggregation filter to disregard items not part + # of a group; when using the date_histogram # grouping specifier must be elevated to an + # actual additional aggregation grouping. Also see below (hack_filter_date_histrgram). 
aggregations_query["group_by_cell_line"]["filter"] = { "bool": { "must": [{ @@ -164,8 +170,6 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que }] } } - # aggregations_query["group_by_cell_line"]["aggs"] = {"date_histogram": aggregations_query["group_by_cell_line"]["aggs"]} - # aggregations_query["group_by_donor"]["aggs"] = {"date_histogram": aggregations_query["group_by_donor"]["aggs"]} aggregations_query["group_by_cell_line"]["aggs"] = { "date_histogram": { "date_histogram": aggregations_query["group_by_cell_line"]["date_histogram"], @@ -226,9 +230,7 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que if raw: # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. - if debug: - raw_results = {"query": query, "aggregations_query": aggregations_query, "raw_results": raw_results} - elif "@id" in raw_results: + if "@id" in raw_results: # Unless we do this we get redirect to the URL in this field, for example # to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control # &file_status_tracking.released.from=2024-09-30 @@ -239,31 +241,41 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que if not (raw_results := raw_results.get("aggregations")): return {} - if debug: - raw_results_original = deepcopy(raw_results) - raw_results_by_cell_line = raw_results.get("group_by_cell_line") raw_results_by_donor = raw_results.get("group_by_donor") - if False: + if hack_filter_date_histogram: + if debug: + raw_results = deepcopy(raw_results) # otherwise overwritten by below raw_results_by_cell_line["buckets"] = raw_results_by_cell_line["date_histogram"]["buckets"] del raw_results_by_cell_line["date_histogram"] raw_results_by_donor["buckets"] = raw_results_by_donor["date_histogram"]["buckets"] del raw_results_by_donor["date_histogram"] - pass merged_results = merge_elasticsearch_aggregation_results(raw_results_by_cell_line, raw_results_by_donor) - additional_properties = None + if debug: additional_properties = { "debug": { "query": query, "aggregations_query": aggregations_query, - "raw_results": raw_results_original, - "raw_results_by_cell_line": deepcopy(raw_results_by_cell_line), - "raw_results_by_donor": deepcopy(raw_results_by_donor), + "raw_results": raw_results, "merged_results": deepcopy(merged_results) } } - return normalize_elasticsearch_aggregation_results(merged_results, sort=not nosort, - additional_properties=additional_properties) + else: + additional_properties = None + + if nosort is not True: + # We can sort on the aggregations by level; outermost/left to innermost/right. + # In our case the outermost is the date aggregation so sort taht by the key value, + # e.g. 2014-12, descending; and the rest of the inner levels by the default + # sorting which is by aggregation count descending and secondarily by the key value. 
+ sort = ["-key", "default"] + else: + sort = False + + normalized_results = normalize_elasticsearch_aggregation_results(merged_results, + sort=sort, + additional_properties=additional_properties) + return normalized_results From 300ae14f2b20982c695c833f25e025bfb81bfb87 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 7 Dec 2024 15:53:54 -0500 Subject: [PATCH 08/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 138 ++++++++++++++++++---------- src/encoded/recent_files_summary.py | 89 +++++------------- 2 files changed, 114 insertions(+), 113 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 37bd4e2df..9b00818e7 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -6,14 +6,16 @@ def create_elasticsearch_aggregation_query(fields: List[str], - aggregation_property_name: Optional[str] = None, + property_name: Optional[str] = None, max_buckets: Optional[int] = None, missing_value: Optional[str] = None, - create_field_aggregation: Optional[Callable] = None) -> dict: + include_missing: bool = False, + create_field_aggregation: Optional[Callable] = None, + _toplevel: bool = True) -> dict: global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE - if not (isinstance(fields, list) and fields and isinstance(field := fields[0], str) and field): + if not (isinstance(fields, list) and fields and isinstance(field := fields[0], str) and (field := field.strip())): return {} if not isinstance(missing_value, str): missing_value = AGGREGATION_NO_VALUE @@ -30,20 +32,60 @@ def create_elasticsearch_aggregation_query(fields: List[str], } } - if not (isinstance(aggregation_property_name, str) and aggregation_property_name): - aggregation_property_name = field - aggregation = {aggregation_property_name: field_aggregation} - aggregation[aggregation_property_name]["meta"] = {"field_name": field} + if not (isinstance(property_name, str) and (property_name := property_name.strip())): + property_name = field + + aggregation = {property_name: {"meta": {"field_name": field}}} + + if (include_missing is not True) and (_toplevel is True): + # Filtering out items which are not in any of the aggregations; this introduces complication if + # using date_histogram rather than simple terms, which we need add another level of aggregation + # just for the date_histogram; then the caller will need deal with (remove) it later. 
+ extra_nesting_for_date_histogram_and_filter = "date_histogram" in field_aggregation + for field in fields: + if isinstance(field, str) and (field := field.strip()): + if not aggregation[property_name].get("filter"): + aggregation[property_name]["filter"] = {"bool": {"must": []}} + aggregation[property_name]["filter"]["bool"]["must"].append({ + "exists": { + "field": f"embedded.{field}.raw" + } + }) + else: + extra_nesting_for_date_histogram_and_filter = False + + if not extra_nesting_for_date_histogram_and_filter: + aggregation[property_name].update(field_aggregation) if nested_aggregation := create_elasticsearch_aggregation_query( fields[1:], max_buckets=max_buckets, missing_value=missing_value, - create_field_aggregation=create_field_aggregation): - aggregation[aggregation_property_name]["aggs"] = nested_aggregation - + create_field_aggregation=create_field_aggregation, _toplevel=False): + if extra_nesting_for_date_histogram_and_filter: + aggregation[property_name]["aggs"] = {"dummy_date_histogram": {**field_aggregation, "aggs": nested_aggregation}} + else: + aggregation[property_name]["aggs"] = nested_aggregation return aggregation +def prune_elasticsearch_aggregation_results(results: dict) -> None: + """ + This removes any extra level(s) of aggregation that may have been introduces in + the create_elasticsearch_aggregation_query function (above), for when/if both + a filter and a date_histogram are used together. + """ + if isinstance(results, dict): + for key in list(results.keys()): + if (key == "dummy_date_histogram") and isinstance(buckets := results[key].get("buckets"), list): + results["buckets"] = buckets + del results[key] + else: + prune_elasticsearch_aggregation_results(results[key]) + elif isinstance(results, list): + for element in results: + prune_elasticsearch_aggregation_results(element) + + def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bool = False) -> Optional[dict]: def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: @@ -217,44 +259,44 @@ def normalize_results(aggregation: dict, return results - def sort_results(data: dict) -> None: - - nonlocal sort - - def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: - sort_function_default = lambda item: (-item.get("count", 0), item.get("value", "")) # noqa - if (sort is True) or (isinstance(sort, str) and (sort.strip().lower() == "default")): - items.sort(key=sort_function_default) - elif isinstance(sort, str) and (sort := sort.strip().lower()): - if sort.startswith("-"): - sort_reverse = True - sort = sort[1:] - else: - sort_reverse = False - if (sort in ["default"]): - items.sort(key=sort_function_default, reverse=sort_reverse) - elif (sort in ["key", "value"]): - items.sort(key=lambda item: item.get("value", ""), reverse=sort_reverse) - elif callable(sort): - items.sort(key=lambda item: sort(item)) - - def sort_results_nested(data: dict, level: int = 0) -> None: - nonlocal sort - if isinstance(sort, list) and sort: - if level < len(sort): - sort_level = sort[level] - else: - sort_level = sort[len(sort) - 1] - else: - sort_level = sort - if isinstance(data, dict) and isinstance(items := data.get("items"), list): - sort_items(items, sort=sort_level) - for item in items: - sort_results_nested(item, level=level + 1) - - sort_results_nested(data) - results = normalize_results(aggregation, additional_properties=additional_properties) if sort: - sort_results(results) + sort_elasticsearch_aggregation_results(results) return 
results + + +def sort_elasticsearch_aggregation_results(data: dict, sort: Union[bool, str, Callable, + List[Union[bool, str, Callable]]] = False) -> None: + + def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: + sort_function_default = lambda item: (-item.get("count", 0), item.get("value", "")) # noqa + if (sort is True) or (isinstance(sort, str) and (sort.strip().lower() == "default")): + items.sort(key=sort_function_default) + elif isinstance(sort, str) and (sort := sort.strip().lower()): + if sort.startswith("-"): + sort_reverse = True + sort = sort[1:] + else: + sort_reverse = False + if (sort in ["default"]): + items.sort(key=sort_function_default, reverse=sort_reverse) + elif (sort in ["key", "value"]): + items.sort(key=lambda item: item.get("value", ""), reverse=sort_reverse) + elif callable(sort): + items.sort(key=lambda item: sort(item)) + + def sort_results(data: dict, level: int = 0) -> None: + nonlocal sort + if isinstance(sort, list) and sort: + if level < len(sort): + sort_level = sort[level] + else: + sort_level = sort[len(sort) - 1] + else: + sort_level = sort + if isinstance(data, dict) and isinstance(items := data.get("items"), list): + sort_items(items, sort=sort_level) + for item in items: + sort_results(item, level=level + 1) + + sort_results(data) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 81af07fa1..17bb3b2ea 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -5,6 +5,8 @@ from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results +from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results +from encoded.elasticsearch_utils import sort_elasticsearch_aggregation_results from encoded.endpoint_utils import parse_date_range_related_arguments from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int from snovault.search.search import search as snovault_search @@ -24,7 +26,6 @@ AGGREGATION_MAX_BUCKETS = 100 AGGREGATION_NO_VALUE = "No value" - def recent_files_summary(request: pyramid.request.Request) -> dict: """ This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, @@ -52,10 +53,9 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: released can be queried for using one or more status query arguments, e.g. status=uploaded. 
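For example, a hypothetical request (argument names per the request_arg parsing below; the values here are illustrative only): /recent_files_summary?status=uploaded&novalues=true&nosort=true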
""" - hack_filter_date_histogram = True - date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) + include_missing = request_arg_bool(request, "novalues", request_arg_bool(request, "include_missing")) nosort = request_arg_bool(request, "nosort") debug = request_arg_bool(request, "debug") debug_query = request_arg_bool(request, "debug_query") @@ -95,7 +95,7 @@ def create_query(request: pyramid.request.Request) -> str: def create_aggregations_query(aggregation_fields: List[str]) -> dict: global AGGREGATION_NO_VALUE - nonlocal date_property_name, max_buckets + nonlocal date_property_name, max_buckets, include_missing aggregations = [] if not isinstance(aggregation_fields, list): aggregation_fields = [aggregation_fields] @@ -120,6 +120,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa aggregations, max_buckets=max_buckets, missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, create_field_aggregation=create_field_aggregation) return aggregation_query[date_property_name] @@ -142,49 +143,14 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que AGGREGATION_FIELD_FILE_DESCRIPTOR ] + aggregate_by_cell_line_property_name = "aggregate_by_cell_line" + aggregate_by_donor_property_name = "aggregate_by_donor" + aggregations_query = { - "group_by_cell_line": create_aggregations_query(aggregations_by_cell_line), - "group_by_donor": create_aggregations_query(aggregations_by_donor) + aggregate_by_cell_line_property_name: create_aggregations_query(aggregations_by_cell_line), + aggregate_by_donor_property_name: create_aggregations_query(aggregations_by_donor) } - if hack_filter_date_histogram: - # TODO - # Late-breaking hack with addition of per-aggregation filter to disregard items not part - # of a group; when using the date_histogram # grouping specifier must be elevated to an - # actual additional aggregation grouping. Also see below (hack_filter_date_histrgram). - aggregations_query["group_by_cell_line"]["filter"] = { - "bool": { - "must": [{ - "exists": { - "field": f"embedded.{AGGREGATION_FIELD_CELL_LINE}.raw" - } - }] - } - } - aggregations_query["group_by_donor"]["filter"] = { - "bool": { - "must": [{ - "exists": { - "field": f"embedded.{AGGREGATION_FIELD_DONOR}.raw" - } - }] - } - } - aggregations_query["group_by_cell_line"]["aggs"] = { - "date_histogram": { - "date_histogram": aggregations_query["group_by_cell_line"]["date_histogram"], - "aggs": aggregations_query["group_by_cell_line"]["aggs"] - } - } - del aggregations_query["group_by_cell_line"]["date_histogram"] - aggregations_query["group_by_donor"]["aggs"] = { - "date_histogram": { - "date_histogram": aggregations_query["group_by_donor"]["date_histogram"], - "aggs": aggregations_query["group_by_donor"]["aggs"] - } - } - del aggregations_query["group_by_donor"]["date_histogram"] - if debug_query: return {"query": query, "aggregations_query": aggregations_query} @@ -230,29 +196,24 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que if raw: # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. 
+ # And note that unless we remove the @id property we get redirected to the URL in this field, + # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control + # &file_status_tracking.released.from=2024-09-30 + # &file_status_tracking.released.to=2024-12-31&from=0&limit=0' if "@id" in raw_results: - # Unless we do this we get redirect to the URL in this field, for example - # to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control - # &file_status_tracking.released.from=2024-09-30 - # &file_status_tracking.released.to=2024-12-31&from=0&limit=0' del raw_results["@id"] return raw_results if not (raw_results := raw_results.get("aggregations")): return {} - raw_results_by_cell_line = raw_results.get("group_by_cell_line") - raw_results_by_donor = raw_results.get("group_by_donor") - - if hack_filter_date_histogram: - if debug: - raw_results = deepcopy(raw_results) # otherwise overwritten by below - raw_results_by_cell_line["buckets"] = raw_results_by_cell_line["date_histogram"]["buckets"] - del raw_results_by_cell_line["date_histogram"] - raw_results_by_donor["buckets"] = raw_results_by_donor["date_histogram"]["buckets"] - del raw_results_by_donor["date_histogram"] + if debug: + raw_results = deepcopy(raw_results) # otherwise may be overwritten by below - merged_results = merge_elasticsearch_aggregation_results(raw_results_by_cell_line, raw_results_by_donor) + prune_elasticsearch_aggregation_results(raw_results) + merged_results = merge_elasticsearch_aggregation_results( + raw_results.get(aggregate_by_cell_line_property_name), + raw_results.get(aggregate_by_donor_property_name)) if debug: additional_properties = { "debug": { "query": query, "aggregations_query": aggregations_query, "raw_results": raw_results, "merged_results": deepcopy(merged_results) } } else: additional_properties = None + normalized_results = normalize_elasticsearch_aggregation_results( + merged_results, additional_properties=additional_properties) + if nosort is not True: # We can sort on the aggregations by level; outermost/left to innermost/right. # In our case the outermost is the date aggregation so sort that by the key value, # e.g. 2024-12, descending; and the rest of the inner levels by the default # sorting which is by aggregation count descending and secondarily by the key value.
- sort = ["-key", "default"] - else: - sort = False + sort_elasticsearch_aggregation_results(normalized_results, ["-key", "default"]) - normalized_results = normalize_elasticsearch_aggregation_results(merged_results, - sort=sort, - additional_properties=additional_properties) return normalized_results From 0650d8d627a8fde0e4d2ab0f09df3f9fe1ea4835 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 7 Dec 2024 16:00:01 -0500 Subject: [PATCH 09/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 9 ++------- src/encoded/recent_files_summary.py | 10 +++++----- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 9b00818e7..b7a32ab9d 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -167,11 +167,8 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ return merge_results(target, source)[1] -def normalize_elasticsearch_aggregation_results( - aggregation: dict, - sort: Union[bool, str, Callable, List[Union[bool, str, Callable]]] = False, - additional_properties: Optional[dict] = None, - remove_empty_items: bool = True) -> dict: +def normalize_elasticsearch_aggregation_results(aggregation: dict, additional_properties: Optional[dict] = None, + remove_empty_items: bool = True) -> dict: def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: # TODO: same as in merge_elasticsearch_aggregation_results function @@ -260,8 +257,6 @@ def normalize_results(aggregation: dict, return results results = normalize_results(aggregation, additional_properties=additional_properties) - if sort: - sort_elasticsearch_aggregation_results(results) return results diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 17bb3b2ea..a21515470 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -55,7 +55,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) - include_missing = request_arg_bool(request, "novalues", request_arg_bool(request, "include_missing")) + include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "inovalues")) nosort = request_arg_bool(request, "nosort") debug = request_arg_bool(request, "debug") debug_query = request_arg_bool(request, "debug_query") @@ -63,7 +63,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: def create_query(request: pyramid.request.Request) -> str: - global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES + global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES, QUERY_RECENT_MONTHS nonlocal date_property_name types = request_args(request, "type", QUERY_FILE_TYPES) @@ -93,7 +93,7 @@ def create_query(request: pyramid.request.Request) -> str: query_string = query_string.replace("=%21", "%21=") return f"/search/?{query_string}" - def create_aggregations_query(aggregation_fields: List[str]) -> dict: + def create_aggregation_query(aggregation_fields: List[str]) -> dict: global AGGREGATION_NO_VALUE nonlocal date_property_name, max_buckets, include_missing aggregations = [] @@ -147,8 +147,8 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que aggregate_by_donor_property_name 
= "aggregate_by_donor" aggregations_query = { - aggregate_by_cell_line_property_name: create_aggregations_query(aggregations_by_cell_line), - aggregate_by_donor_property_name: create_aggregations_query(aggregations_by_donor) + aggregate_by_cell_line_property_name: create_aggregation_query(aggregations_by_cell_line), + aggregate_by_donor_property_name: create_aggregation_query(aggregations_by_donor) } if debug_query: From f598c3fb2671f9eb6ce92bbeec4f35c014aa02df Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 7 Dec 2024 16:08:18 -0500 Subject: [PATCH 10/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 9 +++++++-- src/encoded/recent_files_summary.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index b7a32ab9d..adc446c2b 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -260,9 +260,14 @@ def normalize_results(aggregation: dict, return results -def sort_elasticsearch_aggregation_results(data: dict, sort: Union[bool, str, Callable, +def sort_normalized_aggregation_results(data: dict, sort: Union[bool, str, Callable, List[Union[bool, str, Callable]]] = False) -> None: + """ + Sorts the given *normalized* (see above) ElasticSearch aggregation results. + By default, this is by item (doc) count descending and secondarily by key value. + """ + def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: sort_function_default = lambda item: (-item.get("count", 0), item.get("value", "")) # noqa if (sort is True) or (isinstance(sort, str) and (sort.strip().lower() == "default")): @@ -273,7 +278,7 @@ def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: sort = sort[1:] else: sort_reverse = False - if (sort in ["default"]): + if sort == "default": items.sort(key=sort_function_default, reverse=sort_reverse) elif (sort in ["key", "value"]): items.sort(key=lambda item: item.get("value", ""), reverse=sort_reverse) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index a21515470..551001bcf 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -6,7 +6,7 @@ from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results -from encoded.elasticsearch_utils import sort_elasticsearch_aggregation_results +from encoded.elasticsearch_utils import sort_normalized_aggregation_results from encoded.endpoint_utils import parse_date_range_related_arguments from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int from snovault.search.search import search as snovault_search @@ -124,9 +124,9 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa create_field_aggregation=create_field_aggregation) return aggregation_query[date_property_name] - def execute_query(request: pyramid.request.Request, query: str, aggregations_query: dict) -> str: + def execute_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: request = snovault_make_search_subreq(request, path=query, method="GET") - results = snovault_search(None, request, custom_aggregations=aggregations_query) + results = snovault_search(None, request, custom_aggregations=aggregation_query) return results 
query = create_query(request) @@ -146,15 +146,15 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que aggregate_by_cell_line_property_name = "aggregate_by_cell_line" aggregate_by_donor_property_name = "aggregate_by_donor" - aggregations_query = { + aggregation_query = { aggregate_by_cell_line_property_name: create_aggregation_query(aggregations_by_cell_line), aggregate_by_donor_property_name: create_aggregation_query(aggregations_by_donor) } if debug_query: - return {"query": query, "aggregations_query": aggregations_query} + return {"query": query, "aggregation_query": aggregation_query} - raw_results = execute_query(request, query, aggregations_query) + raw_results = execute_query(request, query, aggregation_query) # Note that the doc_count values returned by ElasticSearch do actually seem to be for unique items, # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3 @@ -219,7 +219,7 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que additional_properties = { "debug": { "query": query, - "aggregations_query": aggregations_query, + "aggregation_query": aggregation_query, "raw_results": raw_results, "merged_results": deepcopy(merged_results) } } @@ -235,6 +235,6 @@ def execute_query(request: pyramid.request.Request, query: str, aggregations_que # In our case the outermost is the date aggregation so sort that by the key value, # e.g. 2024-12, descending; and the rest of the inner levels by the default # sorting which is by aggregation count descending and secondarily by the key value. - sort_elasticsearch_aggregation_results(normalized_results, ["-key", "default"]) + sort_normalized_aggregation_results(normalized_results, ["-key", "default"]) return normalized_results From b30dd873af929c392b587c6f07b674af8f604f35 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 00:47:48 -0500 Subject: [PATCH 11/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 178 +++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index adc446c2b..28825bf78 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -13,8 +13,59 @@ def create_elasticsearch_aggregation_query(fields: List[str], create_field_aggregation: Optional[Callable] = None, _toplevel: bool = True) -> dict: + """ + Returns a dictionary representing an ElasticSearch aggregation query for the field names.
+ If more than one is given the aggregation will be nested, one within another, for example, + given ["date_created", "donors.display_title", "release_tracker_description"] we my return + something like this: + + { + "aggregate_by_donor": { + "meta": { "field_name": "date_created" }, + "filter": { + "bool": { + "must": [ + {"exists": {"field": "embedded.date_created.raw"}}, + {"exists": {"field": "embedded.donors.display_title.raw"}}, + {"exists": {"field": "embedded.release_tracker_description.raw"}} + ] + } + }, + "aggs": { + "dummy_date_histogram": { + "date_histogram": { + "field": "embedded.date_created", + "calendar_interval": "month", + "format": "yyyy-MM", "missing": "1970-01", + "order": { "_key": "desc"} + }, + "aggs": { + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "terms": { + "field": "embedded.donors.display_title.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } + } + } + """ global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE + if isinstance(fields, str): + fields = [fields] if not (isinstance(fields, list) and fields and isinstance(field := fields[0], str) and (field := field.strip())): return {} if not isinstance(missing_value, str): missing_value = AGGREGATION_NO_VALUE @@ -87,6 +138,95 @@ def prune_elasticsearch_aggregation_results(results: dict) -> None: def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bool = False) -> Optional[dict]: + """ + Merges the given second (source) argument into the given first (target) argument (in place), recursively, both + of which are assumed to be ElasticSearch aggregation query results; doc_count values are updated as expected. + If the given copy argument is True then the merge is not done to the given target in-place, rather a copy + of it is made and the merge done to it. In either case the resultant merged target is returned.
For example: + + target = { + "meta": { "field_name": "date_created" }, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 13, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": { "field_name": "release_tracker_description" }, + "buckets": [ + { "key": "WGS ONT PromethION 24 bam", "doc_count": 1 } + ] + } + } + ] + } + } + ] + } + + source = { + "meta": { "field_name": "date_created" }, "doc_count": 16, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 14, + "donors.display_title": { + "meta": { "field_name": "donors.display_title" }, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": { "field_name": "release_tracker_description" }, + "buckets": [ + { "key": "Fiber-seq PacBio Revio bam", "doc_count": 4 } + ] + } + } + ] + } + } + ] + } + + merge_elasticsearch_aggregation_results(target, source) == { + "meta": { "field_name": "date_created" }, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": { "field_name": "release_tracker_description" }, + "buckets": [ + { "key": "WGS ONT PromethION 24 bam", "doc_count": 1 } + ] + } + } + ] + }, + "donors.display_title": { + "meta": { "field_name": "donors.display_title" }, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": { "field_name": "release_tracker_description" }, + "buckets": [ + { "key": "Fiber-seq PacBio Revio bam", "doc_count": 4 } + ] + } + } + ] + } + } + ] + } + """ def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list): @@ -170,6 +310,44 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ def normalize_elasticsearch_aggregation_results(aggregation: dict, additional_properties: Optional[dict] = None, remove_empty_items: bool = True) -> dict: + """ + Normalizes the given result of an ElasticSearch aggregation query into a more readable/consumable format. 
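+ Each node of the normalized result takes roughly this shape (a sketch of the structure built by normalize_results below; the name and value properties are omitted at the outermost level): + {"name": <field-name>, "value": <bucket-key>, "count": <doc-count>, "items": [...]}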
+ For example, given the result of the example for merge_elasticsearch_aggregation_results above as input, + this function would return something like this: + + normalize_elasticsearch_aggregation_results(aggregation_results) == { + "count": 25, + "items": [ + { + "name": "date_created", + "value": "2024-12", "count": 11, + "items": [ + { + "name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code", + "value": "COLO829T", "count": 1, + "items": [ + { + "name": "release_tracker_description", + "value": "WGS ONT PromethION 24 bam", "count": 1 + } + ] + }, + { + "name": "donors.display_title", + "value": "DAC_DONOR_COLO829", "count": 4, + "items": [ + { + "name": "release_tracker_description", + "value": "Fiber-seq PacBio Revio bam", "count": 4 + } + ] + } + ] + } + ] + } + """ + def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: # TODO: same as in merge_elasticsearch_aggregation_results function if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list): From 6da89610ed7d7fcb27bfee0468131127a840897d Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 00:56:48 -0500 Subject: [PATCH 12/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 28825bf78..282c95a01 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -16,7 +16,7 @@ def create_elasticsearch_aggregation_query(fields: List[str], """ Returns a dictionary representing an ElasticSearch aggregation query for the field names. If more than one is given the aggregation will be nested, one within another, for example, - given ["date_created", "donors.display_title", "release_tracker_description"] we my return + given ["date_created", "donors.display_title", "release_tracker_description"] we would return something like this: @@ -61,6 +61,25 @@ def create_elasticsearch_aggregation_query(fields: List[str], } } } + + The above example assumes that a create_field_aggregation function callable was passed as an argument + and that if/when its argument is date_created then it would have returned something like this + + { + "date_histogram": { + "field": f"embedded.date_created", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + + And further, that the include_missing was the default of False, in which case items which were not part of any + of the aggregation fields specified would be filtered out. This demonstrates a slight complication dealt with + in this particular case where an extra level of aggregation needs to be introduced (dummy_date_histogram). + This extra bit of cruft necessary to get the ElasticSearch query to work as expected, manifests itself in the + query result as well and is dispensed with using the prune_elasticsearch_aggregation_results function below. """ global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE @@ -121,8 +140,8 @@ def create_elasticsearch_aggregation_query(fields: List[str], def prune_elasticsearch_aggregation_results(results: dict) -> None: """ - This removes any extra level(s) of aggregation that may have been introduces in - the create_elasticsearch_aggregation_query function (above), for when/if both + This removes any extra level(s) of aggregation (i.e.
dummy_date_histogram) that may have been + introduced in the create_elasticsearch_aggregation_query function (above), for when/if both a filter and a date_histogram are used together. """ From c5078356cfb12977a75bcd795c1b92be3f68a268 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 12:40:16 -0500 Subject: [PATCH 13/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 13 +- src/encoded/tests/test_elasticsearch_utils.py | 122 ++++++++++++++++++ 2 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 src/encoded/tests/test_elasticsearch_utils.py diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 282c95a01..1bb64481b 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -75,11 +75,11 @@ def create_elasticsearch_aggregation_query(fields: List[str], } } - And further, that the include_missing was the default of False, in which case items which were not part of any - of the aggregation fields specified would be filtered out. This demonstrates a slight complication dealt with - in this particular case where an extra level of aggregation needs to be introduced (dummy_date_histogram). - This extra bit of cruft necessary to get the ElasticSearch query to work as expected, manifests itself in the - query result as well and is dispensed with using the prune_elasticsearch_aggregation_results function below. + It further assumes that the include_missing argument is False (default), in which case items not part of + any of the specified aggregation fields would be filtered out. This demonstrates a slight complication with + this particular case where an extra level of aggregation needs to be introduced (dummy_date_histogram). + This extra bit of cruft, necessary to get the ElasticSearch query to work as expected, manifests itself in + the query result as well and is dispensed with using the prune_elasticsearch_aggregation_results function below.
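+ For instance (an illustrative sketch of that pruning): a result fragment such as + {"dummy_date_histogram": {"buckets": [...]}} is collapsed in place to just {"buckets": [...]} + within its enclosing aggregation.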
""" global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE @@ -132,7 +132,8 @@ def create_elasticsearch_aggregation_query(fields: List[str], missing_value=missing_value, create_field_aggregation=create_field_aggregation, _toplevel=False): if extra_nesting_for_date_histogram_and_filter: - aggregation[property_name]["aggs"] = {"dummy_date_histogram": {**field_aggregation, "aggs": nested_aggregation}} + aggregation[property_name]["aggs"] = \ + {"dummy_date_histogram": {**field_aggregation, "aggs": nested_aggregation}} else: aggregation[property_name]["aggs"] = nested_aggregation return aggregation diff --git a/src/encoded/tests/test_elasticsearch_utils.py b/src/encoded/tests/test_elasticsearch_utils.py new file mode 100644 index 000000000..7ed6ac11e --- /dev/null +++ b/src/encoded/tests/test_elasticsearch_utils.py @@ -0,0 +1,122 @@ +from hms_utils.misc_utils import dj +import pytest +from typing import Optional +from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query +from encoded.recent_files_summary import (AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR) + +def test_create_elasticsearch_aggregation_query_a(): + + def create_field_aggregation(field: str) -> Optional[dict]: + if field == AGGREGATION_FIELD_RELEASE_DATE: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + } + } + + aggregations = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, create_field_aggregation=create_field_aggregation) + + assert aggregation_query == { + "file_status_tracking.released": { + "meta": {"field_name": "file_status_tracking.released"}, + "filter": { + "bool": { + "must": [ + {"exists": {"field": "embedded.file_status_tracking.released.raw"}}, + {"exists": {"field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw"}}, + {"exists": {"field": "embedded.release_tracker_description.raw"}} + ] + } + }, + "aggs": { + "dummy_date_histogram": { + "date_histogram": { + "field": "embedded.file_status_tracking.released", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": { "_key": "desc" } + }, + "aggs": { + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "terms": { + "field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": { "field_name": "release_tracker_description" }, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } + } + } + + +def test_create_elasticsearch_aggregation_query_b(): + + def create_field_aggregation(field: str) -> Optional[dict]: + if field == AGGREGATION_FIELD_RELEASE_DATE: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + } + } + + aggregations = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + + # Same as previous tests but with include_missing=True (no date_histogram complication). 
+ aggregation_query = create_elasticsearch_aggregation_query( + aggregations, create_field_aggregation=create_field_aggregation, include_missing=True) + + aggregation_query == { + "file_status_tracking.released": { + "meta": {"field_name": "file_status_tracking.released"}, + "date_histogram": { + "field": "embedded.file_status_tracking.released", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + }, + "aggs": { + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "terms": { + "field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } From b36eb83fb0f8da5270665859b89aa5ef45c48967 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 12:51:41 -0500 Subject: [PATCH 14/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 30 +-- src/encoded/tests/test_elasticsearch_utils.py | 171 +++++++++++++++++- 2 files changed, 180 insertions(+), 21 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 1bb64481b..49d158484 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -165,19 +165,19 @@ def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bo of it is made and the merge done to it. In eiter case the resultant merged target is returned. 
For example: target = { - "meta": { "field_name": "date_created" }, "doc_count": 15, + "meta": {"field_name": "date_created"}, "doc_count": 15, "buckets": [ { "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 13, "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { - "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, "buckets": [ { "key": "COLO829T", "doc_count": 7, "release_tracker_description": { - "meta": { "field_name": "release_tracker_description" }, + "meta": {"field_name": "release_tracker_description"}, "buckets": [ - { "key": "WGS ONT PromethION 24 bam", "doc_count": 1 } + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} ] } } @@ -188,19 +188,19 @@ def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bo } source = { - "meta": { "field_name": "date_created" }, "doc_count": 16, + "meta": {"field_name": "date_created"}, "doc_count": 16, "buckets": [ { "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 14, "donors.display_title": { - "meta": { "field_name": "donors.display_title" }, + "meta": {"field_name": "donors.display_title"}, "buckets": [ { "key": "DAC_DONOR_COLO829", "doc_count": 12, "release_tracker_description": { - "meta": { "field_name": "release_tracker_description" }, + "meta": {"field_name": "release_tracker_description"}, "buckets": [ - { "key": "Fiber-seq PacBio Revio bam", "doc_count": 4 } + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} ] } } @@ -211,33 +211,33 @@ def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bo } merge_elasticsearch_aggregation_results(target, source) == { - "meta": { "field_name": "date_created" }, "doc_count": 15, + "meta": {"field_name": "date_created"}, "doc_count": 15, "buckets": [ { "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { - "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, "buckets": [ { "key": "COLO829T", "doc_count": 7, "release_tracker_description": { - "meta": { "field_name": "release_tracker_description" }, + "meta": {"field_name": "release_tracker_description"}, "buckets": [ - { "key": "WGS ONT PromethION 24 bam", "doc_count": 1 } + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} ] } } ] }, "donors.display_title": { - "meta": { "field_name": "donors.display_title" }, + "meta": {"field_name": "donors.display_title"}, "buckets": [ { "key": "DAC_DONOR_COLO829", "doc_count": 12, "release_tracker_description": { - "meta": { "field_name": "release_tracker_description" }, + "meta": {"field_name": "release_tracker_description"}, "buckets": [ - { "key": "Fiber-seq PacBio Revio bam", "doc_count": 4 } + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} ] } } diff --git a/src/encoded/tests/test_elasticsearch_utils.py b/src/encoded/tests/test_elasticsearch_utils.py index 7ed6ac11e..d44fdc089 100644 --- a/src/encoded/tests/test_elasticsearch_utils.py +++ b/src/encoded/tests/test_elasticsearch_utils.py @@ -1,10 +1,11 @@ -from hms_utils.misc_utils import dj import pytest from typing import Optional from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query -from encoded.recent_files_summary import (AGGREGATION_FIELD_RELEASE_DATE, - 
AGGREGATION_FIELD_CELL_LINE, - AGGREGATION_FIELD_FILE_DESCRIPTOR) +from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results +from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results +from encoded.recent_files_summary import AGGREGATION_FIELD_RELEASE_DATE +from encoded.recent_files_summary import AGGREGATION_FIELD_CELL_LINE +from encoded.recent_files_summary import AGGREGATION_FIELD_FILE_DESCRIPTOR def test_create_elasticsearch_aggregation_query_a(): @@ -55,7 +56,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: }, "aggs": { "release_tracker_description": { - "meta": { "field_name": "release_tracker_description" }, + "meta": {"field_name": "release_tracker_description"}, "terms": { "field": "embedded.release_tracker_description.raw", "missing": "No value", "size": 100 @@ -92,7 +93,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: aggregation_query = create_elasticsearch_aggregation_query( aggregations, create_field_aggregation=create_field_aggregation, include_missing=True) - aggregation_query == { + assert aggregation_query == { "file_status_tracking.released": { "meta": {"field_name": "file_status_tracking.released"}, "date_histogram": { @@ -120,3 +121,161 @@ def create_field_aggregation(field: str) -> Optional[dict]: } } } + + +def test_merge_elasticsearch_aggregation_results_a(): + + target = { + "meta": {"field_name": "date_created"}, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 13, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + } + } + ] + } + + source = { + "meta": {"field_name": "date_created"}, "doc_count": 16, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 14, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + + assert merge_elasticsearch_aggregation_results(target, source) == { + "meta": {"field_name": "date_created"}, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + }, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + + +def test_normalize_elasticsearch_aggregation_results_a(): + + results = { + "meta": {"field_name": "date_created"}, 
"doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + }, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + + assert normalize_elasticsearch_aggregation_results(results) == { + "count": 25, + "items": [ + { + "name": "date_created", + "value": "2024-12", "count": 11, + "items": [ + { + "name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code", + "value": "COLO829T", "count": 1, + "items": [ + { + "name": "release_tracker_description", + "value": "WGS ONT PromethION 24 bam", "count": 1 + } + ] + }, + { + "name": "donors.display_title", + "value": "DAC_DONOR_COLO829", "count": 4, + "items": [ + { + "name": "release_tracker_description", + "value": "Fiber-seq PacBio Revio bam", "count": 4 + } + ] + } + ] + } + ] + } From 911808bfc49a0fce1951f204f20f4bd9c25ab66a Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 13:41:37 -0500 Subject: [PATCH 15/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 9 ++++++++- src/encoded/tests/test_elasticsearch_utils.py | 20 +++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 49d158484..b01e76231 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -324,7 +324,14 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ if copy is True: target = deepcopy(target) - return merge_results(target, source)[1] + + _, target = merge_results(target, source) + + if (((source_item_count := get_aggregation_bucket_doc_count(source)) is not None) and + (get_aggregation_bucket_doc_count(target) is not None)): # noqa + target["doc_count"] += source_item_count + + return target def normalize_elasticsearch_aggregation_results(aggregation: dict, additional_properties: Optional[dict] = None, diff --git a/src/encoded/tests/test_elasticsearch_utils.py b/src/encoded/tests/test_elasticsearch_utils.py index d44fdc089..97d690500 100644 --- a/src/encoded/tests/test_elasticsearch_utils.py +++ b/src/encoded/tests/test_elasticsearch_utils.py @@ -126,10 +126,10 @@ def create_field_aggregation(field: str) -> Optional[dict]: def test_merge_elasticsearch_aggregation_results_a(): target = { - "meta": {"field_name": "date_created"}, "doc_count": 15, + "meta": {"field_name": "date_created"}, "doc_count": 7, "buckets": [ { - "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 13, + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 7, "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, "buckets": [ @@ -138,7 +138,7 @@ def test_merge_elasticsearch_aggregation_results_a(): 
"release_tracker_description": { "meta": {"field_name": "release_tracker_description"}, "buckets": [ - {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + {"key": "WGS ONT PromethION 24 bam", "doc_count": 7} ] } } @@ -149,10 +149,10 @@ def test_merge_elasticsearch_aggregation_results_a(): } source = { - "meta": {"field_name": "date_created"}, "doc_count": 16, + "meta": {"field_name": "date_created"}, "doc_count": 12, "buckets": [ { - "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 14, + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 12, "donors.display_title": { "meta": {"field_name": "donors.display_title"}, "buckets": [ @@ -161,7 +161,7 @@ def test_merge_elasticsearch_aggregation_results_a(): "release_tracker_description": { "meta": {"field_name": "release_tracker_description"}, "buckets": [ - {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 12} ] } } @@ -172,10 +172,10 @@ def test_merge_elasticsearch_aggregation_results_a(): } assert merge_elasticsearch_aggregation_results(target, source) == { - "meta": {"field_name": "date_created"}, "doc_count": 15, + "meta": {"field_name": "date_created"}, "doc_count": 19, "buckets": [ { - "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 19, "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, "buckets": [ @@ -184,7 +184,7 @@ def test_merge_elasticsearch_aggregation_results_a(): "release_tracker_description": { "meta": {"field_name": "release_tracker_description"}, "buckets": [ - {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + {"key": "WGS ONT PromethION 24 bam", "doc_count": 7} ] } } @@ -198,7 +198,7 @@ def test_merge_elasticsearch_aggregation_results_a(): "release_tracker_description": { "meta": {"field_name": "release_tracker_description"}, "buckets": [ - {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 12} ] } } From 7bde707eaa310c76336398b7e99fc7308515aa87 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 15:02:47 -0500 Subject: [PATCH 16/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index b01e76231..df24a6e0f 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -1,3 +1,4 @@ +from hms_utils.misc_utils import dj from copy import deepcopy from typing import Any, Callable, List, Optional, Tuple, Union @@ -275,7 +276,7 @@ def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: return doc_count return None - def get_aggregation_buckets_doc_count(aggregation: dict): + def get_aggregation_buckets_doc_count(aggregation: dict) -> int: buckets_doc_count = 0 if get_aggregation_key(aggregation): for aggregation_bucket in aggregation["buckets"]: @@ -293,7 +294,7 @@ def find_aggregation_bucket(aggregation: dict, value: str) -> Optional[dict]: def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[int]]: merged_item_count = 0 if not ((aggregation_key := get_aggregation_key(source)) and (get_aggregation_key(target) == aggregation_key)): - return None, None + return 0, None for source_bucket in 
source["buckets"]: if (((source_bucket_value := get_aggregation_bucket_value(source_bucket)) is None) or ((source_bucket_item_count := get_aggregation_bucket_doc_count(source_bucket)) is None)): # noqa @@ -301,13 +302,15 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ if (target_bucket := find_aggregation_bucket(target, source_bucket_value)): if source_nested_aggregation := get_nested_aggregation(source_bucket): if target_nested_aggregation := get_nested_aggregation(target_bucket): - merged_item_count, _ = merge_results(target_nested_aggregation, source_nested_aggregation) - if merged_item_count is None: + merged_item_count, merged_results = merge_results(target_nested_aggregation, source_nested_aggregation) + if merged_results is None: if source_nested_aggregation_key := get_aggregation_key(source_nested_aggregation): - target_bucket[source_nested_aggregation_key] = \ - source_bucket[source_nested_aggregation_key] - target_bucket["doc_count"] += \ - get_aggregation_buckets_doc_count(source_bucket[source_nested_aggregation_key]) + target_bucket[source_nested_aggregation_key] = ( + source_nested_bucket := source_bucket[source_nested_aggregation_key]) + if (source_nested_bucket_item_count := + get_aggregation_buckets_doc_count(source_nested_bucket)) > 0: + target_bucket["doc_count"] += source_nested_bucket_item_count + merged_item_count += source_nested_bucket_item_count elif merged_item_count > 0: target_bucket["doc_count"] += merged_item_count elif get_aggregation_bucket_value(target_bucket) is not None: @@ -320,16 +323,15 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ target["doc_count"] += source_bucket_item_count else: target["doc_count"] = source_bucket_item_count + merged_item_count += source_bucket_item_count return merged_item_count, target if copy is True: target = deepcopy(target) - _, target = merge_results(target, source) - - if (((source_item_count := get_aggregation_bucket_doc_count(source)) is not None) and - (get_aggregation_bucket_doc_count(target) is not None)): # noqa - target["doc_count"] += source_item_count + merged_item_count, target = merge_results(target, source) + if (merged_item_count > 0) and (get_aggregation_bucket_doc_count(target) is not None): + target["doc_count"] += merged_item_count return target From e59840a929a52868c3676dee2e0ee5cb00177d25 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 8 Dec 2024 15:09:15 -0500 Subject: [PATCH 17/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index df24a6e0f..ad9c5f92a 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -1,4 +1,3 @@ -from hms_utils.misc_utils import dj from copy import deepcopy from typing import Any, Callable, List, Optional, Tuple, Union @@ -276,7 +275,7 @@ def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: return doc_count return None - def get_aggregation_buckets_doc_count(aggregation: dict) -> int: + def get_aggregation_total_buckets_doc_count(aggregation: dict) -> int: buckets_doc_count = 0 if get_aggregation_key(aggregation): for aggregation_bucket in aggregation["buckets"]: @@ -308,15 +307,14 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ target_bucket[source_nested_aggregation_key] = ( source_nested_bucket := 
source_bucket[source_nested_aggregation_key])
                                 if (source_nested_bucket_item_count :=
-                                        get_aggregation_buckets_doc_count(source_nested_bucket)) > 0:
+                                        get_aggregation_total_buckets_doc_count(source_nested_bucket)) > 0:
                                     target_bucket["doc_count"] += source_nested_bucket_item_count
                                     merged_item_count += source_nested_bucket_item_count
                             elif merged_item_count > 0:
                                 target_bucket["doc_count"] += merged_item_count
-                            elif get_aggregation_bucket_value(target_bucket) is not None:
-                                if get_aggregation_bucket_doc_count(target_bucket) is not None:
-                                    target_bucket["doc_count"] += source_bucket_item_count
-                                    merged_item_count += source_bucket_item_count
+                            elif get_aggregation_bucket_doc_count(target_bucket) is not None:
+                                target_bucket["doc_count"] += source_bucket_item_count
+                                merged_item_count += source_bucket_item_count
             else:
                 target["buckets"].append(source_bucket)
             if isinstance(target.get("doc_count"), int):

From d772ec1caff7a5c6c08838fac0176ceacd4cf6a1 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 8 Dec 2024 20:33:29 -0500
Subject: [PATCH 18/78] refactoring /recent_files_summary endpoint

---
 src/encoded/elasticsearch_utils.py  |  8 +++++
 src/encoded/recent_files_summary.py | 52 ++++++++++++++---------------
 2 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py
index ad9c5f92a..b501a05f0 100644
--- a/src/encoded/elasticsearch_utils.py
+++ b/src/encoded/elasticsearch_utils.py
@@ -1,3 +1,4 @@
+from hms_utils.misc_utils import dj
 from copy import deepcopy
 from typing import Any, Callable, List, Optional, Tuple, Union
 
@@ -435,6 +436,13 @@ def normalize_results(aggregation: dict,
         if nested_aggregations := get_nested_aggregations(bucket):
             for nested_aggregation in nested_aggregations:
                 if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value):
+                    if normalized_aggregation["count"] != bucket_item_count:
+                        # Record the original doc_count value from the raw result;
+                        # this may be different (lesser) than the result we aggregate here
+                        # because ElasticSearch aggregations actually are based on unique values.
+                        # TODO: Should we use this as the real count value though it may look wrong.
+                        normalized_aggregation["count_original"] = bucket_item_count
+                        # normalized_aggregation["count"] = bucket_item_count
                     if group_item := find_group_item(group_items, bucket_value):
                         for normalized_aggregation_item in normalized_aggregation["items"]:
                             group_item["items"].append(normalized_aggregation_item)
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index 551001bcf..0d5dd7213 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -156,13 +156,34 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer
 
     raw_results = execute_query(request, query, aggregation_query)
 
-    # Note that the doc_count values returned by ElasticSearch do actually seem to be for unique items,
+    if raw:
+        # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results.
+        # And note that unless we remove the @id property we get redirected to the URL in this field,
+        # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control
+        # &file_status_tracking.released.from=2024-09-30
+        # &file_status_tracking.released.to=2024-12-31&from=0&limit=0'
+        if "@id" in raw_results:
+            del raw_results["@id"]
+        return raw_results
+
+    if not (raw_results := raw_results.get("aggregations")):
+        return {}
+
+    if debug:
+        raw_results = deepcopy(raw_results)  # otherwise may be overwritten by below
+
+    prune_elasticsearch_aggregation_results(raw_results)
+    merged_results = merge_elasticsearch_aggregation_results(
+        raw_results.get(aggregate_by_cell_line_property_name),
+        raw_results.get(aggregate_by_donor_property_name))
+
+    # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items,
     # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3
     # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005),
-    # then it its doc_count will not count it twice. This creates a situation where it might look like the counts
-    # are wrong in this returned merged/normalized result set where the outer item count is less than the sum of
-    # the individual counts withni each sub-group. For example, the below result shows a top-level doc_count of 1
-    # even though there are 2 documents, 1 in the HG00438 group and the other in the HG005 it would be because
+    # then its doc_count will NOT be counted TWICE. This creates a situation where it might LOOK like the counts
+    # are WRONG in the MERGED (via returned merge_elasticsearch_aggregation_results) result set, where the outer
+    # item count may be less than the sum of the individual counts within each sub-group. For example, the below result shows
+    # a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the other in the HG005 it would be because
     # the same unique file has a cell_line.code of both HG00438 and HG005.
     # {
    #     "meta": { "field_name": "file_status_tracking.released" },
    #     "buckets": [
    #         {
    #             "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 1,
    #             "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": {
    #                 "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" },
    #                 "buckets": [
    #                     {   "key": "HG00438", "doc_count": 1,
    #                         "release_tracker_description": {
    #                             "meta": { "field_name": "release_tracker_description" },
    #                             "buckets": [ { "key": "WGS ONT PromethION 24 bam", "doc_count": 1 } ]
    #                         }
    #                     },
    #                     {   "key": "HG005", "doc_count": 1,
    #                         "release_tracker_description": {
    #                             "meta": { "field_name": "release_tracker_description" },
    #                             "buckets": [ { "key": "Fiber-seq PacBio Revio bam", "doc_count": 1 } ]
    #                         }
    #                     }
    #                 ]
    #             }
    #         }
    #     ]
    # }
@@ -194,27 +215,6 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer
 
-    if raw:
-        # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results.
- # And note that unless we remove teh @id property we get redirected to the URL in this field, - # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control - # &file_status_tracking.released.from=2024-09-30 - # &file_status_tracking.released.to=2024-12-31&from=0&limit=0' - if "@id" in raw_results: - del raw_results["@id"] - return raw_results - - if not (raw_results := raw_results.get("aggregations")): - return {} - - if debug: - raw_results = deepcopy(raw_results) # otherwise may be overwritten by below - - prune_elasticsearch_aggregation_results(raw_results) - merged_results = merge_elasticsearch_aggregation_results( - raw_results.get(aggregate_by_cell_line_property_name), - raw_results.get(aggregate_by_donor_property_name)) - if debug: additional_properties = { "debug": { From 8ddcd24e4649938e73919c4bd94a3b088e7ae9e9 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 9 Dec 2024 17:01:57 -0500 Subject: [PATCH 19/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 31 ++++--- src/encoded/recent_files_summary.py | 132 ++++++++++++++++++++++------ 2 files changed, 124 insertions(+), 39 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index b501a05f0..17e139d8c 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -12,6 +12,7 @@ def create_elasticsearch_aggregation_query(fields: List[str], missing_value: Optional[str] = None, include_missing: bool = False, create_field_aggregation: Optional[Callable] = None, + create_field_filter: Optional[Callable] = None, _toplevel: bool = True) -> dict: """ @@ -115,13 +116,15 @@ def create_elasticsearch_aggregation_query(fields: List[str], extra_nesting_for_date_histogram_and_filter = "date_histogram" in field_aggregation for field in fields: if isinstance(field, str) and (field := field.strip()): + if not (callable(create_field_filter) and isinstance(filter := create_field_filter(field), dict)): + filter = { + "exists": { + "field": f"embedded.{field}.raw" + } + } if not aggregation[property_name].get("filter"): aggregation[property_name]["filter"] = {"bool": {"must": []}} - aggregation[property_name]["filter"]["bool"]["must"].append({ - "exists": { - "field": f"embedded.{field}.raw" - } - }) + aggregation[property_name]["filter"]["bool"]["must"].append(filter) else: extra_nesting_for_date_histogram_and_filter = False @@ -308,7 +311,7 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ target_bucket[source_nested_aggregation_key] = ( source_nested_bucket := source_bucket[source_nested_aggregation_key]) if (source_nested_bucket_item_count := - get_aggregation_total_buckets_doc_count(source_nested_bucket)) > 0: + get_aggregation_total_buckets_doc_count(source_nested_bucket)) > 0: # noqa target_bucket["doc_count"] += source_nested_bucket_item_count merged_item_count += source_nested_bucket_item_count elif merged_item_count > 0: @@ -336,7 +339,8 @@ def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[ def normalize_elasticsearch_aggregation_results(aggregation: dict, additional_properties: Optional[dict] = None, - remove_empty_items: bool = True) -> dict: + remove_empty_items: bool = True, + retain_original_item_count: bool = False) -> dict: """ Normalizes the given result of an ElasticSearch aggregation query into a more readable/consumable format. 
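A quick aside for readers following patches 15 through 19: the merge these commits keep refining only applies to two aggregation results keyed on the same field; bucket doc_count values are summed when a key occurs on both sides, and source buckets with new keys are adopted wholesale. The following is a minimal, self-contained sketch of just that idea, not the module code itself (which also walks nested aggregations and missing-value buckets); the donor key and the 7/12 counts are taken from the test data above.

    from typing import Optional

    def merge_terms_aggregations(target: dict, source: dict) -> Optional[dict]:
        # Only aggregation results over the same field are mergeable.
        if target.get("meta", {}).get("field_name") != source.get("meta", {}).get("field_name"):
            return None
        target_buckets = {bucket["key"]: bucket for bucket in target["buckets"]}
        for bucket in source["buckets"]:
            if (target_bucket := target_buckets.get(bucket["key"])):
                target_bucket["doc_count"] += bucket["doc_count"]  # same key: sum the counts
            else:
                target["buckets"].append(bucket)  # new key: adopt the source bucket
        return target

    a = {"meta": {"field_name": "donors.display_title"},
         "buckets": [{"key": "DAC_DONOR_COLO829", "doc_count": 7}]}
    b = {"meta": {"field_name": "donors.display_title"},
         "buckets": [{"key": "DAC_DONOR_COLO829", "doc_count": 12}]}
    assert merge_terms_aggregations(a, b)["buckets"][0]["doc_count"] == 19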
@@ -422,7 +426,7 @@ def normalize_results(aggregation: dict,
                           key: Optional[str] = None,
                           value: Optional[str] = None,
                           additional_properties: Optional[dict] = None) -> dict:
-        nonlocal remove_empty_items
+        nonlocal remove_empty_items, retain_original_item_count
 
         if not (aggregation_key := get_aggregation_key(aggregation)):
             return {}
@@ -437,12 +441,11 @@ def normalize_results(aggregation: dict,
             for nested_aggregation in nested_aggregations:
                 if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value):
                     if normalized_aggregation["count"] != bucket_item_count:
-                        # Record the original doc_count value from the raw result;
-                        # this may be different (lesser) than the result we aggregate here
-                        # because ElasticSearch aggregations actually are based on unique values.
-                        # TODO: Should we use this as the real count value though it may look wrong.
-                        normalized_aggregation["count_original"] = bucket_item_count
-                        # normalized_aggregation["count"] = bucket_item_count
+                        if retain_original_item_count is True:
+                            # The original doc_count value from the raw result may be different from (less than)
+                            # the result we aggregate here because ElasticSearch aggregations actually are based
+                            # on unique values. Should we use this as the real count value though it may look wrong?
+                            normalized_aggregation["count"] = bucket_item_count
                     if group_item := find_group_item(group_items, bucket_value):
                         for normalized_aggregation_item in normalized_aggregation["items"]:
                             group_item["items"].append(normalized_aggregation_item)
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index 0d5dd7213..fea7e5811 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -1,7 +1,9 @@
+from hms_utils.misc_utils import dj
 import pyramid
 from copy import deepcopy
 from typing import List, Optional
 from urllib.parse import urlencode
+from dcicutils.misc_utils import normalize_spaces
 from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query
 from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results
 from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results
@@ -55,8 +57,9 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
 
     date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE)
     max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS)
-    include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "inovalues"))
+    include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues"))
     nosort = request_arg_bool(request, "nosort")
+    simplified = request_arg_bool(request, "simplified")
     debug = request_arg_bool(request, "debug")
     debug_query = request_arg_bool(request, "debug_query")
     raw = request_arg_bool(request, "raw")
@@ -124,6 +127,68 @@ def create_field_aggregation(field: str) -> Optional[dict]:  # noqa
             create_field_aggregation=create_field_aggregation)
         return aggregation_query[date_property_name]
 
+    def create_aggregation_query_simplified(aggregation_fields: List[str]) -> dict:
+        global AGGREGATION_NO_VALUE
+        nonlocal date_property_name, max_buckets, include_missing
+        aggregations = []
+        if not isinstance(aggregation_fields, list):
+            aggregation_fields = [aggregation_fields]
+        for item in aggregation_fields:
+            if isinstance(item, str) and (item := item.strip()) and (item not in aggregations):
+                aggregations.append(item)
+        if not aggregations:
+ return {} + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal date_property_name + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + elif field == AGGREGATION_FIELD_CELL_LINE: + script = normalize_spaces(f""" + if (doc['embedded.{AGGREGATION_FIELD_CELL_LINE}.raw'].size() > 0) {{ + return doc['embedded.{AGGREGATION_FIELD_CELL_LINE}.raw'].value; + }} else if (doc['embedded.{AGGREGATION_FIELD_DONOR}.raw'].size() > 0) {{ + return doc['embedded.{AGGREGATION_FIELD_DONOR}.raw'].value; + }} else {{ + return 'unknown'; + }} + """) + return { + "terms": { + "script": { + "source": script, + "lang": "painless" + }, + "size": max_buckets + } + } + def create_field_filter(field: str) -> Optional[dict]: # noqa + if field == AGGREGATION_FIELD_CELL_LINE: + return { + "bool": { + "should": [ + {"exists": { "field": f"embedded.{AGGREGATION_FIELD_CELL_LINE}.raw"}}, + {"exists": { "field": f"embedded.{AGGREGATION_FIELD_DONOR}.raw"}} + ], + "minimum_should_match": 1 + } + } + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation, + create_field_filter=create_field_filter) + return aggregation_query[date_property_name] + def execute_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: request = snovault_make_search_subreq(request, path=query, method="GET") results = snovault_search(None, request, custom_aggregations=aggregation_query) @@ -131,30 +196,44 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer query = create_query(request) - aggregations_by_cell_line = [ - date_property_name, - AGGREGATION_FIELD_CELL_LINE, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - - aggregations_by_donor = [ - date_property_name, - AGGREGATION_FIELD_DONOR, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - - aggregate_by_cell_line_property_name = "aggregate_by_cell_line" - aggregate_by_donor_property_name = "aggregate_by_donor" - - aggregation_query = { - aggregate_by_cell_line_property_name: create_aggregation_query(aggregations_by_cell_line), - aggregate_by_donor_property_name: create_aggregation_query(aggregations_by_donor) - } + if simplified: + aggregate_by_cell_line_property_name = "aggregate_by_cell_line" + aggregate_by_cell_line = [ + date_property_name, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregation_query = { + aggregate_by_cell_line_property_name: create_aggregation_query_simplified(aggregate_by_cell_line) + } + else: + aggregate_by_cell_line_property_name = "aggregate_by_cell_line" + aggregate_by_cell_line = [ + date_property_name, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregate_by_donor_property_name = "aggregate_by_donor" + aggregate_by_donor = [ + date_property_name, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregation_query = { + aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line), + aggregate_by_donor_property_name: create_aggregation_query(aggregate_by_donor) + } if debug_query: return {"query": query, "aggregation_query": aggregation_query} + dj(aggregation_query) + import pdb ; pdb.set_trace() # noqa + pass raw_results = execute_query(request, query, 
aggregation_query) + dj(raw_results) + import pdb ; pdb.set_trace() # noqa + pass if raw: # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. @@ -173,9 +252,12 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer raw_results = deepcopy(raw_results) # otherwise may be overwritten by below prune_elasticsearch_aggregation_results(raw_results) - merged_results = merge_elasticsearch_aggregation_results( - raw_results.get(aggregate_by_cell_line_property_name), - raw_results.get(aggregate_by_donor_property_name)) + + if simplified: + merged_results = raw_results.get(aggregate_by_cell_line_property_name) + else: + merged_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), + raw_results.get(aggregate_by_donor_property_name)) # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items, # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3 @@ -227,8 +309,8 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer else: additional_properties = None - normalized_results = normalize_elasticsearch_aggregation_results( - merged_results, additional_properties=additional_properties) + normalized_results = normalize_elasticsearch_aggregation_results(merged_results, + additional_properties=additional_properties) if nosort is not True: # We can sort on the aggregations by level; outermost/left to innermost/right. From 81ae204555eb6f7c3a0e6b7cb46a6e25c9b29f5f Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 9 Dec 2024 17:52:22 -0500 Subject: [PATCH 20/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 62 ++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index fea7e5811..2decf5929 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -58,8 +58,9 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) + favor_donor = request_arg_bool(request, "favor_donor") nosort = request_arg_bool(request, "nosort") - simplified = request_arg_bool(request, "simplified") + legacy = request_arg_bool(request, "legacy") debug = request_arg_bool(request, "debug") debug_query = request_arg_bool(request, "debug_query") raw = request_arg_bool(request, "raw") @@ -96,7 +97,7 @@ def create_query(request: pyramid.request.Request) -> str: query_string = query_string.replace("=%21", "%21=") return f"/search/?{query_string}" - def create_aggregation_query(aggregation_fields: List[str]) -> dict: + def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: global AGGREGATION_NO_VALUE nonlocal date_property_name, max_buckets, include_missing aggregations = [] @@ -127,9 +128,9 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa create_field_aggregation=create_field_aggregation) return aggregation_query[date_property_name] - def create_aggregation_query_simplified(aggregation_fields: List[str]) -> dict: + def create_aggregation_query(aggregation_fields: List[str]) -> dict: global 
AGGREGATION_NO_VALUE - nonlocal date_property_name, max_buckets, include_missing + nonlocal date_property_name, max_buckets, include_missing, favor_donor aggregations = [] if not isinstance(aggregation_fields, list): aggregation_fields = [aggregation_fields] @@ -151,11 +152,23 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa } } elif field == AGGREGATION_FIELD_CELL_LINE: + # This specializes the aggregation query to group first by the cell-line field, + # and then alternatively (if a cell-line field does not exist) by the donor field. + # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively + # look first for the donor field and then secondarily for the cell-line field. + if favor_donor: + field_one = AGGREGATION_FIELD_DONOR + field_two = AGGREGATION_FIELD_CELL_LINE + else: + field_one = AGGREGATION_FIELD_CELL_LINE + field_two = AGGREGATION_FIELD_DONOR + # Note how we prefix the result with the aggregation field name; + # this is so later we can tell which grouping/field was matched. script = normalize_spaces(f""" - if (doc['embedded.{AGGREGATION_FIELD_CELL_LINE}.raw'].size() > 0) {{ - return doc['embedded.{AGGREGATION_FIELD_CELL_LINE}.raw'].value; - }} else if (doc['embedded.{AGGREGATION_FIELD_DONOR}.raw'].size() > 0) {{ - return doc['embedded.{AGGREGATION_FIELD_DONOR}.raw'].value; + if (doc['embedded.{field_one}.raw'].size() > 0) {{ + return '{field_one}:' + doc['embedded.{field_one}.raw'].value; + }} else if (doc['embedded.{field_two}.raw'].size() > 0) {{ + return '{field_two}:' + doc['embedded.{field_two}.raw'].value; }} else {{ return 'unknown'; }} @@ -194,9 +207,24 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer results = snovault_search(None, request, custom_aggregations=aggregation_query) return results + def fixup_names_values(normalized_results: dict) -> None: + if isinstance(normalized_results, dict): + if (isinstance(name := normalized_results.get("name"), str) and + isinstance(value := normalized_results.get("value"), str)): + if (colon := value.find(":")) > 0: + if (prefix := value[0:colon]) == AGGREGATION_FIELD_CELL_LINE: + normalized_results["name"] = AGGREGATION_FIELD_CELL_LINE + normalized_results["value"] = value[colon + 1:] + elif prefix == AGGREGATION_FIELD_DONOR: + normalized_results["name"] = AGGREGATION_FIELD_DONOR + normalized_results["value"] = value[colon + 1:] + if isinstance(items := normalized_results.get("items"), list): + for element in items: + fixup_names_values(element) + query = create_query(request) - if simplified: + if not legacy: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" aggregate_by_cell_line = [ date_property_name, @@ -204,7 +232,7 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer AGGREGATION_FIELD_FILE_DESCRIPTOR ] aggregation_query = { - aggregate_by_cell_line_property_name: create_aggregation_query_simplified(aggregate_by_cell_line) + aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line) } else: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" @@ -220,20 +248,14 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer AGGREGATION_FIELD_FILE_DESCRIPTOR ] aggregation_query = { - aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line), - aggregate_by_donor_property_name: create_aggregation_query(aggregate_by_donor) + aggregate_by_cell_line_property_name: 
create_aggregation_query_legacy(aggregate_by_cell_line), + aggregate_by_donor_property_name: create_aggregation_query_legacy(aggregate_by_donor) } if debug_query: return {"query": query, "aggregation_query": aggregation_query} - dj(aggregation_query) - import pdb ; pdb.set_trace() # noqa - pass raw_results = execute_query(request, query, aggregation_query) - dj(raw_results) - import pdb ; pdb.set_trace() # noqa - pass if raw: # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. @@ -253,7 +275,7 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer prune_elasticsearch_aggregation_results(raw_results) - if simplified: + if not legacy: merged_results = raw_results.get(aggregate_by_cell_line_property_name) else: merged_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), @@ -311,6 +333,8 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer normalized_results = normalize_elasticsearch_aggregation_results(merged_results, additional_properties=additional_properties) + if not legacy: + fixup_names_values(normalized_results) if nosort is not True: # We can sort on the aggregations by level; outermost/left to innermost/right. From 021f0b052e1e8176f5321c02cb173f35a0141a66 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 9 Dec 2024 17:53:17 -0500 Subject: [PATCH 21/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 1 - src/encoded/recent_files_summary.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 17e139d8c..55a03ddf2 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -1,4 +1,3 @@ -from hms_utils.misc_utils import dj from copy import deepcopy from typing import Any, Callable, List, Optional, Tuple, Union diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 2decf5929..d76f8e7f4 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,4 +1,3 @@ -from hms_utils.misc_utils import dj import pyramid from copy import deepcopy from typing import List, Optional From 099b525f7cc65ef2f97c40a202fc1e14e13d3172 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 11:46:02 -0500 Subject: [PATCH 22/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 14 +++ src/encoded/recent_files_summary.py | 157 ++++++++++++++++++---------- 2 files changed, 116 insertions(+), 55 deletions(-) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index f03d09328..146a45ee3 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -3,6 +3,7 @@ from dateutil.relativedelta import relativedelta import pyramid from typing import Any, List, Optional, Tuple, Union +from urllib.parse import urlencode from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string @@ -176,3 +177,16 @@ def _add_months(day: Optional[Union[datetime, date, str]] = None, nmonths: int = if isinstance(nmonths, int) and (nmonths != 0): return day + relativedelta(months=nmonths) return day + + +def create_query_string(query_arguments: dict, base: Optional[str] = None) -> str: + query_string = "" + if isinstance(query_arguments, dict): + if query_arguments := {key: value for key, value in query_arguments.items() if value is not None}: + query_string = 
urlencode(query_arguments, True)
+            # Hackishness to change "=!" to "!=" in the query_string value, e.g. to turn this:
+            # {"data_category": ["!Quality Control"]} into this: data_category%21=Quality+Control
+            query_string = query_string.replace("=%21", "%21=")
+    if isinstance(base, str) and base:
+        query_string = f"{base}?{query_string}" if query_string else base
+    return query_string
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index d76f8e7f4..c3983c07d 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -1,14 +1,13 @@
 import pyramid
 from copy import deepcopy
 from typing import List, Optional
-from urllib.parse import urlencode
 from dcicutils.misc_utils import normalize_spaces
 from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query
 from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results
 from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results
 from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results
 from encoded.elasticsearch_utils import sort_normalized_aggregation_results
-from encoded.endpoint_utils import parse_date_range_related_arguments
+from encoded.endpoint_utils import create_query_string, parse_date_range_related_arguments
 from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int
 from snovault.search.search import search as snovault_search
 from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq
@@ -27,6 +26,8 @@
 AGGREGATION_MAX_BUCKETS = 100
 AGGREGATION_NO_VALUE = "No value"
 
+BASE_SEARCH_QUERY = "/search/"
+
 def recent_files_summary(request: pyramid.request.Request) -> dict:
     """
     This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return,
@@ -59,11 +66,13 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
 
     date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE)
     max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS)
+    include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True))
     include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues"))
     favor_donor = request_arg_bool(request, "favor_donor")
     nosort = request_arg_bool(request, "nosort")
     legacy = request_arg_bool(request, "legacy")
     debug = request_arg_bool(request, "debug")
     debug_query = request_arg_bool(request, "debug_query")
     raw = request_arg_bool(request, "raw")
 
-    def create_query(request: pyramid.request.Request) -> str:
+    def create_base_query_arguments(request: pyramid.request.Request) -> dict:
 
-        global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES, QUERY_RECENT_MONTHS
-        nonlocal date_property_name
+        global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES
 
         types = request_args(request, "type", QUERY_FILE_TYPES)
         statuses = request_args(request, "status", QUERY_FILE_STATUSES)
         categories = request_args(request, "category", QUERY_FILE_CATEGORIES)
+
+        base_query_arguments = {
+            "type": types if types else None,
+            "status": statuses if statuses else None,
+            "data_category": categories if categories else None
+        }
+
+        return {key: value for key, value in base_query_arguments.items() if value is not None}
+
+    def create_query(request: pyramid.request.Request, base_query_arguments: Optional[dict] = None) -> str:
+
+        global BASE_SEARCH_QUERY,
QUERY_RECENT_MONTHS, QUERY_INCLUDE_CURRENT_MONTH + nonlocal date_property_name + recent_months = request_arg_int(request, "nmonths", request_arg_int(request, "months", QUERY_RECENT_MONTHS)) from_date = request_arg(request, "from_date") thru_date = request_arg(request, "thru_date") @@ -80,56 +95,23 @@ def create_query(request: pyramid.request.Request) -> str: from_date, thru_date = parse_date_range_related_arguments(from_date, thru_date, nmonths=recent_months, include_current_month=include_current_month, strings=True) - query_parameters = { - "type": types if types else None, - "status": statuses if statuses else None, - "data_category": categories if categories else None, + query_arguments = { f"{date_property_name}.from": from_date if from_date else None, f"{date_property_name}.to": thru_date if from_date else None, "from": 0, "limit": 0 } - query_parameters = {key: value for key, value in query_parameters.items() if value is not None} - query_string = urlencode(query_parameters, True) - # Hackishness to change "=!" to "!=" in search_param_lists value for e.g. to turn this in the - # query_parameters above "data_category": ["!Quality Control"] into: data_category&21=Quality+Control - query_string = query_string.replace("=%21", "%21=") - return f"/search/?{query_string}" - def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: - global AGGREGATION_NO_VALUE - nonlocal date_property_name, max_buckets, include_missing - aggregations = [] - if not isinstance(aggregation_fields, list): - aggregation_fields = [aggregation_fields] - for item in aggregation_fields: - if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): - aggregations.append(item) - if not aggregations: - return {} - def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal date_property_name - if field == date_property_name: - return { - "date_histogram": { - "field": f"embedded.{field}", - "calendar_interval": "month", - "format": "yyyy-MM", - "missing": "1970-01", - "order": {"_key": "desc"} - } - } - aggregation_query = create_elasticsearch_aggregation_query( - aggregations, - max_buckets=max_buckets, - missing_value=AGGREGATION_NO_VALUE, - include_missing=include_missing, - create_field_aggregation=create_field_aggregation) - return aggregation_query[date_property_name] + if isinstance(base_query_arguments, dict): + query_arguments = {**base_query_arguments, **query_arguments} + + return f"{BASE_SEARCH_QUERY}?{create_query_string(query_arguments)}" def create_aggregation_query(aggregation_fields: List[str]) -> dict: + global AGGREGATION_NO_VALUE nonlocal date_property_name, max_buckets, include_missing, favor_donor + aggregations = [] if not isinstance(aggregation_fields, list): aggregation_fields = [aggregation_fields] @@ -138,6 +120,7 @@ def create_aggregation_query(aggregation_fields: List[str]) -> dict: aggregations.append(item) if not aggregations: return {} + def create_field_aggregation(field: str) -> Optional[dict]: # noqa nonlocal date_property_name if field == date_property_name: @@ -162,7 +145,8 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa field_one = AGGREGATION_FIELD_CELL_LINE field_two = AGGREGATION_FIELD_DONOR # Note how we prefix the result with the aggregation field name; - # this is so later we can tell which grouping/field was matched. + # this is so later we can tell which grouping/field was matched; + # see fixup_names_values_for_normalized_results for this fixup. 
script = normalize_spaces(f""" if (doc['embedded.{field_one}.raw'].size() > 0) {{ return '{field_one}:' + doc['embedded.{field_one}.raw'].value; @@ -181,6 +165,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa "size": max_buckets } } + def create_field_filter(field: str) -> Optional[dict]: # noqa if field == AGGREGATION_FIELD_CELL_LINE: return { @@ -192,6 +177,7 @@ def create_field_filter(field: str) -> Optional[dict]: # noqa "minimum_should_match": 1 } } + aggregation_query = create_elasticsearch_aggregation_query( aggregations, max_buckets=max_buckets, @@ -199,6 +185,43 @@ def create_field_filter(field: str) -> Optional[dict]: # noqa include_missing=include_missing, create_field_aggregation=create_field_aggregation, create_field_filter=create_field_filter) + + return aggregation_query[date_property_name] + + def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: + + global AGGREGATION_NO_VALUE + nonlocal date_property_name, max_buckets, include_missing + + aggregations = [] + if not isinstance(aggregation_fields, list): + aggregation_fields = [aggregation_fields] + for item in aggregation_fields: + if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): + aggregations.append(item) + if not aggregations: + return {} + + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal date_property_name + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation) + return aggregation_query[date_property_name] def execute_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: @@ -206,10 +229,10 @@ def execute_query(request: pyramid.request.Request, query: str, aggregation_quer results = snovault_search(None, request, custom_aggregations=aggregation_query) return results - def fixup_names_values(normalized_results: dict) -> None: + def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: + nonlocal include_queries if isinstance(normalized_results, dict): - if (isinstance(name := normalized_results.get("name"), str) and - isinstance(value := normalized_results.get("value"), str)): + if isinstance(value := normalized_results.get("value"), str): if (colon := value.find(":")) > 0: if (prefix := value[0:colon]) == AGGREGATION_FIELD_CELL_LINE: normalized_results["name"] = AGGREGATION_FIELD_CELL_LINE @@ -219,9 +242,31 @@ def fixup_names_values(normalized_results: dict) -> None: normalized_results["value"] = value[colon + 1:] if isinstance(items := normalized_results.get("items"), list): for element in items: - fixup_names_values(element) + fixup_names_values_for_normalized_results(element) + + def add_queries_to_normalized_results(normalized_results: dict, base_query_arguments: dict) -> None: + global BASE_SEARCH_QUERY + nonlocal date_property_name + if isinstance(normalized_results, dict): + if not (name := normalized_results.get("name")): + normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) + elif value := normalized_results.get("value"): + if name == date_property_name: + # Special case for date value which is just year/month (e.g. 
2024-12);
+                    # we want to turn this into a date range query for the month.
+                    from_date, thru_date = parse_date_range_related_arguments(value, None, strings=True)
+                    if from_date and thru_date:
+                        base_query_arguments = {**base_query_arguments,
+                                                f"{name}.from": from_date, f"{name}.to": thru_date}
+                else:
+                    base_query_arguments = {**base_query_arguments, name: value}
+            normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY)
+        if isinstance(items := normalized_results.get("items"), list):
+            for element in items:
+                add_queries_to_normalized_results(element, base_query_arguments)
 
-    query = create_query(request)
+    base_query_arguments = create_base_query_arguments(request)
+    query = create_query(request, base_query_arguments)
 
     if not legacy:
         aggregate_by_cell_line_property_name = "aggregate_by_cell_line"
@@ -299,7 +330,9 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
     # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005),
     # then its doc_count will NOT be counted TWICE. This creates a situation where it might LOOK like the counts
     # are WRONG in the MERGED (via returned merge_elasticsearch_aggregation_results) result set, where the outer
-    # item count may be less than the sum of the individual counts within each sub-group. For example, the below result shows
-    # a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the other in the HG005 it would be because
-    # the same unique file has a cell_line.code of both HG00438 and HG005.
+    # item count may be less than the sum of the individual counts within each sub-group. For example, the below result
+    # shows a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the other
+    # in the HG005 group; this would be because the same unique file has a cell_line.code of both HG00438 and HG005.
     # {
     #   "meta": { "field_name": "file_status_tracking.released" },
@@ -333,7 +378,9 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
 
     normalized_results = normalize_elasticsearch_aggregation_results(merged_results,
                                                                      additional_properties=additional_properties)
     if not legacy:
-        fixup_names_values(normalized_results)
+        fixup_names_values_for_normalized_results(normalized_results)
+    if include_queries:
+        add_queries_to_normalized_results(normalized_results, base_query_arguments)
+    normalized_results["query"] = query
 
     if nosort is not True:
         # We can sort on the aggregations by level; outermost/left to innermost/right.
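For reference while reading patch 22: the custom aggregation that create_aggregation_query assembles at this point in the series is, roughly, a monthly date_histogram whose nested terms aggregation is driven by a painless script that prefixes each value with the name of the field it came from; that prefix is exactly what fixup_names_values_for_normalized_results later splits back apart. A sketch of the assembled shape follows (field names from the constants above; a max_buckets of 100 is assumed, and the real builder may add filters and extra nesting):

    CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"
    DONOR = "donors.display_title"

    # Painless script: prefer the cell-line field, fall back to the donor field,
    # tagging the returned value with the name of the field that matched.
    script = (f"if (doc['embedded.{CELL_LINE}.raw'].size() > 0)"
              f" {{ return '{CELL_LINE}:' + doc['embedded.{CELL_LINE}.raw'].value; }}"
              f" else if (doc['embedded.{DONOR}.raw'].size() > 0)"
              f" {{ return '{DONOR}:' + doc['embedded.{DONOR}.raw'].value; }}"
              f" else {{ return 'unknown'; }}")

    aggregation_query = {
        "aggregate_by_cell_line": {
            "date_histogram": {
                "field": "embedded.file_status_tracking.released",
                "calendar_interval": "month", "format": "yyyy-MM",
                "missing": "1970-01", "order": {"_key": "desc"}
            },
            "aggs": {
                CELL_LINE: {
                    "terms": {"script": {"source": script, "lang": "painless"},
                              "size": 100}
                }
            }
        }
    }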
From eac4384965bfccc0cb5422621d3350e82728a81c Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 12:52:37 -0500 Subject: [PATCH 23/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 33 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index c3983c07d..96958c05b 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -97,9 +97,7 @@ def create_query(request: pyramid.request.Request, base_query_arguments: Optiona strings=True) query_arguments = { f"{date_property_name}.from": from_date if from_date else None, - f"{date_property_name}.to": thru_date if from_date else None, - "from": 0, - "limit": 0 + f"{date_property_name}.to": thru_date if from_date else None } if isinstance(base_query_arguments, dict): @@ -224,7 +222,8 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa return aggregation_query[date_property_name] - def execute_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: + def execute_aggregation_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: + query += "&limit=0" # needed for aggregation query to not return the actual/individual item results. request = snovault_make_search_subreq(request, path=query, method="GET") results = snovault_search(None, request, custom_aggregations=aggregation_query) return results @@ -248,18 +247,17 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum global BASE_SEARCH_QUERY nonlocal date_property_name if isinstance(normalized_results, dict): - if not (name := normalized_results.get("name")): - normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) - elif value := normalized_results.get("value"): - if name == date_property_name: - # Special case for date value which is just year/month (e.g. 2024-12); - # we want to turn this into a date range query for the month. - from_date, thru_date = parse_date_range_related_arguments(value, None, strings=True) - if from_date and thru_date: - base_query_arguments = {**base_query_arguments, - f"{name}.from": from_date, f"{name}.to": thru_date} - else: - base_query_arguments = {**base_query_arguments, name: value} + if name := normalized_results.get("name"): + if value := normalized_results.get("value"): + if name == date_property_name: + # Special case for date value which is just year/month (e.g. 2024-12); + # we want to turn this into a date range query for the month. + from_date, thru_date = parse_date_range_related_arguments(value, None, strings=True) + if from_date and thru_date: + base_query_arguments = {**base_query_arguments, + f"{name}.from": from_date, f"{name}.to": thru_date} + else: + base_query_arguments = {**base_query_arguments, name: value} normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) if isinstance(items := normalized_results.get("items"), list): for element in items: @@ -299,7 +297,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum if debug_query: return {"query": query, "aggregation_query": aggregation_query} - raw_results = execute_query(request, query, aggregation_query) + raw_results = execute_aggregation_query(request, query, aggregation_query) if raw: # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. 
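A small worked illustration of the month-to-range special case in add_queries_to_normalized_results above; the widening of 2024-12 to its first and last day is the expected behavior of parse_date_range_related_arguments, shown here as assumed literals rather than a computed result:

    name, value = "file_status_tracking.released", "2024-12"
    base_query_arguments = {"type": "OutputFile", "status": "released"}
    # parse_date_range_related_arguments("2024-12", None, strings=True) is expected
    # to expand the single month to its first and last day:
    from_date, thru_date = "2024-12-01", "2024-12-31"
    query_arguments = {**base_query_arguments,
                       f"{name}.from": from_date, f"{name}.to": thru_date}
    # create_query_string(query_arguments, "/search/") then yields:
    # /search/?type=OutputFile&status=released&file_status_tracking.released.from=2024-12-01
    #         &file_status_tracking.released.to=2024-12-31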
@@ -381,6 +379,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum fixup_names_values_for_normalized_results(normalized_results) if include_queries: add_queries_to_normalized_results(normalized_results, base_query_arguments) + normalized_results["query"] = query if nosort is not True: # We can sort on the aggregations by level; outermost/left to innermost/right. From 6f244875b1f23ebeca4d820b16b7e3da71841ede Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 18:46:17 -0500 Subject: [PATCH 24/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 34 +++++++ src/encoded/recent_files_summary.py | 137 +++++++++++++++++++++++++++- 2 files changed, 168 insertions(+), 3 deletions(-) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 146a45ee3..83566a5d5 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -190,3 +190,37 @@ def create_query_string(query_arguments: dict, base: Optional[str] = None) -> st if isinstance(base, str) and base: query_string = f"{base}?{query_string}" if query_string else base return query_string + + +def get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: bool = False) -> List[Any]: + """ + TODO: Move this to dcicutils. Maybe much of the above too. + Returns the values of the given property name within the given dictionary as a list, where the + given property name can be a dot-separated list of property names, which indicate a path into + nested dictionaries within the given dictionary; and - where if any of the elements within + the path are lists then we iterate through each, collecting the values for each and including + each within the list of returned values. + """ + if isinstance(data, dict) and isinstance(name, str) and name: + if keys := name.split("."): + nkeys = len(keys) ; key_index_max = nkeys - 1 # noqa + for key_index in range(nkeys): + if (value := data.get(keys[key_index], None)) is not None: + if key_index == key_index_max: + return [value] + elif isinstance(value, dict): + data = value + continue + elif isinstance(value, list) and value and ((sub_key_index := key_index + 1) < nkeys): + sub_key = ".".join(keys[sub_key_index:]) + values = [] + for element in value: + if isinstance(element_value := get_properties(element, sub_key), list): + for element_value_item in element_value: + if (element_value_item is not None) and (element_value_item not in values): + values.append(element_value_item) + elif (element_value is not None) and (element_value not in values): + values.append(element_value) + return sorted(values) if (sort is True) else values + break + return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback]) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 96958c05b..13b4ed3a1 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -19,10 +19,17 @@ QUERY_INCLUDE_CURRENT_MONTH = True AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released" +AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code" AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" AGGREGATION_FIELD_DONOR = "donors.display_title" AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description" +AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR = [ + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_CELL_LINE, + 
AGGREGATION_FIELD_DONOR +] + AGGREGATION_MAX_BUCKETS = 100 AGGREGATION_NO_VALUE = "No value" @@ -59,11 +66,13 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True)) include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) + nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) favor_donor = request_arg_bool(request, "favor_donor") nosort = request_arg_bool(request, "nosort") legacy = request_arg_bool(request, "legacy") debug = request_arg_bool(request, "debug") debug_query = request_arg_bool(request, "debug_query") + troubleshoot = request_arg_bool(request, "troubleshoot") raw = request_arg_bool(request, "raw") def create_base_query_arguments(request: pyramid.request.Request) -> dict: @@ -120,7 +129,7 @@ def create_aggregation_query(aggregation_fields: List[str]) -> dict: return {} def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal date_property_name + nonlocal date_property_name, nomixtures if field == date_property_name: return { "date_histogram": { @@ -132,6 +141,44 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa } } elif field == AGGREGATION_FIELD_CELL_LINE: + # This specializes the aggregation query to group first by the cell-line field, + # and then alternatively (if a cell-line field does not exist) by the donor field. + # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively + # look first for the donor field and then secondarily for the cell-line field. + aggregation_field_grouping = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) + if nomixtures: + aggregation_field_grouping.remove(AGGREGATION_FIELD_CELL_MIXTURE) + if favor_donor: + aggregation_field_grouping.remove(AGGREGATION_FIELD_DONOR) + aggregation_field_grouping.insert(0, AGGREGATION_FIELD_DONOR) + # Note how we prefix the result with the aggregation field name; + # this is so later we can tell which grouping/field was matched; + # see fixup_names_values_for_normalized_results for this fixup. + script = "" + for aggregation_field_grouping_index in range(len(aggregation_field_grouping)): + aggregation_field = aggregation_field_grouping[aggregation_field_grouping_index] + if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; + }} + """ + script += f""" + else {{ + return 'unknown'; + }} + """ + return { + "terms": { + "script": { + "source": script, + "lang": "painless" + }, + "size": max_buckets + } + } + elif False and (field == AGGREGATION_FIELD_CELL_LINE): + # OBSOLETE: See above. # This specializes the aggregation query to group first by the cell-line field, # and then alternatively (if a cell-line field does not exist) by the donor field. 
# For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively
             # look first for the donor field and then secondarily for the cell-line field.
@@ -165,6 +212,13 @@ def create_field_aggregation(field: str) -> Optional[dict]:  # noqa
             }
 
     def create_field_filter(field: str) -> Optional[dict]:  # noqa
+        if field == AGGREGATION_FIELD_CELL_LINE:
+            filter = {"bool": {"should": [], "minimum_should_match": 1}}
+            for aggregation_field in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR:
+                filter["bool"]["should"].append({"exists": { "field": f"embedded.{aggregation_field}.raw"}})
+            return filter
+
+    def obsolete_create_field_filter(field: str) -> Optional[dict]:  # noqa
         if field == AGGREGATION_FIELD_CELL_LINE:
             return {
                 "bool": {
@@ -229,7 +283,19 @@ def execute_aggregation_query(request: pyramid.request.Request, query: str, aggr
         return results
 
     def fixup_names_values_for_normalized_results(normalized_results: dict) -> None:
-        nonlocal include_queries
+        global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR
+        if isinstance(normalized_results, dict):
+            if isinstance(value := normalized_results.get("value"), str):
+                if ((separator_index := value.find(":")) > 0) and (value_prefix := value[0:separator_index]):
+                    if value_prefix in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR:
+                        if value := value[separator_index + 1:]:
+                            normalized_results["name"] = value_prefix
+                            normalized_results["value"] = value
+        if isinstance(items := normalized_results.get("items"), list):
+            for element in items:
+                fixup_names_values_for_normalized_results(element)
+
+    def obsolete_fixup_names_values_for_normalized_results(normalized_results: dict) -> None:
         if isinstance(normalized_results, dict):
             if isinstance(value := normalized_results.get("value"), str):
                 if (colon := value.find(":")) > 0:
@@ -381,11 +447,76 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum
             add_queries_to_normalized_results(normalized_results, base_query_arguments)
         normalized_results["query"] = query
 
-    if nosort is not True:
+    if not nosort:
         # We can sort on the aggregations by level; outermost/left to innermost/right.
         # In our case the outermost is the date aggregation so sort that by the key value,
         # e.g. 2024-12, descending; and the rest of the inner levels by the default
         # sorting which is by aggregation count descending and secondarily by the key value.
sort_normalized_aggregation_results(normalized_results, ["-key", "default"]) + if troubleshoot: + add_info_for_troubleshooting(normalized_results, request) + + return normalized_results + + +def add_info_for_troubleshooting(normalized_results: dict, request: pyramid.request.Request) -> None: + + from encoded.endpoint_utils import get_properties, parse_datetime_string + + def get_files(files, property_name, property_value, map_property_value = None): + found = [] + for file in files: + if properties := get_properties(file, property_name): + if callable(map_property_value): + mapped_properties = [] + for value in properties: + mapped_properties.append(map_property_value(value)) + properties = mapped_properties + if property_value in properties: + found.append(file) + return found + + def map_date_property_value(value): + if date_value := parse_datetime_string(value): + return f"{date_value.year}-{date_value.month:02}" + return value + + def annotate_with_uuids(normalized_results: dict): + aggregation_fields = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + query = normalized_results.get("query") + files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] + for first_item in normalized_results["items"]: + first_property_name = first_item["name"] + first_property_value = first_item["value"] + for second_item in first_item["items"]: + second_property_name = second_item["name"] + second_property_value = second_item["value"] + for third_item in second_item["items"]: + third_property_name = third_item["name"] + third_property_value = third_item["value"] + if first_files := get_files(files, first_property_name, first_property_value, + map_property_value=map_date_property_value): + if second_files := get_files(first_files, second_property_name, second_property_value): + if third_files := get_files(second_files, third_property_name, third_property_value): + for file in third_files: + if isinstance(uuid := file.get("uuid"), str): + if not third_item.get("uuids"): + third_item["uuids"] = [] + uuid_record = {"uuid": uuid} + for aggregation_field in aggregation_fields: + uuid_record[aggregation_field] = \ + ", ".join(get_properties(file, aggregation_field)) + third_item["uuids"].append(uuid_record) + + try: + annotate_with_uuids(normalized_results) + except Exception: + pass From 1a362632dd8276681294f922995069d684befe42 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 18:53:45 -0500 Subject: [PATCH 25/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 13b4ed3a1..1469ec213 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -488,6 +488,8 @@ def annotate_with_uuids(normalized_results: dict): AGGREGATION_FIELD_RELEASE_DATE, AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, + "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", + "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", AGGREGATION_FIELD_DONOR, AGGREGATION_FIELD_FILE_DESCRIPTOR ] From 828ae52bd37b9e8aa62dd9c49c73bb0151bf27b1 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 19:04:34 -0500 Subject: [PATCH 26/78] refactoring /recent_files_summary endpoint --- 
src/encoded/recent_files_summary.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 1469ec213..d7e92d314 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -483,6 +483,13 @@ def map_date_property_value(value): return f"{date_value.year}-{date_value.month:02}" return value + def contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> bool: + for uuid_record in uuid_records: + if id(uuid_record) != ignore_uuid_record_id: + if uuid_record.get("uuid") == uuid: + return True + return False + def annotate_with_uuids(normalized_results: dict): aggregation_fields = [ AGGREGATION_FIELD_RELEASE_DATE, @@ -493,6 +500,7 @@ def annotate_with_uuids(normalized_results: dict): AGGREGATION_FIELD_DONOR, AGGREGATION_FIELD_FILE_DESCRIPTOR ] + uuid_records = [] query = normalized_results.get("query") files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] for first_item in normalized_results["items"]: @@ -517,6 +525,11 @@ def annotate_with_uuids(normalized_results: dict): uuid_record[aggregation_field] = \ ", ".join(get_properties(file, aggregation_field)) third_item["uuids"].append(uuid_record) + uuid_records.append(uuid_record) + + for uuid_record in uuid_records: + if contains_uuid(uuid_records, uuid_record["uuid"], id(uuid_record)): + uuid_record["duplicative"] = True try: annotate_with_uuids(normalized_results) From df7c33cd19aad9cd253e3863bd46979cd4398a1c Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 19:09:25 -0500 Subject: [PATCH 27/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index d7e92d314..fadde3edf 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -483,13 +483,21 @@ def map_date_property_value(value): return f"{date_value.year}-{date_value.month:02}" return value - def contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> bool: + def obsolete_contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> bool: for uuid_record in uuid_records: if id(uuid_record) != ignore_uuid_record_id: if uuid_record.get("uuid") == uuid: return True return False + def contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> int: + count = 0 + for uuid_record in uuid_records: + if id(uuid_record) != ignore_uuid_record_id: + if uuid_record.get("uuid") == uuid: + count += 1 + return count + def annotate_with_uuids(normalized_results: dict): aggregation_fields = [ AGGREGATION_FIELD_RELEASE_DATE, @@ -528,8 +536,8 @@ def annotate_with_uuids(normalized_results: dict): uuid_records.append(uuid_record) for uuid_record in uuid_records: - if contains_uuid(uuid_records, uuid_record["uuid"], id(uuid_record)): - uuid_record["duplicative"] = True + if (count := contains_uuid(uuid_records, uuid_record["uuid"], id(uuid_record))) > 0: + uuid_record["duplicative"] = count try: annotate_with_uuids(normalized_results) From fdbe871a710a304f21f0d0129f1a88e51884936a Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 19:13:50 -0500 Subject: [PATCH 28/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) 
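A note on the duplicate-uuid bookkeeping refined over the next few patches: the
same file may legitimately land in more than one aggregation bucket, so its
uuid record can appear multiple times. The count_uuid/duplicative logic this
converges on could equivalently be done in one pass with collections.Counter;
a hedged sketch, with mark_duplicates as a hypothetical name:

    from collections import Counter

    def mark_duplicates(uuid_records: list) -> None:
        # Count every uuid once up front (one pass), then annotate each record
        # whose uuid occurs more than once with the total occurrence count.
        counts = Counter(record["uuid"] for record in uuid_records)
        for record in uuid_records:
            if counts[record["uuid"]] > 1:
                record["duplicative"] = counts[record["uuid"]]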
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index fadde3edf..16b535779 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -483,19 +483,11 @@ def map_date_property_value(value): return f"{date_value.year}-{date_value.month:02}" return value - def obsolete_contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> bool: - for uuid_record in uuid_records: - if id(uuid_record) != ignore_uuid_record_id: - if uuid_record.get("uuid") == uuid: - return True - return False - - def contains_uuid(uuid_records: List[dict], uuid: str, ignore_uuid_record_id: int) -> int: + def count_uuid(uuid_records: List[dict], uuid: str) -> int: count = 0 for uuid_record in uuid_records: - if id(uuid_record) != ignore_uuid_record_id: - if uuid_record.get("uuid") == uuid: - count += 1 + if uuid_record.get("uuid") == uuid: + count += 1 return count def annotate_with_uuids(normalized_results: dict): @@ -530,13 +522,13 @@ def annotate_with_uuids(normalized_results: dict): third_item["uuids"] = [] uuid_record = {"uuid": uuid} for aggregation_field in aggregation_fields: - uuid_record[aggregation_field] = \ - ", ".join(get_properties(file, aggregation_field)) + aggregation_values = ", ".join(get_properties(file, aggregation_field)) + uuid_record[aggregation_field] = aggregation_values or None third_item["uuids"].append(uuid_record) uuid_records.append(uuid_record) for uuid_record in uuid_records: - if (count := contains_uuid(uuid_records, uuid_record["uuid"], id(uuid_record))) > 0: + if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1: uuid_record["duplicative"] = count try: From e21a5c70457d766ef5b1f1f595468913efb54262 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 10 Dec 2024 22:33:19 -0500 Subject: [PATCH 29/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 16b535779..62e6475ec 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -171,7 +171,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa return { "terms": { "script": { - "source": script, + "source": normalize_spaces(script), "lang": "painless" }, "size": max_buckets @@ -277,7 +277,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa return aggregation_query[date_property_name] def execute_aggregation_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: - query += "&limit=0" # needed for aggregation query to not return the actual/individual item results. + query += "&from=0&limit=0" # needed for aggregation query to not return the actual/individual item results. 
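# (Aside: limit=0 plays the same role here as "size": 0 in a raw Elasticsearch
# request, i.e. return aggregation buckets only, with no individual document
# hits. A hedged sketch of an equivalent raw query body, with illustrative
# field and interval values:
#
#     aggregation_only_query = {
#         "size": 0,  # suppress the individual item results
#         "aggs": {
#             "file_status_tracking.released": {
#                 "date_histogram": {
#                     "field": "embedded.file_status_tracking.released",
#                     "calendar_interval": "month",
#                     "order": {"_key": "desc"}
#                 }
#             }
#         }
#     }
# )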
request = snovault_make_search_subreq(request, path=query, method="GET") results = snovault_search(None, request, custom_aggregations=aggregation_query) return results From 4ed1041a9f56ffc921833e6a6c4e6681a0582c18 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 11 Dec 2024 08:56:28 -0500 Subject: [PATCH 30/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 15 ++++++++ src/encoded/recent_files_summary.py | 54 ++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 55a03ddf2..293dc6a24 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -402,6 +402,18 @@ def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: return doc_count return None + def get_aggregation_bucket_debug_hits(aggregation_bucket: dict) -> List[str]: + debug_hits = [] + if isinstance(aggregation_bucket, dict): + if isinstance(doc_count := aggregation_bucket.get("doc_count"), int): + if (isinstance(top_hits_debug := aggregation_bucket.get("top_hits_debug"), dict) and + isinstance(hits := top_hits_debug.get("hits"), dict) and + isinstance(hits := hits.get("hits"), list)): # noqa + for hit in hits: + if isinstance(hit, dict) and isinstance(hit := hit.get("_id"), str): + debug_hits.append(hit) + return debug_hits + def get_nested_aggregations(data: dict) -> List[dict]: results = [] if isinstance(data, dict): @@ -436,6 +448,7 @@ def normalize_results(aggregation: dict, ((bucket_item_count := get_aggregation_bucket_doc_count(bucket)) is None)): # noqa continue item_count += bucket_item_count + debug_hits = get_aggregation_bucket_debug_hits(bucket) if nested_aggregations := get_nested_aggregations(bucket): for nested_aggregation in nested_aggregations: if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value): @@ -455,6 +468,8 @@ def normalize_results(aggregation: dict, else: if (remove_empty_items is False) or (bucket_item_count > 0): group_item = {"name": aggregation_key, "value": bucket_value, "count": bucket_item_count} + if debug_hits: + group_item["debug_elasticsearch_hits"] = debug_hits group_items.append(group_item) if (remove_empty_items is not False) and (not group_items): diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 62e6475ec..16e915a9d 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -73,6 +73,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: debug = request_arg_bool(request, "debug") debug_query = request_arg_bool(request, "debug_query") troubleshoot = request_arg_bool(request, "troubleshoot") + troubleshoot_elasticsearch = request_arg_bool(request, "troubleshoot_elasticsearch") raw = request_arg_bool(request, "raw") def create_base_query_arguments(request: pyramid.request.Request) -> dict: @@ -117,7 +118,7 @@ def create_query(request: pyramid.request.Request, base_query_arguments: Optiona def create_aggregation_query(aggregation_fields: List[str]) -> dict: global AGGREGATION_NO_VALUE - nonlocal date_property_name, max_buckets, include_missing, favor_donor + nonlocal date_property_name, max_buckets, include_missing, favor_donor, troubleshoot_elasticsearch aggregations = [] if not isinstance(aggregation_fields, list): @@ -238,6 +239,23 @@ def obsolete_create_field_filter(field: str) -> Optional[dict]: # noqa 
create_field_aggregation=create_field_aggregation, create_field_filter=create_field_filter) + if troubleshoot_elasticsearch: + def add_debug_query_to_elasticsearch_aggregation_query(aggregation: dict) -> None: # noqa + top_hits_debug = {"aggs": {"top_hits_debug": {"top_hits": {"_source": False, + "docvalue_fields": ["_id"], "size": 100 }}}} + def add_debug_query(aggs: dict) -> None: # noqa + if "aggs" in aggs: + for key, sub_agg in aggs["aggs"].items(): + add_debug_query(sub_agg) + else: + aggs.update(top_hits_debug) + for agg in aggregation["aggs"].values(): + add_debug_query(agg) + try: + add_debug_query_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + except Exception: + pass + return aggregation_query[date_property_name] def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: @@ -342,6 +360,21 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum aggregation_query = { aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line) } + # print(aggregation_query) + # import json + # print(json.dumps(aggregation_query, indent=4)) + # import pdb ; pdb.set_trace() # noqa + # xxx = { "top_hits_debug": { "top_hits": { "_source": False, "docvalue_fields": ["_id"], "size": 10 } } } + # aggregation_query["aggregate_by_cell_line"]["aggs"]["file_sets.libraries.analytes.samples.sample_sources.cell_line.code"]["aggs"]["release_tracker_description"]["aggs"] = xxx +# "aggs": { +# "top_hits_debug": { +# "top_hits": { +# "_source": false, +# "docvalue_fields": ["_id"], +# "size": 10 +# } +# } +# } else: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" aggregate_by_cell_line = [ @@ -512,20 +545,33 @@ def annotate_with_uuids(normalized_results: dict): for third_item in second_item["items"]: third_property_name = third_item["name"] third_property_value = third_item["value"] + if debug_elasticsearch_hits := third_item.get("debug_elasticsearch_hits"): + if not third_item.get("debug"): + third_item["debug"] = {} + third_item["debug"]["elasticsearch_hits"] = debug_elasticsearch_hits + third_item["debug"]["elasticsearch_hits"].sort() + del third_item["debug_elasticsearch_hits"] if first_files := get_files(files, first_property_name, first_property_value, map_property_value=map_date_property_value): if second_files := get_files(first_files, second_property_name, second_property_value): if third_files := get_files(second_files, third_property_name, third_property_value): for file in third_files: if isinstance(uuid := file.get("uuid"), str): - if not third_item.get("uuids"): - third_item["uuids"] = [] + if not third_item.get("debug"): + third_item["debug"] = {} + if not third_item["debug"].get("uuids"): + third_item["debug"]["uuids"] = [] uuid_record = {"uuid": uuid} for aggregation_field in aggregation_fields: aggregation_values = ", ".join(get_properties(file, aggregation_field)) uuid_record[aggregation_field] = aggregation_values or None - third_item["uuids"].append(uuid_record) + if third_item["debug"].get("elasticsearch_hits"): + uuid_record["elasticsearch_counted"] = \ + uuid in third_item["debug"]["elasticsearch_hits"] + third_item["debug"]["uuids"].append(uuid_record) uuid_records.append(uuid_record) + if third_item.get("debug", {}).get("uuids"): + third_item["debug"]["uuids"].sort(key=lambda item: item.get("uuid")) for uuid_record in uuid_records: if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1: From b5c99ce5ee38e3a32afeabcd0983e8da084bd59a Mon Sep 17 00:00:00 2001 From: David 
Michaels Date: Wed, 11 Dec 2024 12:50:08 -0500 Subject: [PATCH 31/78] refactoring /recent_files_summary endpoint --- src/encoded/elasticsearch_utils.py | 16 ++- src/encoded/recent_files_summary.py | 158 ++++++++-------------------- 2 files changed, 58 insertions(+), 116 deletions(-) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py index 293dc6a24..b8ab8f332 100644 --- a/src/encoded/elasticsearch_utils.py +++ b/src/encoded/elasticsearch_utils.py @@ -142,6 +142,20 @@ def create_elasticsearch_aggregation_query(fields: List[str], return aggregation +def add_debugging_to_elasticsearch_aggregation_query(aggregation_query: dict) -> None: # noqa + top_hits_debug = {"aggs": {"top_hits_debug": {"top_hits": {"_source": False, + "docvalue_fields": ["_id"], "size": 100 }}}} + def add_debug_query(aggs: dict) -> None: # noqa + if "aggs" in aggs: + for _, agg in aggs["aggs"].items(): + add_debug_query(agg) + else: + aggs.update(top_hits_debug) + if isinstance(aggregation_query, dict) and isinstance(aggs := aggregation_query.get("aggs"), dict): + for agg in aggs.values(): + add_debug_query(agg) + + def prune_elasticsearch_aggregation_results(results: dict) -> None: """ This removes any extra level(s) of aggregation (i.e. dummy_date_histogram) that may have been @@ -405,7 +419,7 @@ def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: def get_aggregation_bucket_debug_hits(aggregation_bucket: dict) -> List[str]: debug_hits = [] if isinstance(aggregation_bucket, dict): - if isinstance(doc_count := aggregation_bucket.get("doc_count"), int): + if isinstance(aggregation_bucket.get("doc_count"), int): if (isinstance(top_hits_debug := aggregation_bucket.get("top_hits_debug"), dict) and isinstance(hits := top_hits_debug.get("hits"), dict) and isinstance(hits := hits.get("hits"), list)): # noqa diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 16e915a9d..7c6f0d43d 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -2,11 +2,13 @@ from copy import deepcopy from typing import List, Optional from dcicutils.misc_utils import normalize_spaces +from encoded.elasticsearch_utils import add_debugging_to_elasticsearch_aggregation_query from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results from encoded.elasticsearch_utils import sort_normalized_aggregation_results +from encoded.elasticsearch_utils import AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE from encoded.endpoint_utils import create_query_string, parse_date_range_related_arguments from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int from snovault.search.search import search as snovault_search @@ -30,9 +32,6 @@ AGGREGATION_FIELD_DONOR ] -AGGREGATION_MAX_BUCKETS = 100 -AGGREGATION_NO_VALUE = "No value" - BASE_SEARCH_QUERY = "/search/" def recent_files_summary(request: pyramid.request.Request) -> dict: @@ -62,10 +61,13 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: released can be queried for using one or more status query arguments, e.g. status=uploaded. 
""" + global AGGREGATION_FIELD_RELEASE_DATE + date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True)) include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) + nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell")) nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) favor_donor = request_arg_bool(request, "favor_donor") nosort = request_arg_bool(request, "nosort") @@ -75,6 +77,24 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: troubleshoot = request_arg_bool(request, "troubleshoot") troubleshoot_elasticsearch = request_arg_bool(request, "troubleshoot_elasticsearch") raw = request_arg_bool(request, "raw") + willrfix = request_arg_bool(request, "willrfix") + + def get_aggregation_field_grouping_cell_or_donor(): + # This specializes the aggregation query to group first by the cell-line field, + # and then alternatively (if a cell-line field does not exist) by the donor field. + # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively + # look first for the donor field and then secondarily for the cell-line field. + global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR + nonlocal nocells, nomixtures, favor_donor + aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) + if nocells: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_LINE) + if nomixtures: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_MIXTURE) + if favor_donor: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) + aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR) + return aggregation_field_grouping_cell_or_donor def create_base_query_arguments(request: pyramid.request.Request) -> dict: @@ -117,7 +137,6 @@ def create_query(request: pyramid.request.Request, base_query_arguments: Optiona def create_aggregation_query(aggregation_fields: List[str]) -> dict: - global AGGREGATION_NO_VALUE nonlocal date_property_name, max_buckets, include_missing, favor_donor, troubleshoot_elasticsearch aggregations = [] @@ -130,7 +149,7 @@ def create_aggregation_query(aggregation_fields: List[str]) -> dict: return {} def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal date_property_name, nomixtures + nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, nocells, nomixtures if field == date_property_name: return { "date_histogram": { @@ -142,22 +161,12 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa } } elif field == AGGREGATION_FIELD_CELL_LINE: - # This specializes the aggregation query to group first by the cell-line field, - # and then alternatively (if a cell-line field does not exist) by the donor field. - # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively - # look first for the donor field and then secondarily for the cell-line field. 
- aggregation_field_grouping = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) - if nomixtures: - aggregation_field_grouping.remove(AGGREGATION_FIELD_CELL_MIXTURE) - if favor_donor: - aggregation_field_grouping.remove(AGGREGATION_FIELD_DONOR) - aggregation_field_grouping.insert(0, AGGREGATION_FIELD_DONOR) # Note how we prefix the result with the aggregation field name; # this is so later we can tell which grouping/field was matched; # see fixup_names_values_for_normalized_results for this fixup. script = "" - for aggregation_field_grouping_index in range(len(aggregation_field_grouping)): - aggregation_field = aggregation_field_grouping[aggregation_field_grouping_index] + for aggregation_field_grouping_index in range(len(aggregation_field_grouping_cell_or_donor)): + aggregation_field = aggregation_field_grouping_cell_or_donor[aggregation_field_grouping_index] if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" script += f""" {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ @@ -178,59 +187,15 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa "size": max_buckets } } - elif False and (field == AGGREGATION_FIELD_CELL_LINE): - # OBSOLETE: See above. - # This specializes the aggregation query to group first by the cell-line field, - # and then alternatively (if a cell-line field does not exist) by the donor field. - # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively - # look first for the donor field and then secondarily for the cell-line field. - if favor_donor: - field_one = AGGREGATION_FIELD_DONOR - field_two = AGGREGATION_FIELD_CELL_LINE - else: - field_one = AGGREGATION_FIELD_CELL_LINE - field_two = AGGREGATION_FIELD_DONOR - # Note how we prefix the result with the aggregation field name; - # this is so later we can tell which grouping/field was matched; - # see fixup_names_values_for_normalized_results for this fixup. 
- script = normalize_spaces(f""" - if (doc['embedded.{field_one}.raw'].size() > 0) {{ - return '{field_one}:' + doc['embedded.{field_one}.raw'].value; - }} else if (doc['embedded.{field_two}.raw'].size() > 0) {{ - return '{field_two}:' + doc['embedded.{field_two}.raw'].value; - }} else {{ - return 'unknown'; - }} - """) - return { - "terms": { - "script": { - "source": script, - "lang": "painless" - }, - "size": max_buckets - } - } def create_field_filter(field: str) -> Optional[dict]: # noqa + nonlocal aggregation_field_grouping_cell_or_donor if field == AGGREGATION_FIELD_CELL_LINE: filter = {"bool": {"should": [], "minimum_should_match": 1}} - for aggregation_field in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: + for aggregation_field in aggregation_field_grouping_cell_or_donor: filter["bool"]["should"].append({"exists": { "field": f"embedded.{aggregation_field}.raw"}}) return filter - def obsolete_create_field_filter(field: str) -> Optional[dict]: # noqa - if field == AGGREGATION_FIELD_CELL_LINE: - return { - "bool": { - "should": [ - {"exists": { "field": f"embedded.{AGGREGATION_FIELD_CELL_LINE}.raw"}}, - {"exists": { "field": f"embedded.{AGGREGATION_FIELD_DONOR}.raw"}} - ], - "minimum_should_match": 1 - } - } - aggregation_query = create_elasticsearch_aggregation_query( aggregations, max_buckets=max_buckets, @@ -240,27 +205,12 @@ def obsolete_create_field_filter(field: str) -> Optional[dict]: # noqa create_field_filter=create_field_filter) if troubleshoot_elasticsearch: - def add_debug_query_to_elasticsearch_aggregation_query(aggregation: dict) -> None: # noqa - top_hits_debug = {"aggs": {"top_hits_debug": {"top_hits": {"_source": False, - "docvalue_fields": ["_id"], "size": 100 }}}} - def add_debug_query(aggs: dict) -> None: # noqa - if "aggs" in aggs: - for key, sub_agg in aggs["aggs"].items(): - add_debug_query(sub_agg) - else: - aggs.update(top_hits_debug) - for agg in aggregation["aggs"].values(): - add_debug_query(agg) - try: - add_debug_query_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) - except Exception: - pass + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) return aggregation_query[date_property_name] def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: - global AGGREGATION_NO_VALUE nonlocal date_property_name, max_buckets, include_missing aggregations = [] @@ -301,11 +251,11 @@ def execute_aggregation_query(request: pyramid.request.Request, query: str, aggr return results def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: - global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR + nonlocal aggregation_field_grouping_cell_or_donor if isinstance(normalized_results, dict): if isinstance(value := normalized_results.get("value"), str): if ((separator_index := value.find(":")) > 0) and (value_prefix := value[0:separator_index]): - if value_prefix in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: + if value_prefix in aggregation_field_grouping_cell_or_donor: if value := value[separator_index + 1:]: normalized_results["name"] = value_prefix normalized_results["value"] = value @@ -313,23 +263,9 @@ def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: for element in items: fixup_names_values_for_normalized_results(element) - def obsolete_fixup_names_values_for_normalized_results(normalized_results: dict) -> None: - if isinstance(normalized_results, dict): - if isinstance(value := normalized_results.get("value"), str): - if (colon := value.find(":")) 
> 0: - if (prefix := value[0:colon]) == AGGREGATION_FIELD_CELL_LINE: - normalized_results["name"] = AGGREGATION_FIELD_CELL_LINE - normalized_results["value"] = value[colon + 1:] - elif prefix == AGGREGATION_FIELD_DONOR: - normalized_results["name"] = AGGREGATION_FIELD_DONOR - normalized_results["value"] = value[colon + 1:] - if isinstance(items := normalized_results.get("items"), list): - for element in items: - fixup_names_values_for_normalized_results(element) - def add_queries_to_normalized_results(normalized_results: dict, base_query_arguments: dict) -> None: global BASE_SEARCH_QUERY - nonlocal date_property_name + nonlocal date_property_name, willrfix if isinstance(normalized_results, dict): if name := normalized_results.get("name"): if value := normalized_results.get("value"): @@ -342,11 +278,18 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum f"{name}.from": from_date, f"{name}.to": thru_date} else: base_query_arguments = {**base_query_arguments, name: value} + if willrfix: + if name == AGGREGATION_FIELD_CELL_LINE: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + elif name == AGGREGATION_FIELD_DONOR: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + base_query_arguments[AGGREGATION_FIELD_CELL_LINE] = AGGREGATION_NO_VALUE normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) if isinstance(items := normalized_results.get("items"), list): for element in items: add_queries_to_normalized_results(element, base_query_arguments) + aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor() base_query_arguments = create_base_query_arguments(request) query = create_query(request, base_query_arguments) @@ -360,21 +303,6 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum aggregation_query = { aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line) } - # print(aggregation_query) - # import json - # print(json.dumps(aggregation_query, indent=4)) - # import pdb ; pdb.set_trace() # noqa - # xxx = { "top_hits_debug": { "top_hits": { "_source": False, "docvalue_fields": ["_id"], "size": 10 } } } - # aggregation_query["aggregate_by_cell_line"]["aggs"]["file_sets.libraries.analytes.samples.sample_sources.cell_line.code"]["aggs"]["release_tracker_description"]["aggs"] = xxx -# "aggs": { -# "top_hits_debug": { -# "top_hits": { -# "_source": false, -# "docvalue_fields": ["_id"], -# "size": 10 -# } -# } -# } else: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" aggregate_by_cell_line = [ @@ -417,9 +345,9 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum prune_elasticsearch_aggregation_results(raw_results) if not legacy: - merged_results = raw_results.get(aggregate_by_cell_line_property_name) + aggregation_results = raw_results.get(aggregate_by_cell_line_property_name) else: - merged_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), + aggregation_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), raw_results.get(aggregate_by_donor_property_name)) # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items, @@ -466,13 +394,13 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum "query": query, "aggregation_query": aggregation_query, "raw_results": raw_results, 
- "merged_results": deepcopy(merged_results) + "aggregation_results": deepcopy(aggregation_results) } } else: additional_properties = None - normalized_results = normalize_elasticsearch_aggregation_results(merged_results, + normalized_results = normalize_elasticsearch_aggregation_results(aggregation_results, additional_properties=additional_properties) if not legacy: fixup_names_values_for_normalized_results(normalized_results) From e7420cc8e1080a59bf0c98fbd5835783093c6c9b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 11 Dec 2024 12:57:48 -0500 Subject: [PATCH 32/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 7c6f0d43d..f99134aed 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -487,8 +487,8 @@ def annotate_with_uuids(normalized_results: dict): if isinstance(uuid := file.get("uuid"), str): if not third_item.get("debug"): third_item["debug"] = {} - if not third_item["debug"].get("uuids"): - third_item["debug"]["uuids"] = [] + if not third_item["debug"].get("portal_hits"): + third_item["debug"]["portal_hits"] = [] uuid_record = {"uuid": uuid} for aggregation_field in aggregation_fields: aggregation_values = ", ".join(get_properties(file, aggregation_field)) @@ -496,10 +496,10 @@ def annotate_with_uuids(normalized_results: dict): if third_item["debug"].get("elasticsearch_hits"): uuid_record["elasticsearch_counted"] = \ uuid in third_item["debug"]["elasticsearch_hits"] - third_item["debug"]["uuids"].append(uuid_record) + third_item["debug"]["portal_hits"].append(uuid_record) uuid_records.append(uuid_record) - if third_item.get("debug", {}).get("uuids"): - third_item["debug"]["uuids"].sort(key=lambda item: item.get("uuid")) + if third_item.get("debug", {}).get("portal_hits"): + third_item["debug"]["portal_hits"].sort(key=lambda item: item.get("uuid")) for uuid_record in uuid_records: if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1: From da2a211cb3cc015a271a76d4b18764999eb7ab73 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 11 Dec 2024 18:41:04 -0500 Subject: [PATCH 33/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index f99134aed..a774dee1f 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -69,6 +69,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict: include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell")) nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) + nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor")) favor_donor = request_arg_bool(request, "favor_donor") nosort = request_arg_bool(request, "nosort") legacy = request_arg_bool(request, "legacy") @@ -85,12 +86,14 @@ def get_aggregation_field_grouping_cell_or_donor(): # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively # look first for the donor field and then secondarily for the cell-line field. 
         global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR
-        nonlocal nocells, nomixtures, favor_donor
+        nonlocal nocells, nomixtures, nodonors, favor_donor
         aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR)
         if nocells:
             aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_LINE)
         if nomixtures:
             aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_MIXTURE)
+        if nodonors:
+            aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR)
         if favor_donor:
             aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR)
             aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR)
@@ -149,7 +152,7 @@ def create_aggregation_query(aggregation_fields: List[str]) -> dict:
             return {}

         def create_field_aggregation(field: str) -> Optional[dict]:  # noqa
-            nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, nocells, nomixtures
+            nonlocal aggregation_field_grouping_cell_or_donor, date_property_name
             if field == date_property_name:
                 return {
                     "date_histogram": {
@@ -401,7 +404,8 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum
         additional_properties = None

     normalized_results = normalize_elasticsearch_aggregation_results(aggregation_results,
-                                                                     additional_properties=additional_properties)
+                                                                     additional_properties=additional_properties,
+                                                                     remove_empty_items=not include_missing)
     if not legacy:
         fixup_names_values_for_normalized_results(normalized_results)
     if include_queries:

From 8be133ed4e5cd87958353674ad195d01ca56dfd5 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 11 Dec 2024 22:00:37 -0500
Subject: [PATCH 34/78] refactoring /recent_files_summary endpoint

---
 src/encoded/recent_files_summary.py | 114 +++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index a774dee1f..c3f6a0c12 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -80,7 +80,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
     raw = request_arg_bool(request, "raw")
     willrfix = request_arg_bool(request, "willrfix")

-    def get_aggregation_field_grouping_cell_or_donor():
+    def get_aggregation_field_grouping_cell_or_donor() -> List[str]:
         # This specializes the aggregation query to group first by the cell-line field,
         # and then alternatively (if a cell-line field does not exist) by the donor field.
         # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively
         # look first for the donor field and then secondarily for the cell-line field.
@@ -395,6 +395,11 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum
         additional_properties = {
             "debug": {
                 "query": query,
+                "aggregation_query_fields": [
+                    AGGREGATION_FIELD_RELEASE_DATE,
+                    *get_aggregation_field_grouping_cell_or_donor(),
+                    AGGREGATION_FIELD_FILE_DESCRIPTOR
+                ],
                 "aggregation_query": aggregation_query,
                 "raw_results": raw_results,
                 "aggregation_results": deepcopy(aggregation_results)
             }
         }
@@ -513,3 +518,110 @@ def annotate_with_uuids(normalized_results: dict):
         annotate_with_uuids(normalized_results)
     except Exception:
         pass
+
+
+def print_normalized_aggregation_results(data: dict,
+                                         title: Optional[str] = None,
+                                         parent_grouping_name: Optional[str] = None,
+                                         parent_grouping_value: Optional[str] = None,
+                                         uuids: bool = False,
+                                         uuid_details: bool = False,
+                                         nobold: bool = False,
+                                         verbose: bool = False) -> None:
+
+    """
+    For development/troubleshooting only ...
+ """ + + from hms_utils.chars import chars + from hms_utils.terminal_utils import terminal_color + + def get_aggregation_fields(data: dict) -> List[str]: + if not isinstance(aggregation_fields := data.get("debug", {}).get("aggregation_query_fields"), list): + aggregation_fields = [] + return aggregation_fields + + def print_results(data: dict, + parent_grouping_name: Optional[str] = None, + parent_grouping_value: Optional[str] = None, + indent: int = 0) -> None: + + nonlocal title, uuids, uuid_details, nobold, verbose + nonlocal aggregation_fields, red, green, gray, bold + + def get_hits(data: dict) -> List[str]: + hits = [] + if isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): + for portal_hit in portal_hits: + if isinstance(portal_hit, dict) and isinstance(uuid := portal_hit.get("uuid"), str) and uuid: + hits.append(portal_hit) + return hits + + def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]: + nonlocal parent_grouping_name, parent_grouping_value + if property_value := hit.get(property_name): + if property_name == parent_grouping_name: + property_values = [] + for property_value in property_value.split(","): + if (property_value := property_value.strip()) == parent_grouping_value: + property_values.append(green(property_value)) + else: + property_values.append(property_value) + property_value = ", ".join(property_values) + return property_value + + def print_hit_property_values(hit: dict, property_name: str, + label: Optional[str] = None, prefix: Optional[str] = None) -> None: + nonlocal aggregation_fields + if property_values := format_hit_property_values(hit, property_name): + if not label: + label = property_name + property_description = f"{prefix or ""}{chars.dot_hollow} {label}: {property_values}" + if property_name not in aggregation_fields: + property_description = gray(property_description) + print(property_description) + + if not (isinstance(data, dict) and data): + return + if not (isinstance(indent, int) and (indent > 0)): + indent = 0 + spaces = (" " * indent) if indent > 0 else "" + grouping_name = data.get("name") + if isinstance(grouping_value := data.get("value"), str) and grouping_value: + grouping = bold(grouping_value) + if (verbose is True) and isinstance(grouping_name, str) and grouping_name: + grouping = f"{grouping_name} {chars.dot} {grouping}" + elif not (isinstance(grouping := title, str) and grouping): + grouping = "RESULTS" + grouping = f"{chars.diamond} {grouping}" + hits = get_hits(data) if (uuids is True) else [] + if isinstance(count := data.get("count"), int): + note = "" + if len(hits) > count: + note = red(f" {chars.rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + print(f"{spaces}{grouping}: {count}{note}") + for hit in hits: + if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: + note = "" + if hit.get("elasticsearch_counted") is False: + print(red(f"{spaces} {chars.dot} {uuid} {chars.xmark} UNCOUNTED")) + else: + print(f"{spaces} {chars.dot} {uuid} {chars.check}") + if uuid_details is True: + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", f"{spaces} ") + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", f"{spaces} ") + print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", f"{spaces} ") + if isinstance(items := data.get("items"), list): + for element in items: + print_results(element, + parent_grouping_name=grouping_name, + parent_grouping_value=grouping_value, + indent=indent + 2) + + 
    aggregation_fields = get_aggregation_fields(data)
    red = lambda text: terminal_color(text, "red")  # noqa
    green = lambda text: terminal_color(text, "green")  # noqa
    gray = lambda text: terminal_color(text, "grey")  # noqa
    bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text)

    print_results(data)

From 713f50f23523b39efc6f0677375b9b7203fe2346 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 11 Dec 2024 22:07:55 -0500
Subject: [PATCH 35/78] refactoring /recent_files_summary endpoint

---
 src/encoded/recent_files_summary.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index c3f6a0c12..a37da7b64 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -10,6 +10,7 @@ from encoded.elasticsearch_utils import sort_normalized_aggregation_results
 from encoded.elasticsearch_utils import AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE
 from encoded.endpoint_utils import create_query_string, parse_date_range_related_arguments
+from encoded.endpoint_utils import get_properties, parse_datetime_string
 from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int
 from snovault.search.search import search as snovault_search
 from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq
@@ -67,7 +68,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
     max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS)
     include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True))
     include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues"))
-    nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell"))
+    nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell", True))  # N.B. default True
     nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture"))
     nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor"))
     favor_donor = request_arg_bool(request, "favor_donor")
@@ -433,8 +434,6 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum

 def add_info_for_troubleshooting(normalized_results: dict, request: pyramid.request.Request) -> None:

-    from encoded.endpoint_utils import get_properties, parse_datetime_string
-
     def get_files(files, property_name, property_value, map_property_value = None):
         found = []
         for file in files:
@@ -532,7 +531,6 @@ def print_normalized_aggregation_results(data: dict,
                                          uuid_details: bool = False,
                                          nobold: bool = False,
                                          verbose: bool = False) -> None:
-
+
    """
    For development/troubleshooting only ...
    """

From cc4f8f12523eed849fbf6ba680958bc56c5cc6e4 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 11 Dec 2024 22:18:39 -0500
Subject: [PATCH 36/78] refactoring /recent_files_summary endpoint

---
 src/encoded/recent_files_summary.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index a37da7b64..bccf7312f 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -531,7 +531,6 @@ def print_normalized_aggregation_results(data: dict,
    """
    For development/troubleshooting only ...
""" - from hms_utils.chars import chars from hms_utils.terminal_utils import terminal_color def get_aggregation_fields(data: dict) -> List[str]: @@ -546,6 +545,7 @@ def print_results(data: dict, nonlocal title, uuids, uuid_details, nobold, verbose nonlocal aggregation_fields, red, green, gray, bold + nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark def get_hits(data: dict) -> List[str]: hits = [] @@ -570,11 +570,11 @@ def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]: def print_hit_property_values(hit: dict, property_name: str, label: Optional[str] = None, prefix: Optional[str] = None) -> None: - nonlocal aggregation_fields + nonlocal aggregation_fields, chars_dot_hollow if property_values := format_hit_property_values(hit, property_name): if not label: label = property_name - property_description = f"{prefix or ""}{chars.dot_hollow} {label}: {property_values}" + property_description = f"{prefix or ""}{chars_dot_hollow} {label}: {property_values}" if property_name not in aggregation_fields: property_description = gray(property_description) print(property_description) @@ -588,23 +588,23 @@ def print_hit_property_values(hit: dict, property_name: str, if isinstance(grouping_value := data.get("value"), str) and grouping_value: grouping = bold(grouping_value) if (verbose is True) and isinstance(grouping_name, str) and grouping_name: - grouping = f"{grouping_name} {chars.dot} {grouping}" + grouping = f"{grouping_name} {chars_dot} {grouping}" elif not (isinstance(grouping := title, str) and grouping): grouping = "RESULTS" - grouping = f"{chars.diamond} {grouping}" + grouping = f"{chars_diamond} {grouping}" hits = get_hits(data) if (uuids is True) else [] if isinstance(count := data.get("count"), int): note = "" if len(hits) > count: - note = red(f" {chars.rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") print(f"{spaces}{grouping}: {count}{note}") for hit in hits: if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: note = "" if hit.get("elasticsearch_counted") is False: - print(red(f"{spaces} {chars.dot} {uuid} {chars.xmark} UNCOUNTED")) + print(red(f"{spaces} {chars_dot} {uuid} {chars_xmark} UNCOUNTED")) else: - print(f"{spaces} {chars.dot} {uuid} {chars.check}") + print(f"{spaces} {chars_dot} {uuid} {chars_check}") if uuid_details is True: print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", f"{spaces} ") print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", f"{spaces} ") @@ -621,5 +621,11 @@ def print_hit_property_values(hit: dict, property_name: str, green = lambda text: terminal_color(text, "green") # noqa gray = lambda text: terminal_color(text, "grey") # noqa bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text) + chars_check = "✓" + chars_xmark = "✗" + chars_dot = "•" + chars_dot_hollow = "◦" + chars_diamond = "❖" + chars_rarrow_hollow = "▷" print_results(data) From c8f3ba9c82e078a59db149d6659cd508c02c7074 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 11 Dec 2024 22:25:29 -0500 Subject: [PATCH 37/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index bccf7312f..f4e6b9d56 100644 --- a/src/encoded/recent_files_summary.py +++ 
b/src/encoded/recent_files_summary.py
@@ -574,7 +574,7 @@ def print_hit_property_values(hit: dict, property_name: str,
             if property_values := format_hit_property_values(hit, property_name):
                 if not label:
                     label = property_name
-                property_description = f"{prefix or ""}{chars_dot_hollow} {label}: {property_values}"
+                property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}"
                 if property_name not in aggregation_fields:
                     property_description = gray(property_description)
                 print(property_description)

From a854653d1d43a6234f733b49c2504a9c3272556f Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 11 Dec 2024 22:36:10 -0500
Subject: [PATCH 38/78] refactoring /recent_files_summary endpoint

---
 src/encoded/elasticsearch_utils.py  |  2 +-
 src/encoded/recent_files_summary.py | 15 +++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/elasticsearch_utils.py
index b8ab8f332..daf4ca2e5 100644
--- a/src/encoded/elasticsearch_utils.py
+++ b/src/encoded/elasticsearch_utils.py
@@ -64,7 +64,7 @@ def create_elasticsearch_aggregation_query(fields: List[str],
     }

     The above example assumes that a create_field_aggregation function callable was passed as an argument
-    and that if/when its argument is date_created then it would have returned something like this
+    and that if/when its argument is date_created then it would have returned something like this:

     {
         "date_histogram": {
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index f4e6b9d56..0511d6ca6 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -50,7 +50,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
     calculated property - see PR-298 (branch: sn_file_release_tracker).

     By default the current (assuming partial) month IS included, so we really return info for
-    the past FULL three months plus for whatever time has currently elapsed for the current month.
+    the past FULL three months plus for whatever time has currently elapsed for the current month.
     Pass the include_current_month=false query argument to NOT include the current month.

     The number of months of data can be controlled using the nmonths query argument, e.g. nmonths=6.
@@ -59,7 +59,7 @@ def recent_files_summary(request: pyramid.request.Request) -> dict:
     For testing purposes, a date field other than the default file_status_tracking.released can
     also be specified using the date_property_name query argument. And file statuses other than
-    released can be queried for using one or more status query arguments, e.g. status=uploaded.
+    released can be queried for using one or more status query arguments, e.g. status=uploaded.
     """
@@ -85,7 +85,7 @@ def get_aggregation_field_grouping_cell_or_donor() -> List[str]:
         # This specializes the aggregation query to group first by the cell-line field,
         # and then alternatively (if a cell-line field does not exist) by the donor field.
         # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively
-        # look first for the donor field and then secondarily for the cell-line field.
+        # look first for the donor field and then secondarily for the cell-line field.
         global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR
         nonlocal nocells, nomixtures, nodonors, favor_donor
         aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR)
@@ -427,7 +427,6 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum
     if troubleshoot:
         add_info_for_troubleshooting(normalized_results, request)
-
     return normalized_results

@@ -439,7 +438,7 @@ def get_files(files, property_name, property_value, map_property_value = None):
         for file in files:
             if properties := get_properties(file, property_name):
                 if callable(map_property_value):
-                    mapped_properties = []
+                    mapped_properties = []
                     for value in properties:
                         mapped_properties.append(map_property_value(value))
                     properties = mapped_properties
@@ -527,7 +526,7 @@ def print_normalized_aggregation_results(data: dict,
                                          uuid_details: bool = False,
                                          nobold: bool = False,
                                          verbose: bool = False) -> None:
-
+
    """
    For development/troubleshooting only ...
    """
@@ -554,7 +553,7 @@ def get_hits(data: dict) -> List[str]:
                     hits.append(portal_hit)
             return hits
-
+
         def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]:
@@ -567,7 +566,7 @@ def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]:
                     property_value = ", ".join(property_values)
             return property_value
-
+
         def print_hit_property_values(hit: dict, property_name: str,

From 986067439e5c7f8342ed3df4408133c79556febf Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 11 Dec 2024 22:46:59 -0500
Subject: [PATCH 39/78] refactoring /recent_files_summary endpoint

---
 src/encoded/recent_files_summary.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index 0511d6ca6..59e6e95cf 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -22,6 +22,8 @@
 QUERY_INCLUDE_CURRENT_MONTH = True

 AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released"
+# FYI: Note that there is also file_sets.libraries.analytes.samples.sample_sources.display_title
+# and that sometimes file_sets.libraries.analytes.samples.sample_sources.code does not exist.
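# (Aside: get_properties, imported from encoded.endpoint_utils, is used
# throughout to pull values out of nested file metadata by dotted paths like
# these. Its implementation is not shown in this series; a minimal sketch of
# the assumed behavior, flattening intermediate lists along the path:
#
#     def get_properties(item, path):
#         values = [item]
#         for key in path.split("."):
#             next_values = []
#             for value in values:
#                 if isinstance(value, dict) and (key in value):
#                     value = value[key]
#                     next_values.extend(value if isinstance(value, list) else [value])
#             values = next_values
#         return values
# )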
AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code" AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" AGGREGATION_FIELD_DONOR = "donors.display_title" From 56a72f40342b1fb77b33e3d78b0ce9a5e6eeadcf Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 11:08:09 -0500 Subject: [PATCH 40/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 59e6e95cf..afedeb365 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -22,7 +22,7 @@ QUERY_INCLUDE_CURRENT_MONTH = True AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released" -# FYI: Note there there is also file_sets.libraries.analytes.samples.sample_sources.display_title +# FYI FWIW: There is also file_sets.libraries.analytes.samples.sample_sources.display_title; # and that sometimes file_sets.libraries.analytes.samples.sample_sources.code does not exist. AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code" AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" @@ -465,8 +465,10 @@ def annotate_with_uuids(normalized_results: dict): AGGREGATION_FIELD_RELEASE_DATE, AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, + # Some extra properties for troublehooting (as this whole thing is). "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", + "file_sets.libraries.analytes.samples.sample_sources.display_title", AGGREGATION_FIELD_DONOR, AGGREGATION_FIELD_FILE_DESCRIPTOR ] @@ -545,7 +547,7 @@ def print_results(data: dict, indent: int = 0) -> None: nonlocal title, uuids, uuid_details, nobold, verbose - nonlocal aggregation_fields, red, green, gray, bold + nonlocal aggregation_fields, red, green_bold, gray, bold nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark def get_hits(data: dict) -> List[str]: @@ -563,7 +565,7 @@ def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]: property_values = [] for property_value in property_value.split(","): if (property_value := property_value.strip()) == parent_grouping_value: - property_values.append(green(property_value)) + property_values.append(green_bold(property_value)) else: property_values.append(property_value) property_value = ", ".join(property_values) @@ -607,9 +609,13 @@ def print_hit_property_values(hit: dict, property_name: str, else: print(f"{spaces} {chars_dot} {uuid} {chars_check}") if uuid_details is True: - print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", f"{spaces} ") - print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", f"{spaces} ") - print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", f"{spaces} ") + prefix = f"{spaces} " + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", prefix) + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", prefix) + # Some extra for troubleshooting (as this whole thing is). 
+ print_hit_property_values(hit, "file_sets.libraries.analytes.samples.sample_sources.display_title", + "sample-sources-title", prefix) + print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", prefix) if isinstance(items := data.get("items"), list): for element in items: print_results(element, @@ -620,6 +626,7 @@ def print_hit_property_values(hit: dict, property_name: str, aggregation_fields = get_aggregation_fields(data) red = lambda text: terminal_color(text, "red") # noqa green = lambda text: terminal_color(text, "green") # noqa + green_bold = lambda text: terminal_color(text, "green", bold=True) # noqa gray = lambda text: terminal_color(text, "grey") # noqa bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text) chars_check = "✓" From 1fe7fbfc79280d57b0e9056ceefe579f67cf078d Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 11:19:59 -0500 Subject: [PATCH 41/78] minor /recent_files_summary refactor mostly for troubleshooting --- src/encoded/recent_files_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index afedeb365..cad61c995 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -573,9 +573,9 @@ def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]: def print_hit_property_values(hit: dict, property_name: str, label: Optional[str] = None, prefix: Optional[str] = None) -> None: - nonlocal aggregation_fields, chars_dot_hollow + nonlocal verbose, aggregation_fields, chars_dot_hollow, verbose if property_values := format_hit_property_values(hit, property_name): - if not label: + if (verbose is True) or (not label): label = property_name property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" if property_name not in aggregation_fields: From f84340aae650e72a96cf4ccfb976f71cd236508e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 13:03:07 -0500 Subject: [PATCH 42/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 69 +++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index cad61c995..99cd2b266 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,6 +1,6 @@ import pyramid from copy import deepcopy -from typing import List, Optional +from typing import Callable, List, Optional, Tuple from dcicutils.misc_utils import normalize_spaces from encoded.elasticsearch_utils import add_debugging_to_elasticsearch_aggregation_query from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query @@ -522,7 +522,7 @@ def annotate_with_uuids(normalized_results: dict): pass -def print_normalized_aggregation_results(data: dict, +def print_normalized_aggregation_results(normalized_results: dict, title: Optional[str] = None, parent_grouping_name: Optional[str] = None, parent_grouping_value: Optional[str] = None, @@ -558,23 +558,63 @@ def get_hits(data: dict) -> List[str]: hits.append(portal_hit) return hits - def format_hit_property_values(hit: dict, property_name: str) -> Optional[str]: - nonlocal parent_grouping_name, parent_grouping_value + def format_hit_property_values(hit: dict, property_name: str, + color: Optional[Callable] = None) -> Optional[str]: + nonlocal parent_grouping_name, parent_grouping_value, 
green, green_bold
            if property_value := hit.get(property_name):
                if property_name == parent_grouping_name:
                    property_values = []
                    for property_value in property_value.split(","):
                        if (property_value := property_value.strip()) == parent_grouping_value:
-                            property_values.append(green_bold(property_value))
+                            property_value = color(property_value) if callable(color) else green_bold(property_value)
+                            property_values.append(property_value)
                        else:
                            property_values.append(property_value)
                    property_value = ", ".join(property_values)
+            elif hit.get("elasticsearch_counted") is False:
+                counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid"))
+                if (counted_grouping_name == property_name) and (counted_grouping_value == property_value):
+                    property_value = green(property_value)
             return property_value

+        def find_where_aggregated_and_counted(uuid: str) -> Tuple[str, str]:
+
+            nonlocal normalized_results
+
+            def find_where(data: dict, uuid: str,
+                           parent_grouping_name: Optional[str] = None,
+                           parent_grouping_value: Optional[str] = None) -> List[Tuple[str, str]]:
+                found_uuid_grouping_names_and_values = set()
+                if isinstance(data, dict):
+                    grouping_name = data.get("name")
+                    grouping_value = data.get("value")
+                    if isinstance(items := data.get("items"), list):
+                        for item in items:
+                            if found := find_where(item, uuid,
+                                                   parent_grouping_name=grouping_name,
+                                                   parent_grouping_value=grouping_value):
+                                found_uuid_grouping_names_and_values.update(found)
+                    elif isinstance(hits := data.get("debug", {}).get("portal_hits"), list):
+                        for hit in hits:
+                            if hit.get("uuid") == uuid:
+                                if hit.get("elasticsearch_counted") is True:
+                                    found_uuid_grouping_names_and_values.add((parent_grouping_name, parent_grouping_value))
+                return found_uuid_grouping_names_and_values
+
+            if found_uuid_grouping_names_and_values := list(find_where(normalized_results, uuid)):
+                if len(found_uuid_grouping_names_and_values) > 0:
+                    if len(found_uuid_grouping_names_and_values) > 1:
+                        # Something is wrong; should only be at most one item with elasticsearch_counted set to True.
+ pass + return found_uuid_grouping_names_and_values[0] + return None, None + def print_hit_property_values(hit: dict, property_name: str, - label: Optional[str] = None, prefix: Optional[str] = None) -> None: + label: Optional[str] = None, + prefix: Optional[str] = None, + color: Optional[Callable] = None) -> None: nonlocal verbose, aggregation_fields, chars_dot_hollow, verbose - if property_values := format_hit_property_values(hit, property_name): + if property_values := format_hit_property_values(hit, property_name, color=color): if (verbose is True) or (not label): label = property_name property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" @@ -606,16 +646,18 @@ def print_hit_property_values(hit: dict, property_name: str, note = "" if hit.get("elasticsearch_counted") is False: print(red(f"{spaces} {chars_dot} {uuid} {chars_xmark} UNCOUNTED")) + color = red_bold else: print(f"{spaces} {chars_dot} {uuid} {chars_check}") + color = green_bold if uuid_details is True: prefix = f"{spaces} " - print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", prefix) - print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", prefix) + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", prefix=prefix, color=color) + print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", prefix=prefix, color=color) # Some extra for troubleshooting (as this whole thing is). print_hit_property_values(hit, "file_sets.libraries.analytes.samples.sample_sources.display_title", - "sample-sources-title", prefix) - print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", prefix) + "sample-sources-title", prefix=prefix, color=color) + print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", prefix=prefix, color=color) if isinstance(items := data.get("items"), list): for element in items: print_results(element, @@ -623,8 +665,9 @@ def print_hit_property_values(hit: dict, property_name: str, parent_grouping_value=grouping_value, indent=indent + 2) - aggregation_fields = get_aggregation_fields(data) + aggregation_fields = get_aggregation_fields(normalized_results) red = lambda text: terminal_color(text, "red") # noqa + red_bold = lambda text: terminal_color(text, "red", bold=True) # noqa green = lambda text: terminal_color(text, "green") # noqa green_bold = lambda text: terminal_color(text, "green", bold=True) # noqa gray = lambda text: terminal_color(text, "grey") # noqa @@ -636,4 +679,4 @@ def print_hit_property_values(hit: dict, property_name: str, chars_diamond = "❖" chars_rarrow_hollow = "▷" - print_results(data) + print_results(normalized_results) From 45917f5a6c4ce91eaf5f93d56880c1f5cab94be7 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 14:09:31 -0500 Subject: [PATCH 43/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 99cd2b266..ddb5daea3 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -560,7 +560,7 @@ def get_hits(data: dict) -> List[str]: def format_hit_property_values(hit: dict, property_name: str, color: Optional[Callable] = None) -> Optional[str]: - nonlocal parent_grouping_name, parent_grouping_value, green, green_bold + nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow if property_value := 
hit.get(property_name): if property_name == parent_grouping_name: property_values = [] @@ -574,7 +574,7 @@ def format_hit_property_values(hit: dict, property_name: str, elif hit.get("elasticsearch_counted") is False: counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): - property_value = green(property_value) + property_value = green(f"{property_value} {chars_larrow_hollow} COUNTED HERE") return property_value def find_where_aggregated_and_counted(uuid: str) -> Tuple[str, str]: @@ -678,5 +678,6 @@ def print_hit_property_values(hit: dict, property_name: str, chars_dot_hollow = "◦" chars_diamond = "❖" chars_rarrow_hollow = "▷" + chars_larrow_hollow = "◁" print_results(normalized_results) From 975f297a8bd5f37c0ced69c94cee5c033919615b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 16:43:17 -0500 Subject: [PATCH 44/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index ddb5daea3..f6f9e5804 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -529,6 +529,7 @@ def print_normalized_aggregation_results(normalized_results: dict, uuids: bool = False, uuid_details: bool = False, nobold: bool = False, + checks: bool = True, verbose: bool = False) -> None: """ @@ -536,6 +537,8 @@ def print_normalized_aggregation_results(normalized_results: dict, """ from hms_utils.terminal_utils import terminal_color + global AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, AGGREGATION_FIELD_DONOR + def get_aggregation_fields(data: dict) -> List[str]: if not isinstance(aggregation_fields := data.get("debug", {}).get("aggregation_query_fields"), list): aggregation_fields = [] @@ -550,7 +553,7 @@ def print_results(data: dict, nonlocal aggregation_fields, red, green_bold, gray, bold nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark - def get_hits(data: dict) -> List[str]: + def get_portal_hits(data: dict) -> List[dict]: hits = [] if isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): for portal_hit in portal_hits: @@ -635,11 +638,20 @@ def print_hit_property_values(hit: dict, property_name: str, elif not (isinstance(grouping := title, str) and grouping): grouping = "RESULTS" grouping = f"{chars_diamond} {grouping}" - hits = get_hits(data) if (uuids is True) else [] + hits = get_portal_hits(data) if (uuids is True) else [] if isinstance(count := data.get("count"), int): note = "" if len(hits) > count: note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + elif checks is True: + if isinstance(items := data.get("items"), list): + subcount = 0 + for item in items: + if isinstance(subcount_item := item.get("count"), int): + subcount += subcount_item + note = f" {chars_check}" if subcount == count else f" {chars_xmark}" + else: + note = f" {chars_check}" print(f"{spaces}{grouping}: {count}{note}") for hit in hits: if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: @@ -652,9 +664,10 @@ def print_hit_property_values(hit: dict, property_name: str, color = green_bold if uuid_details is True: prefix = f"{spaces} " + # Show property values for troubleshooting (as this whole thing is); + # see 
add_info_for_troubleshooting.annotate_with_uuids. print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", prefix=prefix, color=color) print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", prefix=prefix, color=color) - # Some extra for troubleshooting (as this whole thing is). print_hit_property_values(hit, "file_sets.libraries.analytes.samples.sample_sources.display_title", "sample-sources-title", prefix=prefix, color=color) print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", prefix=prefix, color=color) From 6eba0a8eb167390dcbd02c2bfda6d0c87a059a70 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 12 Dec 2024 18:13:54 -0500 Subject: [PATCH 45/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 83566a5d5..1a51a6db0 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -125,6 +125,10 @@ def parse_datetime_string(value: Union[str, datetime, date], # Special case to accept for example "2024-10" to mean "2024-10-01". value = f"{value}-01" last_day_of_month = last_day_of_month_if_no_day + elif (len(value) == 6) and value[0:4].isdigit() and value[4:].isdigit(): + # Special case to accept for example "202410" to mean "2024-10-01". + value = f"{value[0:4]}-{value[4:]}-01" + last_day_of_month = last_day_of_month_if_no_day elif (len(value) == 7) and (value[2] == "/") and value[0:2].isdigit() and value[3:].isdigit(): # Special case to accept for example "11/2024" to mean "2024-11-01". value = f"{value[3:]}-{value[0:2]}-01" From 77a01712716ddd6159f4778ecd54fc046d034bac Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 13 Dec 2024 00:08:43 -0500 Subject: [PATCH 46/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 31 ++++++++++++++++++++++------- src/encoded/recent_files_summary.py | 20 +++++++++++-------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 1a51a6db0..3a026722c 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -1,17 +1,17 @@ import calendar from datetime import date, datetime from dateutil.relativedelta import relativedelta -import pyramid +from pyramid.request import Request as PyramidRequest from typing import Any, List, Optional, Tuple, Union from urllib.parse import urlencode from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string -def request_arg(request: pyramid.request.Request, name: str, fallback: Optional[str] = None) -> Optional[str]: +def request_arg(request: PyramidRequest, name: str, fallback: Optional[str] = None) -> Optional[str]: return str(value).strip() if (value := request.params.get(name, None)) is not None else fallback -def request_arg_int(request: pyramid.request.Request, name: str, fallback: Optional[int] = 0) -> Optional[Any]: +def request_arg_int(request: PyramidRequest, name: str, fallback: Optional[int] = 0) -> Optional[Any]: if (value := request_arg(request, name)) is not None: try: return int(value) @@ -20,11 +20,11 @@ def request_arg_int(request: pyramid.request.Request, name: str, fallback: Optio return fallback -def request_arg_bool(request: pyramid.request.Request, name: str, fallback: Optional[bool] = False) -> Optional[bool]: +def request_arg_bool(request: PyramidRequest, name: str, fallback: Optional[bool] = False) -> 
Optional[bool]:
    return fallback if (value := request_arg(request, name)) is None else (value.lower() == "true")


-def request_args(request: pyramid.request.Request,
+def request_args(request: PyramidRequest,
                  name: str, fallback: Optional[str] = None, duplicates: bool = False) -> List[str]:
     args = []
     if isinstance(value := request.params.getall(name), list):
@@ -70,7 +70,22 @@ def parse_date_range_related_arguments(
     nmonths arguments represents a non-zero integer, in which case the returned from/thru dates will represent
     the past (absolute value) nmonths months starting with the month previous to the month of "today"; however
     if the include_current_month is True it is rather the past nmonths starting with the month of "today".
+
+    FYI WRT smaht-portal/elasticsearch behavior and dates, when using a query like date_created.from=2024-10-31
+    and date_created.to=2024-12-31, what is actually passed to the elasticsearch filter/range query looks like:
+
+        "range": {
+            "date_created": {
+                "gte": "2024-10-31 00:00",
+                "lte": "2024-12-31 23:59"
+            }
+        }
+
+    I.e. the "from" date is from the very BEGINNING of the date/day (00:00) and matched greater-than-or-EQUAL,
+    and the "thru" date is thru the very END of the date/day (23:59). This is actually done by the method
+    snovault.search.lucene_builder.LuceneBuilder.handle_range_filters.
     """
+    include_current_month = include_current_month is True
     from_date = parse_datetime_string(from_date, notz=True)
     thru_date = parse_datetime_string(thru_date, last_day_of_month_if_no_day=True, notz=True)
     if not isinstance(nmonths, int):
@@ -93,14 +108,16 @@ def parse_date_range_related_arguments(
         from_date = _add_months(thru_date, nmonths)
     elif nmonths == 0:
         from_date = _get_first_date_of_month(thru_date)
-    elif isinstance(nmonths, int) and ((nmonths := abs(nmonths)) != 0):
+    elif ((nmonths := abs(nmonths)) != 0) or include_current_month:
         # If no (valid) from/thru dates given, but the absolute value of nmonths is a non-zero integer, then returns
         # from/thru dates for the last nmonths month ending with the last day of month previous to the current month.
# thru_date = _add_months(_get_last_date_of_month(), -1) thru_date = _get_last_date_of_month() - if include_current_month is not True: + if not include_current_month: thru_date = _add_months(thru_date, -1) + nmonths -= 1 from_date = _add_months(thru_date, -nmonths) + from_date = _get_first_date_of_month(from_date) if strings is True: return (from_date.strftime(f"%Y-%m-%d") if from_date else None, thru_date.strftime(f"%Y-%m-%d") if thru_date else None) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index f6f9e5804..7e230e874 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,4 +1,4 @@ -import pyramid +from pyramid.request import Request as PyramidRequest from copy import deepcopy from typing import Callable, List, Optional, Tuple from dcicutils.misc_utils import normalize_spaces @@ -37,7 +37,7 @@ BASE_SEARCH_QUERY = "/search/" -def recent_files_summary(request: pyramid.request.Request) -> dict: +def recent_files_summary(request: PyramidRequest) -> dict: """ This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, by default, info for files released withing the past three months grouped by release-date, @@ -102,7 +102,7 @@ def get_aggregation_field_grouping_cell_or_donor() -> List[str]: aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR) return aggregation_field_grouping_cell_or_donor - def create_base_query_arguments(request: pyramid.request.Request) -> dict: + def create_base_query_arguments(request: PyramidRequest) -> dict: global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES @@ -118,7 +118,7 @@ def create_base_query_arguments(request: pyramid.request.Request) -> dict: return {key: value for key, value in base_query_arguments.items() if value is not None} - def create_query(request: pyramid.request.Request, base_query_arguments: Optional[dict] = None) -> str: + def create_query_arguments(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: global BASE_SEARCH_QUERY, QUERY_RECENT_MONTHS, QUERY_INCLUDE_CURRENT_MONTH nonlocal date_property_name @@ -138,7 +138,9 @@ def create_query(request: pyramid.request.Request, base_query_arguments: Optiona if isinstance(base_query_arguments, dict): query_arguments = {**base_query_arguments, **query_arguments} + return query_arguments + def create_query(query_arguments: Optional[dict] = None) -> str: return f"{BASE_SEARCH_QUERY}?{create_query_string(query_arguments)}" def create_aggregation_query(aggregation_fields: List[str]) -> dict: @@ -250,7 +252,7 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa return aggregation_query[date_property_name] - def execute_aggregation_query(request: pyramid.request.Request, query: str, aggregation_query: dict) -> str: + def execute_aggregation_query(request: PyramidRequest, query: str, aggregation_query: dict) -> str: query += "&from=0&limit=0" # needed for aggregation query to not return the actual/individual item results. 
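    # FYI: at the elasticsearch level this amounts to the usual aggregation-only request shape,
    # i.e. (a sketch, with the query/aggs bodies elided) something like:
    #     {"from": 0, "size": 0, "query": {...}, "aggs": {...}}
    # so from=0&limit=0 suppresses only the individual hits, not the aggregation buckets.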
request = snovault_make_search_subreq(request, path=query, method="GET") results = snovault_search(None, request, custom_aggregations=aggregation_query) @@ -297,7 +299,8 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor() base_query_arguments = create_base_query_arguments(request) - query = create_query(request, base_query_arguments) + query_arguments = create_query_arguments(request, base_query_arguments) + query = create_query(query_arguments) if not legacy: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" @@ -328,7 +331,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum } if debug_query: - return {"query": query, "aggregation_query": aggregation_query} + return {"query": query, "query_arguments": query_arguments, "aggregation_query": aggregation_query} raw_results = execute_aggregation_query(request, query, aggregation_query) @@ -398,6 +401,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum additional_properties = { "debug": { "query": query, + "query_arguments": query_arguments, "aggregation_query_fields": [ AGGREGATION_FIELD_RELEASE_DATE, *get_aggregation_field_grouping_cell_or_donor(), @@ -433,7 +437,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum return normalized_results -def add_info_for_troubleshooting(normalized_results: dict, request: pyramid.request.Request) -> None: +def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None: def get_files(files, property_name, property_value, map_property_value = None): found = [] From 2b0ad4b9797cf574769ffafdc045d47ed66eaa67 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 13 Dec 2024 11:10:59 -0500 Subject: [PATCH 47/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 45 ++++++++------ src/encoded/recent_files_summary.py | 25 +++++--- src/encoded/tests/test_endpoint_utils.py | 78 ++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 28 deletions(-) create mode 100644 src/encoded/tests/test_endpoint_utils.py diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 3a026722c..c1f6a8c08 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -45,7 +45,7 @@ def parse_date_range_related_arguments( from_date: Optional[Union[str, datetime, date]], thru_date: Optional[Union[str, datetime, date]], nmonths: Optional[Union[str, int]] = None, - include_current_month: bool = True, + include_current_month: Optional[bool] = True, strings: bool = False) -> Tuple[Optional[Union[str, datetime]], Optional[Union[str, datetime]]]: """ @@ -53,8 +53,11 @@ def parse_date_range_related_arguments( Given dates may be datetime or date objects or strings. Returned dates are datetime objects, or if the the given strings arguments is True, then strings (formatted as YYYY-MM-DD). - If both of the given from/thru dates are specified/valid then those are returned - and the given nmonths argument is not used. + If BOTH of the given from/thru dates are specified/valid then those are parsed and returned; + and the given nmonths and include_current_month arguments are NOT used in this case. + + Note that the include_current_month argument is used ONLY if NEITHER from NOR thru date + are specified; and note that its default value is True. 
If only the given from date is specified then a None thru date is returned, UNLESS the given nmonths argument represents a positive integer, in which case the returned thru date will be nmonths months @@ -85,42 +88,48 @@ def parse_date_range_related_arguments( to and the "thru" date is thru the very END of the date/day (23:59). This is actually done by the method snovault.search.lucene_builder.LuceneBuilder.handle_range_filters. """ - include_current_month = include_current_month is True from_date = parse_datetime_string(from_date, notz=True) thru_date = parse_datetime_string(thru_date, last_day_of_month_if_no_day=True, notz=True) - if not isinstance(nmonths, int): - if isinstance(nmonths, str) and (nmonths := nmonths.strip()): - try: - nmonths = int(nmonths) - except Exception: + if nmonths is None: + nmonths = 0 + nmonths_none = True + else: + nmonths_none = False + if not isinstance(nmonths, int): + if isinstance(nmonths, str) and (nmonths := nmonths.strip()): + try: + nmonths = int(nmonths) + except Exception: + nmonths = 0 + else: nmonths = 0 - else: - nmonths = 0 if from_date: if (not thru_date) and isinstance(nmonths, int): if nmonths > 0: thru_date = _add_months(from_date, nmonths) - elif nmonths == 0: + elif (nmonths == 0) and (not nmonths_none): thru_date = _get_last_date_of_month(from_date) elif thru_date: if isinstance(nmonths, int): if nmonths < 0: from_date = _add_months(thru_date, nmonths) - elif nmonths == 0: + elif (nmonths == 0) and (not nmonths_none): from_date = _get_first_date_of_month(thru_date) - elif ((nmonths := abs(nmonths)) != 0) or include_current_month: + elif ((nmonths := abs(nmonths)) != 0) or (include_current_month is not False): # If no (valid) from/thru dates given, but the absolute value of nmonths is a non-zero integer, then returns # from/thru dates for the last nmonths month ending with the last day of month previous to the current month. 
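        # For example (assuming "today" is 2024-11-06, per the tests added below): nmonths=3 with
        # include_current_month=False yields 2024-08-01 thru 2024-10-31, and with
        # include_current_month=True yields 2024-08-01 thru 2024-11-30.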
# thru_date = _add_months(_get_last_date_of_month(), -1) thru_date = _get_last_date_of_month() - if not include_current_month: - thru_date = _add_months(thru_date, -1) + if include_current_month is False: + thru_date = _get_last_date_of_month(_add_months(thru_date, -1)) nmonths -= 1 from_date = _add_months(thru_date, -nmonths) from_date = _get_first_date_of_month(from_date) if strings is True: - return (from_date.strftime(f"%Y-%m-%d") if from_date else None, - thru_date.strftime(f"%Y-%m-%d") if thru_date else None) + from_date = from_date.strftime(f"%Y-%m-%d") if from_date else None + thru_date = thru_date.strftime(f"%Y-%m-%d") if thru_date else None + if from_date and thru_date and thru_date < from_date: + from_date, thru_date = thru_date, from_date return from_date, thru_date diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 7e230e874..a1a71fe69 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -533,7 +533,8 @@ def print_normalized_aggregation_results(normalized_results: dict, uuids: bool = False, uuid_details: bool = False, nobold: bool = False, - checks: bool = True, + checks: bool = False, + query: bool = False, verbose: bool = False) -> None: """ @@ -553,7 +554,7 @@ def print_results(data: dict, parent_grouping_value: Optional[str] = None, indent: int = 0) -> None: - nonlocal title, uuids, uuid_details, nobold, verbose + nonlocal title, uuids, uuid_details, nobold, query, verbose nonlocal aggregation_fields, red, green_bold, gray, bold nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark @@ -647,16 +648,20 @@ def print_hit_property_values(hit: dict, property_name: str, note = "" if len(hits) > count: note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") - elif checks is True: - if isinstance(items := data.get("items"), list): - subcount = 0 - for item in items: - if isinstance(subcount_item := item.get("count"), int): - subcount += subcount_item - note = f" {chars_check}" if subcount == count else f" {chars_xmark}" - else: + elif isinstance(items := data.get("items"), list): + subcount = 0 + for item in items: + if isinstance(subcount_item := item.get("count"), int): + subcount += subcount_item + if subcount != count: + note = f" {chars_xmark}" + elif checks is True: note = f" {chars_check}" + elif checks: + note = f" {chars_check}" print(f"{spaces}{grouping}: {count}{note}") + if (query is True) and (query_string := data.get("query")): + print(f"{spaces} {query_string}") for hit in hits: if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: note = "" diff --git a/src/encoded/tests/test_endpoint_utils.py b/src/encoded/tests/test_endpoint_utils.py new file mode 100644 index 000000000..b877a8b62 --- /dev/null +++ b/src/encoded/tests/test_endpoint_utils.py @@ -0,0 +1,78 @@ +from contextlib import contextmanager +import datetime +from typing import Optional, Union +from unittest.mock import patch as mock_patch +from encoded.endpoint_utils import parse_date_range_related_arguments, parse_datetime_string + +DEFAULT_MOCK_DATETIME_TODAY_VALUE = "2024-11-06 07:54:16" + + +def test_parse_date_range_related_arguments_sans_from_thru_dates(): + + def testf(nmonths, include_current_month): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified (this case). 
+ return parse_date_range_related_arguments(None, None, nmonths=nmonths, + include_current_month=include_current_month, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf(nmonths=3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=-3, include_current_month=True) == ("2024-08-01", "2024-11-30") + assert testf(nmonths=-3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=1, include_current_month=False) == ("2024-10-01", "2024-10-31") + assert testf(nmonths=1, include_current_month=True) == ("2024-10-01", "2024-11-30") + assert testf(nmonths=0, include_current_month=False) == (None, None) + assert testf(nmonths=0, include_current_month=True) == ("2024-11-01", "2024-11-30") + + +def test_parse_date_range_related_arguments_with_from_thru_dates(): + + def testf(from_date, thru_date): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. + return parse_date_range_related_arguments(from_date, thru_date, nmonths=None, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-05-16", "2024-08-29") == ("2024-05-16", "2024-08-29") + assert testf("2024-08-29", "2024-05-16") == ("2024-05-16", "2024-08-29") + assert testf("2024-11-04", "2035-10-06") == ("2024-11-04", "2035-10-06") + + +def test_parse_date_range_related_arguments_with_from_date(): + + def testf(from_date, nmonths): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. + return parse_date_range_related_arguments(from_date, None, nmonths=nmonths, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-06-24", nmonths=None) == ("2024-06-24", None) + assert testf("2024-06-24", nmonths=0) == ("2024-06-24", "2024-06-30") + assert testf("2024-06-24", nmonths=1) == ("2024-06-24", "2024-07-24") + + +def test_parse_date_range_related_arguments_with_thru_date(): + + def testf(thru_date, nmonths): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. 
+ return parse_date_range_related_arguments(None, thru_date, nmonths=nmonths, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-06-24", nmonths=None) == (None, "2024-06-24") + assert testf("2024-06-24", nmonths=0) == ("2024-06-01", "2024-06-24") + assert testf("2024-06-24", nmonths=-1) == ("2024-05-24", "2024-06-24") + + +@contextmanager +def mocked_datetime_today(value: Optional[Union[str, datetime.datetime]] = DEFAULT_MOCK_DATETIME_TODAY_VALUE): + if isinstance(value, str): + value = parse_datetime_string(value) + if not isinstance(value, datetime.datetime): + raise Exception("Error using mocked_datetime_today function!") + class MockDateTime(datetime.datetime): # noqa + @classmethod + def today(cls): + nonlocal value ; return value # noqa + with (mock_patch("encoded.endpoint_utils.datetime", MockDateTime), mock_patch("datetime.datetime", MockDateTime)): + yield From 66cec6ea51226a2d886e1e55f166e108057adead Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 13 Dec 2024 12:17:15 -0500 Subject: [PATCH 48/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 25 ++++++++++++++++++++- src/encoded/recent_files_summary.py | 34 +++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index c1f6a8c08..2f22f6717 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -3,7 +3,7 @@ from dateutil.relativedelta import relativedelta from pyramid.request import Request as PyramidRequest from typing import Any, List, Optional, Tuple, Union -from urllib.parse import urlencode +from urllib.parse import parse_qs, urlencode from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string @@ -173,6 +173,20 @@ def parse_datetime_string(value: Union[str, datetime, date], return value +def get_date_range_for_month( + date: Union[str, datetime, date], + strings: bool = False) -> Tuple[Optional[Union[str, datetime]], Optional[Union[str, datetime]]]: + if date := parse_datetime_string(date, notz=True): + from_date = _get_first_date_of_month(date) + thru_date = _get_last_date_of_month(date) + if strings is True: + from_date = from_date.strftime(f"%Y-%m-%d") if from_date else None + thru_date = thru_date.strftime(f"%Y-%m-%d") if thru_date else None + else: + from_date = thru_date = None + return from_date, thru_date + + def _get_first_date_of_month(day: Optional[Union[datetime, date, str]] = None) -> datetime: """ Returns a datetime object representing the first day of the month of the given date; @@ -222,6 +236,15 @@ def create_query_string(query_arguments: dict, base: Optional[str] = None) -> st return query_string +def deconstruct_query_string(query_string: str) -> dict: + if isinstance(query_string, str): + if (question_mark_index := query_string.find("?")) >= 0: + query_string = query_string[question_mark_index + 1:] + query_string = query_string.replace("%21=", "=%21") + return {key: value[0] if len(value) == 1 else value for key, value in parse_qs(query_string).items()} + return {} + + def get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: bool = False) -> List[Any]: """ TODO: Move this to dcicutils. Maybe much of the above too. 
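FYI, a minimal sanity sketch of the two helpers added above (the example values are
illustrative assumptions, not taken from the patch):

    from encoded.endpoint_utils import deconstruct_query_string, get_date_range_for_month

    # get_date_range_for_month expands a year/month (or any date within that month)
    # into the month's first/last days:
    assert get_date_range_for_month("2024-12", strings=True) == ("2024-12-01", "2024-12-31")

    # deconstruct_query_string inverts create_query_string; single-valued parameters
    # collapse to plain strings while repeated parameters stay lists:
    assert deconstruct_query_string("/search/?type=OutputFile&status=released&status=public") == {
        "type": "OutputFile", "status": ["released", "public"]}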
diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py
index a1a71fe69..7613608dc 100644
--- a/src/encoded/recent_files_summary.py
+++ b/src/encoded/recent_files_summary.py
@@ -9,7 +9,8 @@
 from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results
 from encoded.elasticsearch_utils import sort_normalized_aggregation_results
 from encoded.elasticsearch_utils import AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE
-from encoded.endpoint_utils import create_query_string, parse_date_range_related_arguments
+from encoded.endpoint_utils import create_query_string, deconstruct_query_string
+from encoded.endpoint_utils import get_date_range_for_month, parse_date_range_related_arguments
 from encoded.endpoint_utils import get_properties, parse_datetime_string
 from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int
 from snovault.search.search import search as snovault_search
@@ -140,8 +141,10 @@
             query_arguments = {**base_query_arguments, **query_arguments}
         return query_arguments

-    def create_query(query_arguments: Optional[dict] = None) -> str:
-        return f"{BASE_SEARCH_QUERY}?{create_query_string(query_arguments)}"
+    def create_query(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str:
+        query_arguments = create_query_arguments(request, base_query_arguments)
+        query_string = create_query_string(query_arguments)
+        return f"{BASE_SEARCH_QUERY}?{query_string}"

     def create_aggregation_query(aggregation_fields: List[str]) -> dict:

@@ -279,8 +282,10 @@
         if value := normalized_results.get("value"):
             if name == date_property_name:
                 # Special case for date value which is just year/month (e.g. 2024-12);
-                # we want to turn this into a date range query for the month.
-                from_date, thru_date = parse_date_range_related_arguments(value, None, strings=True)
+                # we want to turn this into a date range query for the month; actually
+                # this is not a special case, this is the NORMAL case we are dealing with.
+                # from_date, thru_date = parse_date_range_related_arguments(value, None, nmonths=0, strings=True)
+                from_date, thru_date = get_date_range_for_month(value, strings=True)
                 if from_date and thru_date:
                     base_query_arguments = {**base_query_arguments,
                                             f"{name}.from": from_date, f"{name}.to": thru_date}
@@ -298,9 +303,11 @@
                 add_queries_to_normalized_results(element, base_query_arguments)

     aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor()
+    # The base_query_arguments does not contain the from/thru dates, as it is used to construct
+    # the query-string for the individually grouped items, which will have from/thru dates
+    # specifically representing their place within the group.
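+    # E.g. (hypothetical values) a grouped item for the month 2024-12 ends up with a query-string like:
+    #     /search/?type=OutputFile&file_status_tracking.released.from=2024-12-01&file_status_tracking.released.to=2024-12-31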
base_query_arguments = create_base_query_arguments(request) - query_arguments = create_query_arguments(request, base_query_arguments) - query = create_query(query_arguments) + query = create_query(request, base_query_arguments) if not legacy: aggregate_by_cell_line_property_name = "aggregate_by_cell_line" @@ -331,7 +338,16 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum } if debug_query: - return {"query": query, "query_arguments": query_arguments, "aggregation_query": aggregation_query} + return { + "query": query, + "query_arguments": deconstruct_query_string(query), + "aggregation_query_fields": [ + AGGREGATION_FIELD_RELEASE_DATE, + *get_aggregation_field_grouping_cell_or_donor(), + AGGREGATION_FIELD_FILE_DESCRIPTOR + ], + "aggregation_query": aggregation_query + } raw_results = execute_aggregation_query(request, query, aggregation_query) @@ -401,7 +417,7 @@ def add_queries_to_normalized_results(normalized_results: dict, base_query_argum additional_properties = { "debug": { "query": query, - "query_arguments": query_arguments, + "query_arguments": deconstruct_query_string(query), "aggregation_query_fields": [ AGGREGATION_FIELD_RELEASE_DATE, *get_aggregation_field_grouping_cell_or_donor(), From 0c1195fd48474f19399db7fe0dc186b2e8e020f2 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 13 Dec 2024 12:32:53 -0500 Subject: [PATCH 49/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 7613608dc..0f98133c1 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -598,7 +598,7 @@ def format_hit_property_values(hit: dict, property_name: str, elif hit.get("elasticsearch_counted") is False: counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): - property_value = green(f"{property_value} {chars_larrow_hollow} COUNTED HERE") + property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE") return property_value def find_where_aggregated_and_counted(uuid: str) -> Tuple[str, str]: From c546ca21076a474a9e8d195b6b2f2c9753a6437f Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 13 Dec 2024 16:47:19 -0500 Subject: [PATCH 50/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 48 ++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 0f98133c1..4c7de06ef 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -75,6 +75,7 @@ def recent_files_summary(request: PyramidRequest) -> dict: nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor")) favor_donor = request_arg_bool(request, "favor_donor") + multi = request_arg_bool(request, "multi") nosort = request_arg_bool(request, "nosort") legacy = request_arg_bool(request, "legacy") debug = request_arg_bool(request, "debug") @@ -160,7 +161,7 @@ def create_aggregation_query(aggregation_fields: List[str]) -> dict: return {} def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal 
aggregation_field_grouping_cell_or_donor, date_property_name + nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, multi if field == date_property_name: return { "date_histogram": { @@ -179,11 +180,34 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa for aggregation_field_grouping_index in range(len(aggregation_field_grouping_cell_or_donor)): aggregation_field = aggregation_field_grouping_cell_or_donor[aggregation_field_grouping_index] if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" - script += f""" - {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ - return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; - }} - """ + # Note that if there are multiple values for the aggregation field just the "first" one will be chosen; + # where "first" means which was indexed first, which from an application POV is kind of arbitrary. + # If we want to make it more deterministic we could order the results (say) alphabetically like so: + # def value = doc['embedded.{aggregation_field}.raw'].stream().min((a, b) -> a.compareTo(b)).get(); + # return '{aggregation_field}:' + value; + # OR, if we actually want to aggregation on ALL values we could collect the results and return all like so: + # def values = []; + # for (value in doc['embedded.{aggregation_field}.raw']) { + # values.add('{aggregation_field}:' + value); + # } + # return values; + # But then we'd get double counting and so on. We are told in any case that these groups should be distinct. + if not multi: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; + }} + """ + else: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + def values = []; + for (value in doc['embedded.{aggregation_field}.raw']) {{ + values.add('{aggregation_field}:' + value); + }} + return values; + }} + """ script += f""" else {{ return 'unknown'; @@ -585,6 +609,10 @@ def get_portal_hits(data: dict) -> List[dict]: def format_hit_property_values(hit: dict, property_name: str, color: Optional[Callable] = None) -> Optional[str]: nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow + if hit.get("elasticsearch_counted") is False: + counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) + else: + counted_grouping_name, counted_grouping_value = (None, None) if property_value := hit.get(property_name): if property_name == parent_grouping_name: property_values = [] @@ -593,7 +621,11 @@ def format_hit_property_values(hit: dict, property_name: str, property_value = color(property_value) if callable(color) else green_bold(property_value) property_values.append(property_value) else: - property_values.append(property_value) + if (counted_grouping_name, counted_grouping_value) == (property_name, property_value): + property_values.append(green_bold(f"{property_value} {chars_larrow_hollow}") + + green(" COUNTED HERE")) + else: + property_values.append(property_value) property_value = ", ".join(property_values) elif hit.get("elasticsearch_counted") is False: counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) @@ -670,7 +702,7 @@ def print_hit_property_values(hit: dict, property_name: str, if isinstance(subcount_item := item.get("count"), int): subcount += subcount_item if subcount != count: - 
note = f" {chars_xmark}" + note = red(f" {chars_xmark} ACTUAL COUNT: {subcount}") elif checks is True: note = f" {chars_check}" elif checks: From 77f0537586c9be07e93b7c45cba558e2b5361fcc Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 12:04:19 -0500 Subject: [PATCH 51/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 87 ++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 4c7de06ef..6ec3e6de7 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -277,6 +277,9 @@ def create_field_aggregation(field: str) -> Optional[dict]: # noqa include_missing=include_missing, create_field_aggregation=create_field_aggregation) + if troubleshoot_elasticsearch: + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + return aggregation_query[date_property_name] def execute_aggregation_query(request: PyramidRequest, query: str, aggregation_query: dict) -> str: @@ -509,7 +512,7 @@ def annotate_with_uuids(normalized_results: dict): AGGREGATION_FIELD_RELEASE_DATE, AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, - # Some extra properties for troublehooting (as this whole thing is). + # Store some extra properties for troublehooting (as this whole thing is). "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", "file_sets.libraries.analytes.samples.sample_sources.display_title", @@ -582,21 +585,60 @@ def print_normalized_aggregation_results(normalized_results: dict, """ from hms_utils.terminal_utils import terminal_color - global AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, AGGREGATION_FIELD_DONOR - - def get_aggregation_fields(data: dict) -> List[str]: - if not isinstance(aggregation_fields := data.get("debug", {}).get("aggregation_query_fields"), list): + def get_aggregation_fields(normalized_results: dict) -> List[str]: + # Returns all noted/important aggregation fields which ARE actually being used by the query; + # we only are interested in ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + # which is all of the possible sample-source/cell-line/donor aggregations. + global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR + if not isinstance(aggregation_fields := + normalized_results.get("debug", {}).get("aggregation_query_fields"), list): aggregation_fields = [] + for aggregation_field in aggregation_fields: + # Remove the ones we are not interested in reporting on. + if aggregation_field not in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: + aggregation_fields.remove(aggregation_field) return aggregation_fields + def get_unused_aggregation_fields(normalized_results: dict) -> List[str]: + # Returns all noted/important aggregation fields which are NOT actually being used by the query; + # we only are interested in ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + # which is all of the possible sample-source/cell-line/donor aggregations. 
+ global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR + unused_aggregation_fields = [] + aggregation_fields = get_aggregation_fields(normalized_results) + for aggregation_field in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: + if aggregation_field not in aggregation_fields: + unused_aggregation_fields.append(aggregation_field) + unused_aggregation_fields.append( + "file_sets.libraries.analytes.samples.sample_sources.display_title") + return unused_aggregation_fields + + def get_aggregation_field_labels() -> dict: + global AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, AGGREGATION_FIELD_DONOR + return { + AGGREGATION_FIELD_CELL_MIXTURE: "sample-sources", + AGGREGATION_FIELD_CELL_LINE: "cell-lines", + AGGREGATION_FIELD_DONOR: "donors", + "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" + } + + def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: + aggregation_fields = get_aggregation_fields(normalized_results) + unused_aggregation_fields = get_unused_aggregation_fields(normalized_results) + aggregation_fields_to_print = aggregation_fields + unused_aggregation_fields + for aggregation_field_label in get_aggregation_field_labels(): + if aggregation_field_label not in aggregation_fields_to_print: + aggregation_field_labels.append(aggregation_field_label) + return aggregation_fields_to_print + def print_results(data: dict, parent_grouping_name: Optional[str] = None, parent_grouping_value: Optional[str] = None, indent: int = 0) -> None: nonlocal title, uuids, uuid_details, nobold, query, verbose - nonlocal aggregation_fields, red, green_bold, gray, bold - nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark + nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green_bold, gray, bold + nonlocal aggregation_fields, aggregation_fields_to_print, aggregation_field_labels def get_portal_hits(data: dict) -> List[dict]: hits = [] @@ -669,14 +711,20 @@ def print_hit_property_values(hit: dict, property_name: str, label: Optional[str] = None, prefix: Optional[str] = None, color: Optional[Callable] = None) -> None: - nonlocal verbose, aggregation_fields, chars_dot_hollow, verbose - if property_values := format_hit_property_values(hit, property_name, color=color): - if (verbose is True) or (not label): - label = property_name + nonlocal aggregation_fields, aggregation_field_labels, chars_dot_hollow, chars_null, verbose + if not label: + label = aggregation_field_labels.get(property_name) + if (verbose is True) or (not label): + label = property_name + property_values = format_hit_property_values(hit, property_name, color=color) + if not property_values: + property_values = chars_null + if property_name not in aggregation_fields: property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" - if property_name not in aggregation_fields: - property_description = gray(property_description) - print(property_description) + property_description = gray(property_description) + else: + property_description = f"{prefix or ''}{chars_dot} {label}: {property_values}" + print(property_description) if not (isinstance(data, dict) and data): return @@ -723,11 +771,8 @@ def print_hit_property_values(hit: dict, property_name: str, prefix = f"{spaces} " # Show property values for troubleshooting (as this whole thing is); # see add_info_for_troubleshooting.annotate_with_uuids. 
- print_hit_property_values(hit, AGGREGATION_FIELD_CELL_MIXTURE, "sample-sources", prefix=prefix, color=color) - print_hit_property_values(hit, AGGREGATION_FIELD_CELL_LINE, "cell-lines", prefix=prefix, color=color) - print_hit_property_values(hit, "file_sets.libraries.analytes.samples.sample_sources.display_title", - "sample-sources-title", prefix=prefix, color=color) - print_hit_property_values(hit, AGGREGATION_FIELD_DONOR, "donors", prefix=prefix, color=color) + for aggregation_field in aggregation_fields_to_print: + print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) if isinstance(items := data.get("items"), list): for element in items: print_results(element, @@ -736,6 +781,9 @@ def print_hit_property_values(hit: dict, property_name: str, indent=indent + 2) aggregation_fields = get_aggregation_fields(normalized_results) + aggregation_fields_to_print = get_aggregation_fields_to_print(normalized_results) + aggregation_field_labels = get_aggregation_field_labels() + red = lambda text: terminal_color(text, "red") # noqa red_bold = lambda text: terminal_color(text, "red", bold=True) # noqa green = lambda text: terminal_color(text, "green") # noqa @@ -749,5 +797,6 @@ def print_hit_property_values(hit: dict, property_name: str, chars_diamond = "❖" chars_rarrow_hollow = "▷" chars_larrow_hollow = "◁" + chars_null = "∅" print_results(normalized_results) From 3448c9a3a00606611f07f0826cb850bd69a3c1d2 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 12:08:00 -0500 Subject: [PATCH 52/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 6ec3e6de7..0e33314c1 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -614,6 +614,7 @@ def get_unused_aggregation_fields(normalized_results: dict) -> List[str]: return unused_aggregation_fields def get_aggregation_field_labels() -> dict: + # Shorter/nicer names for aggregation fields of interest to print. global AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, AGGREGATION_FIELD_DONOR return { AGGREGATION_FIELD_CELL_MIXTURE: "sample-sources", @@ -626,6 +627,7 @@ def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: aggregation_fields = get_aggregation_fields(normalized_results) unused_aggregation_fields = get_unused_aggregation_fields(normalized_results) aggregation_fields_to_print = aggregation_fields + unused_aggregation_fields + # Look at get_aggregation_field_labels above for other/miscellaneous fields we want to print. 
for aggregation_field_label in get_aggregation_field_labels(): if aggregation_field_label not in aggregation_fields_to_print: aggregation_field_labels.append(aggregation_field_label) @@ -638,7 +640,7 @@ def print_results(data: dict, nonlocal title, uuids, uuid_details, nobold, query, verbose nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green_bold, gray, bold - nonlocal aggregation_fields, aggregation_fields_to_print, aggregation_field_labels + nonlocal aggregation_fields_to_print def get_portal_hits(data: dict) -> List[dict]: hits = [] From 414dfdec71eff2ea717bb41f4a7fcda29bcef816 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 12:29:30 -0500 Subject: [PATCH 53/78] refactoring /recent_files_summary endpoint --- src/encoded/endpoint_utils.py | 2 +- src/encoded/recent_files_summary.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 2f22f6717..832868f46 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -260,7 +260,7 @@ def get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: for key_index in range(nkeys): if (value := data.get(keys[key_index], None)) is not None: if key_index == key_index_max: - return [value] + return [value] if not isinstance(value, list) else value elif isinstance(value, dict): data = value continue diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 0e33314c1..c9299446a 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -630,7 +630,7 @@ def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: # Look at get_aggregation_field_labels above for other/miscellaneous fields we want to print. 
for aggregation_field_label in get_aggregation_field_labels(): if aggregation_field_label not in aggregation_fields_to_print: - aggregation_field_labels.append(aggregation_field_label) + aggregation_fields_to_print.append(aggregation_field_label) return aggregation_fields_to_print def print_results(data: dict, From c6e595dad1b631529e85850d1ae398e45dd0bf30 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 16:38:25 -0500 Subject: [PATCH 54/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 70 +++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index c9299446a..a3107b463 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,6 +1,6 @@ from pyramid.request import Request as PyramidRequest from copy import deepcopy -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple, Union from dcicutils.misc_utils import normalize_spaces from encoded.elasticsearch_utils import add_debugging_to_elasticsearch_aggregation_query from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query @@ -639,7 +639,7 @@ def print_results(data: dict, indent: int = 0) -> None: nonlocal title, uuids, uuid_details, nobold, query, verbose - nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green_bold, gray, bold + nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green, green_bold, gray, bold nonlocal aggregation_fields_to_print def get_portal_hits(data: dict) -> List[dict]: @@ -651,8 +651,9 @@ def get_portal_hits(data: dict) -> List[dict]: return hits def format_hit_property_values(hit: dict, property_name: str, - color: Optional[Callable] = None) -> Optional[str]: + color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]: nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow + counted_elsewhere = [] if hit.get("elasticsearch_counted") is False: counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) else: @@ -668,6 +669,7 @@ def format_hit_property_values(hit: dict, property_name: str, if (counted_grouping_name, counted_grouping_value) == (property_name, property_value): property_values.append(green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE")) + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) else: property_values.append(property_value) property_value = ", ".join(property_values) @@ -675,9 +677,14 @@ def format_hit_property_values(hit: dict, property_name: str, counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE") - return property_value + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) + return property_value, counted_elsewhere - def find_where_aggregated_and_counted(uuid: str) -> Tuple[str, str]: + def find_where_aggregated_and_counted( + uuid: str, + multiple: bool = False, + ignore: Optional[Union[List[Tuple[str, str]], + Tuple[str, str]]] = None) -> Union[Tuple[str, str], List[Tuple[str, str]]]: nonlocal normalized_results @@ -702,23 +709,30 @@ def find_where(data: dict, uuid: str, return 
found_uuid_grouping_names_and_values if found_uuid_grouping_names_and_values := list(find_where(normalized_results, uuid)): - if len(found_uuid_grouping_names_and_values) > 0: - if len(found_uuid_grouping_names_and_values) > 1: - # Something is wrong; should only be at most one iterm with elasticsearch_counted set to True. - pass - return found_uuid_grouping_names_and_values[0] - return None, None + if isinstance(ignore, tuple) and (len(ignore) == 2) and (ignore in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore) + elif isinstance(ignore, list): + for ignore_item in ignore: + if isinstance(ignore_item, tuple) and (len(ignore_item) == 2) and (ignore_item in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore_item) + if multiple is True: + return found_uuid_grouping_names_and_values + if len(found_uuid_grouping_names_and_values) > 1: + # Normally should only be at most one item with elasticsearch_counted set to True. + pass + return found_uuid_grouping_names_and_values[0] + return [(None, None)] if multiple is True else (None, None) def print_hit_property_values(hit: dict, property_name: str, label: Optional[str] = None, prefix: Optional[str] = None, - color: Optional[Callable] = None) -> None: + color: Optional[Callable] = None) -> List[Tuple[str, str]]: nonlocal aggregation_fields, aggregation_field_labels, chars_dot_hollow, chars_null, verbose if not label: label = aggregation_field_labels.get(property_name) if (verbose is True) or (not label): label = property_name - property_values = format_hit_property_values(hit, property_name, color=color) + property_values, counted_elsewhere = format_hit_property_values(hit, property_name, color=color) if not property_values: property_values = chars_null if property_name not in aggregation_fields: @@ -727,6 +741,7 @@ def print_hit_property_values(hit: dict, property_name: str, else: property_description = f"{prefix or ''}{chars_dot} {label}: {property_values}" print(property_description) + return counted_elsewhere if not (isinstance(data, dict) and data): return @@ -771,10 +786,37 @@ def print_hit_property_values(hit: dict, property_name: str, color = green_bold if uuid_details is True: prefix = f"{spaces} " + counted_elsewhere = [] # Show property values for troubleshooting (as this whole thing is); # see add_info_for_troubleshooting.annotate_with_uuids. for aggregation_field in aggregation_fields_to_print: - print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) + hit_counted_elsewhere = \ + print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) + if False and hit_counted_elsewhere: + counted_elsewhere.extend(hit_counted_elsewhere) + # See if also grouped elsewhere for our FYI. 
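To make the new ignore argument concrete, here is a minimal sketch of the filtering it performs; the (grouping-name, grouping-value) tuples below are made up purely for illustration:

    found = [("donors.display_title", "SOME_DONOR"),
             ("file_sets.libraries.analytes.samples.sample_sources.cell_line.code", "SOME_CELL_LINE")]
    ignore = ("donors.display_title", "SOME_DONOR")
    if isinstance(ignore, tuple) and (len(ignore) == 2) and (ignore in found):
        found.remove(ignore)
    # found now contains only the cell-line grouping tuple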
+ duplicative = hit.get("duplicative") + duplicates = duplicative - 1 if isinstance(duplicative, int) else 0 + counted_groupings = find_where_aggregated_and_counted( + hit.get("uuid"), multiple=True, + ignore=counted_elsewhere + [(parent_grouping_name, parent_grouping_value)]) + if counted_groupings: + message = f"{spaces} {green(chars_rarrow_hollow)} {green('ALSO COUNTED HERE')}:" + if verbose is True: + if duplicates > 0: + message += f" {duplicates}" + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} vs {len(counted_groupings)}") + print(message) + for counted_grouping in counted_groupings: + print(f"{spaces} - {counted_grouping[0]} {green(counted_grouping[1])}") + else: + counted_grouping_values = [green(counted_grouping[1]) for counted_grouping in counted_groupings] + message = f"{message} {', '.join(counted_grouping_values)}" + if duplicates > 0: + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} {duplicates} vs {len(counted_grouping_values)}") + print(message) if isinstance(items := data.get("items"), list): for element in items: print_results(element, From b87d83a478949ac67c125c2d3b645eb50a48b44f Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 16:38:54 -0500 Subject: [PATCH 55/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index a3107b463..b0e6170d7 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -792,7 +792,7 @@ def print_hit_property_values(hit: dict, property_name: str, for aggregation_field in aggregation_fields_to_print: hit_counted_elsewhere = \ print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) - if False and hit_counted_elsewhere: + if hit_counted_elsewhere: counted_elsewhere.extend(hit_counted_elsewhere) # See if also grouped elsewhere for our FYI. duplicative = hit.get("duplicative") From c4f97f914a5b88609987b4845406b5f00d531e63 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 14 Dec 2024 19:33:15 -0500 Subject: [PATCH 56/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 72 +++++++++++++++-------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index b0e6170d7..6a7ea52d5 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -507,20 +507,28 @@ def count_uuid(uuid_records: List[dict], uuid: str) -> int: count += 1 return count + def dedup_list(data: list) -> list: # noqa + return list(dict.fromkeys(data)) if isinstance(data, list) else [] + + aggregation_fields_for_troubleshooting = dedup_list([ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_CELL_LINE, + # Store some extra properties for troublehooting (as this whole thing is). 
+# "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", +# "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", + "file_sets.libraries.analytes.samples.sample_sources.display_title", + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ]) + def annotate_with_uuids(normalized_results: dict): - aggregation_fields = [ - AGGREGATION_FIELD_RELEASE_DATE, - AGGREGATION_FIELD_CELL_MIXTURE, - AGGREGATION_FIELD_CELL_LINE, - # Store some extra properties for troublehooting (as this whole thing is). - "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", - "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", - "file_sets.libraries.analytes.samples.sample_sources.display_title", - AGGREGATION_FIELD_DONOR, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] + nonlocal aggregation_fields_for_troubleshooting uuid_records = [] query = normalized_results.get("query") + if isinstance(debug := normalized_results.get("debug"), dict): + normalized_results["debug"]["aggregation_fields_for_troubleshooting"] = ( + aggregation_fields_for_troubleshooting) files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] for first_item in normalized_results["items"]: first_property_name = first_item["name"] @@ -548,7 +556,7 @@ def annotate_with_uuids(normalized_results: dict): if not third_item["debug"].get("portal_hits"): third_item["debug"]["portal_hits"] = [] uuid_record = {"uuid": uuid} - for aggregation_field in aggregation_fields: + for aggregation_field in aggregation_fields_for_troubleshooting: aggregation_values = ", ".join(get_properties(file, aggregation_field)) uuid_record[aggregation_field] = aggregation_values or None if third_item["debug"].get("elasticsearch_hits"): @@ -593,25 +601,29 @@ def get_aggregation_fields(normalized_results: dict) -> List[str]: if not isinstance(aggregation_fields := normalized_results.get("debug", {}).get("aggregation_query_fields"), list): aggregation_fields = [] + else: + aggregation_fields = deepcopy(aggregation_fields) for aggregation_field in aggregation_fields: # Remove the ones we are not interested in reporting on. if aggregation_field not in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: aggregation_fields.remove(aggregation_field) return aggregation_fields - def get_unused_aggregation_fields(normalized_results: dict) -> List[str]: - # Returns all noted/important aggregation fields which are NOT actually being used by the query; - # we only are interested in ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, - # which is all of the possible sample-source/cell-line/donor aggregations. 
- global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR - unused_aggregation_fields = [] - aggregation_fields = get_aggregation_fields(normalized_results) - for aggregation_field in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: - if aggregation_field not in aggregation_fields: - unused_aggregation_fields.append(aggregation_field) - unused_aggregation_fields.append( - "file_sets.libraries.analytes.samples.sample_sources.display_title") - return unused_aggregation_fields + def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: + aggregation_fields_to_print = get_aggregation_fields(normalized_results) + if isinstance(aggregation_fields_for_troubleshooting := + normalized_results.get("debug", {}).get("aggregation_fields_for_troubleshooting"), list): + for aggregation_field_for_troubleshooting in aggregation_fields_for_troubleshooting: + if aggregation_field_for_troubleshooting not in aggregation_fields_to_print: + aggregation_fields_to_print.append(aggregation_field_for_troubleshooting) + aggregation_fields_to_not_print = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + for aggregation_field_to_not_print in aggregation_fields_to_not_print: + if aggregation_field_to_not_print in aggregation_fields_to_print: + aggregation_fields_to_print.remove(aggregation_field_to_not_print) + return aggregation_fields_to_print def get_aggregation_field_labels() -> dict: # Shorter/nicer names for aggregation fields of interest to print. @@ -623,16 +635,6 @@ def get_aggregation_field_labels() -> dict: "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" } - def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: - aggregation_fields = get_aggregation_fields(normalized_results) - unused_aggregation_fields = get_unused_aggregation_fields(normalized_results) - aggregation_fields_to_print = aggregation_fields + unused_aggregation_fields - # Look at get_aggregation_field_labels above for other/miscellaneous fields we want to print. 
- for aggregation_field_label in get_aggregation_field_labels(): - if aggregation_field_label not in aggregation_fields_to_print: - aggregation_fields_to_print.append(aggregation_field_label) - return aggregation_fields_to_print - def print_results(data: dict, parent_grouping_name: Optional[str] = None, parent_grouping_value: Optional[str] = None, From 18ceaa032c5bd248d28f1b181d6cc3420a71611c Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 00:05:25 -0500 Subject: [PATCH 57/78] refactoring /recent_files_summary endpoint --- src/encoded/browse.py | 64 ++++++++++++++++++++++++++++- src/encoded/recent_files_summary.py | 8 +++- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/src/encoded/browse.py b/src/encoded/browse.py index 3ad80270d..07935ab96 100644 --- a/src/encoded/browse.py +++ b/src/encoded/browse.py @@ -63,5 +63,67 @@ def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=F @view_config(route_name="recent_files_summary_endpoint", request_method=["GET"], effective_principals=Authenticated) @debug_log def recent_files_summary_endpoint(context, request): - results = recent_files_summary(request) + from encoded.endpoint_utils import request_arg_bool + text = request_arg_bool(request, "text") + results = recent_files_summary(request, troubleshooting=text) + if text: + import json + import os + from pyramid.response import Response + import sys + from encoded.recent_files_summary import print_normalized_aggregation_results + with capture_output_to_html_string() as captured_output: + print_normalized_aggregation_results(results, uuids=True, uuid_details=True) + text = captured_output.getvalue() + text = ansi_to_html(text) + return Response(f"
<pre>{text}</pre>
", content_type='text/html') return results + + +from contextlib import contextmanager +@contextmanager +def capture_output_to_html_string(): + from io import StringIO + from unittest.mock import patch as patch + print_original = print + captured_output = StringIO() + def captured_print(*args, **kwargs): + nonlocal captured_output + print_original(*args, **kwargs, file=captured_output) + with patch("builtins.print", captured_print): + yield captured_output + + +def ansi_to_html(text): + import re + ANSI_ESCAPE_RE = re.compile(r'\x1b\[(\d+)m') + ANSI_COLOR_MAP = { + '30': 'black', + '31': 'red', + '32': 'green', + '33': 'yellow', + '34': 'blue', + '35': 'magenta', + '36': 'cyan', + '37': 'white', + '90': 'bright_black', + '91': 'bright_red', + '92': 'bright_green', + '93': 'bright_yellow', + '94': 'bright_blue', + '95': 'bright_magenta', + '96': 'bright_cyan', + '97': 'bright_white', + } + def replace_ansi(match): + code = match.group(1) # Extract ANSI code + color = ANSI_COLOR_MAP.get(code) + if color: + return f'' + elif code == '0': # Reset code + return '' + return '' # Ignore unsupported codes + html_text = ANSI_ESCAPE_RE.sub(replace_ansi, text) + if html_text.count(' html_text.count(''): + html_text += '' + return f'
<pre>{html_text}</pre>
' diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 6a7ea52d5..640a6e959 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -38,7 +38,7 @@ BASE_SEARCH_QUERY = "/search/" -def recent_files_summary(request: PyramidRequest) -> dict: +def recent_files_summary(request: PyramidRequest, troubleshooting: bool = True) -> dict: """ This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, by default, info for files released withing the past three months grouped by release-date, @@ -65,6 +65,7 @@ def recent_files_summary(request: PyramidRequest) -> dict: released can be queried for using one or more status query arguments, e.g. status=uploaded. """ + global AGGREGATION_FIELD_RELEASE_DATE date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) @@ -85,6 +86,11 @@ def recent_files_summary(request: PyramidRequest) -> dict: raw = request_arg_bool(request, "raw") willrfix = request_arg_bool(request, "willrfix") + if troubleshooting is True: + debug = True + troubleshoot = True + troubleshoot_elasticsearch = True + def get_aggregation_field_grouping_cell_or_donor() -> List[str]: # This specializes the aggregation query to group first by the cell-line field, # and then alternatively (if a cell-line field does not exist) by the donor field. From 567f6857cdd1df0bbbd09021af3585c2d326d8d5 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 00:24:24 -0500 Subject: [PATCH 58/78] refactoring /recent_files_summary endpoint --- src/encoded/browse.py | 61 ++------------------------ src/encoded/recent_files_summary.py | 67 ++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 59 deletions(-) diff --git a/src/encoded/browse.py b/src/encoded/browse.py index 07935ab96..bbda20b0a 100644 --- a/src/encoded/browse.py +++ b/src/encoded/browse.py @@ -1,4 +1,4 @@ -from pyramid.httpexceptions import HTTPBadRequest, HTTPFound +from pyramid.httpexceptions import HTTPFound from pyramid.security import Authenticated from pyramid.view import view_config import structlog @@ -67,63 +67,8 @@ def recent_files_summary_endpoint(context, request): text = request_arg_bool(request, "text") results = recent_files_summary(request, troubleshooting=text) if text: - import json - import os from pyramid.response import Response - import sys - from encoded.recent_files_summary import print_normalized_aggregation_results - with capture_output_to_html_string() as captured_output: - print_normalized_aggregation_results(results, uuids=True, uuid_details=True) - text = captured_output.getvalue() - text = ansi_to_html(text) + from encoded.recent_files_summary import get_normalized_aggregation_results_as_html_for_troublehshooting + text = get_normalized_aggregation_results_as_html_for_troublehshooting(results) return Response(f"
<pre>{text}</pre>
", content_type='text/html') return results - - -from contextlib import contextmanager -@contextmanager -def capture_output_to_html_string(): - from io import StringIO - from unittest.mock import patch as patch - print_original = print - captured_output = StringIO() - def captured_print(*args, **kwargs): - nonlocal captured_output - print_original(*args, **kwargs, file=captured_output) - with patch("builtins.print", captured_print): - yield captured_output - - -def ansi_to_html(text): - import re - ANSI_ESCAPE_RE = re.compile(r'\x1b\[(\d+)m') - ANSI_COLOR_MAP = { - '30': 'black', - '31': 'red', - '32': 'green', - '33': 'yellow', - '34': 'blue', - '35': 'magenta', - '36': 'cyan', - '37': 'white', - '90': 'bright_black', - '91': 'bright_red', - '92': 'bright_green', - '93': 'bright_yellow', - '94': 'bright_blue', - '95': 'bright_magenta', - '96': 'bright_cyan', - '97': 'bright_white', - } - def replace_ansi(match): - code = match.group(1) # Extract ANSI code - color = ANSI_COLOR_MAP.get(code) - if color: - return f'' - elif code == '0': # Reset code - return '' - return '' # Ignore unsupported codes - html_text = ANSI_ESCAPE_RE.sub(replace_ansi, text) - if html_text.count(' html_text.count(''): - html_text += '' - return f'
<pre>{html_text}</pre>
' diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 640a6e959..4b24b191a 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,4 +1,5 @@ from pyramid.request import Request as PyramidRequest +from contextlib import contextmanager from copy import deepcopy from typing import Callable, List, Optional, Tuple, Union @@ -532,7 +533,7 @@ def annotate_with_uuids(normalized_results: dict): nonlocal aggregation_fields_for_troubleshooting uuid_records = [] query = normalized_results.get("query") - if isinstance(debug := normalized_results.get("debug"), dict): + if isinstance(normalized_results.get("debug"), dict): normalized_results["debug"]["aggregation_fields_for_troubleshooting"] = ( aggregation_fields_for_troubleshooting) files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] @@ -852,3 +853,67 @@ def print_hit_property_values(hit: dict, property_name: str, chars_null = "∅" print_results(normalized_results) + + +def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict): + with capture_output_to_html_string() as captured_output: + print_normalized_aggregation_results(normalized_results, uuids=True, uuid_details=True) + return captured_output.html + return + + +@contextmanager +def capture_output_to_html_string(): + from io import StringIO + from unittest.mock import patch as patch + print_original = print + captured_output = StringIO() + class CapturedOutput: # noqa + def __init__(self, captured_output: StringIO): + self._captured_output = captured_output + @property # noqa + def text(self): + return self._captured_output.getvalue() + @property # noqa + def html(self): + return ansi_to_html(self._captured_output.getvalue()) + def captured_print(*args, **kwargs): # noqa + nonlocal captured_output + print_original(*args, **kwargs, file=captured_output) + with patch("builtins.print", captured_print): + yield CapturedOutput(captured_output) + + +def ansi_to_html(text): + import re + ANSI_ESCAPE_RE = re.compile(r'\x1b\[(\d+)m') + ANSI_COLOR_MAP = { + '30': 'black', + '31': 'red', + '32': 'green', + '33': 'yellow', + '34': 'blue', + '35': 'magenta', + '36': 'cyan', + '37': 'white', + '90': 'bright_black', + '91': 'bright_red', + '92': 'bright_green', + '93': 'bright_yellow', + '94': 'bright_blue', + '95': 'bright_magenta', + '96': 'bright_cyan', + '97': 'bright_white', + } + def replace_ansi(match): # noqa + code = match.group(1) + color = ANSI_COLOR_MAP.get(code) + if color: + return f'<span style="color:{color};">' + elif code == '0': + return '</span>' + return '' + html_text = ANSI_ESCAPE_RE.sub(replace_ansi, text) + if html_text.count('<span') > html_text.count('</span>'): + html_text += '</span>' + return f'
<pre>{html_text}</pre>
' From 2f45e8cfc419408c352565f51484ab6ccfbb8683 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 10:56:28 -0500 Subject: [PATCH 59/78] refactoring /recent_files_summary endpoint --- src/encoded/recent_files_summary.py | 39 ++++++++++++++++--------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index 4b24b191a..a72830f7c 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -1,6 +1,7 @@ -from pyramid.request import Request as PyramidRequest from contextlib import contextmanager from copy import deepcopy +from pyramid.request import Request as PyramidRequest +import re from typing import Callable, List, Optional, Tuple, Union from dcicutils.misc_utils import normalize_spaces @@ -885,8 +886,6 @@ def captured_print(*args, **kwargs): # noqa def ansi_to_html(text): - import re - ANSI_ESCAPE_RE = re.compile(r'\x1b\[(\d+)m') ANSI_COLOR_MAP = { '30': 'black', '31': 'red', '32': 'green', '33': 'yellow', '34': 'blue', '35': 'magenta', '36': 'cyan', '37': 'white', '90': 'bright_black', '91': 'bright_red', '92': 'bright_green', '93': 'bright_yellow', '94': 'bright_blue', '95': 'bright_magenta', '96': 'bright_cyan', '97': 'bright_white', } + ANSI_ESCAPE_RE = re.compile(r'\x1b\[([0-9;]*)m') + bold_active = False def replace_ansi(match): # noqa - code = match.group(1) - color = ANSI_COLOR_MAP.get(code) - if color: - return f'<span style="color:{color};">' - elif code == '0': - return '</span>' - return '' - html_text = ANSI_ESCAPE_RE.sub(replace_ansi, text) - if html_text.count('<span') > html_text.count('</span>'): - html_text += '</span>' - return f'
<pre>{html_text}</pre>
' + nonlocal bold_active + codes = match.group(1).split(';') # Split multiple codes (e.g., "1;31") + html_parts = [] + for code in codes: + if code == '1': # Bold + if not bold_active: # Activate bold + html_parts.append('<b>') + bold_active = True + elif code in ANSI_COLOR_MAP: # Colors + color = ANSI_COLOR_MAP[code] + html_parts.append(f'<span style="color:{color};">') + elif code == '0': # Reset + if bold_active: + html_parts.append('</b>') + bold_active = False + html_parts.append('</span>') # Close color + return ''.join(html_parts) + text_with_html = ANSI_ESCAPE_RE.sub(replace_ansi, text) + if bold_active: + text_with_html += '</b>' + return f'
<pre>{text_with_html}</pre>
' From 203228ca2e8175584a626819f3655c3ddb895f46 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 15:57:09 -0500 Subject: [PATCH 60/78] refactoring /recent_files_summary endpoint --- poetry.lock | 17 ++++++++++++++++- pyproject.toml | 1 + src/encoded/endpoint_utils.py | 22 ++++++++++++++++++++++ src/encoded/recent_files_summary.py | 2 +- 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 60da629ed..cad476ce2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4015,6 +4015,21 @@ setuptools = "*" [package.extras] testing = ["pytest", "pytest-cov"] +[[package]] +name = "termcolor" +version = "2.5.0" +description = "ANSI color formatting for output in terminal" +category = "main" +optional = false +python-versions = ">=3.9" +files = [ + {file = "termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8"}, + {file = "termcolor-2.5.0.tar.gz", hash = "sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4475,4 +4490,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.9.1,<3.13" -content-hash = "85d3cfc258bd495fab8caf35d943f40fb9e3c7114fcd59f1661d380fe15a0c09" +content-hash = "72b303a0100150cc88c75fceb3b9ab1f2a5123686a6ef75bf8d2e4320cb0a6a9" diff --git a/pyproject.toml b/pyproject.toml index ad7134d24..6f6938ff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,6 +99,7 @@ structlog = ">=19.2.0,<20" subprocess-middleware = "^0.3.0" supervisor = "^4.2.4" # Useful for picking apart pyproject.toml +termcolor = "^2.4.0" toml = ">=0.10.1,<1" tqdm = "^4.59.0" transaction = "^3.0.0" diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoint_utils.py index 832868f46..ed7f9f640 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoint_utils.py @@ -2,6 +2,7 @@ from datetime import date, datetime from dateutil.relativedelta import relativedelta from pyramid.request import Request as PyramidRequest +from termcolor import colored from typing import Any, List, Optional, Tuple, Union from urllib.parse import parse_qs, urlencode from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string @@ -277,3 +278,24 @@ def get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: return sorted(values) if (sort is True) else values break return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback]) + + +def terminal_color(value: str, + color: Optional[str] = None, + dark: bool = False, + bold: bool = False, + underline: bool = False, + nocolor: bool = False) -> str: + # This is used only for troubleshooting by + if nocolor is True: + return value + attributes = [] + if dark is True: + attributes.append("dark") + if bold is True: + attributes.append("bold") + if underline is True: + attributes.append("underline") + if isinstance(color, str) and color: + return colored(value, color.lower(), attrs=attributes) + return colored(value, attrs=attributes) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py index a72830f7c..06ecd3069 100644 --- a/src/encoded/recent_files_summary.py +++ b/src/encoded/recent_files_summary.py @@ -599,7 +599,7 @@ def print_normalized_aggregation_results(normalized_results: dict, """ For deveopment/troubleshooting only ... 
""" - from hms_utils.terminal_utils import terminal_color + from encoded.endpoint_utils import terminal_color def get_aggregation_fields(normalized_results: dict) -> List[str]: # Returns all noted/important aggregation fields which ARE actually being used by the query; From 82daa17f46bdaa0e5515165e9029a5b29a6d3161 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 17:11:58 -0500 Subject: [PATCH 61/78] refactoring /recent_files_summary endpoint --- src/encoded/browse.py | 18 +- .../{ => endpoints}/elasticsearch_utils.py | 0 src/encoded/{ => endpoints}/endpoint_utils.py | 56 -- .../recent_files_summary.py | 490 +++++++++ .../recent_files_summary_fields.py | 16 + .../recent_files_summary_troubleshooting.py | 504 ++++++++++ src/encoded/recent_files_summary.py | 930 ------------------ src/encoded/tests/test_elasticsearch_utils.py | 14 +- src/encoded/tests/test_endpoint_utils.py | 5 +- 9 files changed, 1026 insertions(+), 1007 deletions(-) rename src/encoded/{ => endpoints}/elasticsearch_utils.py (100%) rename src/encoded/{ => endpoints}/endpoint_utils.py (81%) create mode 100644 src/encoded/endpoints/recent_files_summary/recent_files_summary.py create mode 100644 src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py create mode 100644 src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py delete mode 100644 src/encoded/recent_files_summary.py diff --git a/src/encoded/browse.py b/src/encoded/browse.py index bbda20b0a..98eb0f37e 100644 --- a/src/encoded/browse.py +++ b/src/encoded/browse.py @@ -6,7 +6,7 @@ from urllib.parse import urlencode from snovault.search.search import search from snovault.util import debug_log -from encoded.recent_files_summary import recent_files_summary +from encoded.endpoints.recent_files_summary.recent_files_summary import recent_files_summary_endpoint log = structlog.getLogger(__name__) @@ -14,7 +14,7 @@ def includeme(config): config.add_route('browse', '/browse{slash:/?}') - config.add_route("recent_files_summary_endpoint", "/recent_files_summary") + config.add_route("recent_files_summary", "/recent_files_summary") config.scan(__name__) @@ -60,15 +60,7 @@ def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=F return search(context, request, search_type, return_generator, forced_type="Browse") -@view_config(route_name="recent_files_summary_endpoint", request_method=["GET"], effective_principals=Authenticated) +@view_config(route_name="recent_files_summary", request_method=["GET"], effective_principals=Authenticated) @debug_log -def recent_files_summary_endpoint(context, request): - from encoded.endpoint_utils import request_arg_bool - text = request_arg_bool(request, "text") - results = recent_files_summary(request, troubleshooting=text) - if text: - from pyramid.response import Response - from encoded.recent_files_summary import get_normalized_aggregation_results_as_html_for_troublehshooting - text = get_normalized_aggregation_results_as_html_for_troublehshooting(results) - return Response(f"
<pre>{text}</pre>
", content_type='text/html') - return results +def recent_files_summary(context, request): + return recent_files_summary_endpoint(context, request) diff --git a/src/encoded/elasticsearch_utils.py b/src/encoded/endpoints/elasticsearch_utils.py similarity index 100% rename from src/encoded/elasticsearch_utils.py rename to src/encoded/endpoints/elasticsearch_utils.py diff --git a/src/encoded/endpoint_utils.py b/src/encoded/endpoints/endpoint_utils.py similarity index 81% rename from src/encoded/endpoint_utils.py rename to src/encoded/endpoints/endpoint_utils.py index ed7f9f640..b518e3ea3 100644 --- a/src/encoded/endpoint_utils.py +++ b/src/encoded/endpoints/endpoint_utils.py @@ -2,7 +2,6 @@ from datetime import date, datetime from dateutil.relativedelta import relativedelta from pyramid.request import Request as PyramidRequest -from termcolor import colored from typing import Any, List, Optional, Tuple, Union from urllib.parse import parse_qs, urlencode from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string @@ -244,58 +243,3 @@ def deconstruct_query_string(query_string: str) -> dict: query_string = query_string.replace("%21=", "=%21") return {key: value[0] if len(value) == 1 else value for key, value in parse_qs(query_string).items()} return {} - - -def get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: bool = False) -> List[Any]: - """ - TODO: Move this to dcicutils. Maybe much of the above too. - Returns the values of the given property name within the given dictionary as a list, where the - given property name can be a dot-separated list of property names, which indicate a path into - nested dictionaries within the given dictionary; and - where if any of the elements within - the path are lists then we iterate through each, collecting the values for each and including - each within the list of returned values. 
- """ - if isinstance(data, dict) and isinstance(name, str) and name: - if keys := name.split("."): - nkeys = len(keys) ; key_index_max = nkeys - 1 # noqa - for key_index in range(nkeys): - if (value := data.get(keys[key_index], None)) is not None: - if key_index == key_index_max: - return [value] if not isinstance(value, list) else value - elif isinstance(value, dict): - data = value - continue - elif isinstance(value, list) and value and ((sub_key_index := key_index + 1) < nkeys): - sub_key = ".".join(keys[sub_key_index:]) - values = [] - for element in value: - if isinstance(element_value := get_properties(element, sub_key), list): - for element_value_item in element_value: - if (element_value_item is not None) and (element_value_item not in values): - values.append(element_value_item) - elif (element_value is not None) and (element_value not in values): - values.append(element_value) - return sorted(values) if (sort is True) else values - break - return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback]) - - -def terminal_color(value: str, - color: Optional[str] = None, - dark: bool = False, - bold: bool = False, - underline: bool = False, - nocolor: bool = False) -> str: - # This is used only for troubleshooting by - if nocolor is True: - return value - attributes = [] - if dark is True: - attributes.append("dark") - if bold is True: - attributes.append("bold") - if underline is True: - attributes.append("underline") - if isinstance(color, str) and color: - return colored(value, color.lower(), attrs=attributes) - return colored(value, attrs=attributes) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py new file mode 100644 index 000000000..f90d577c6 --- /dev/null +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py @@ -0,0 +1,490 @@ +from copy import deepcopy +from pyramid.request import Request as PyramidRequest, Response as PyramidResponse +from typing import List, Optional +from dcicutils.misc_utils import normalize_spaces +from encoded.endpoints.elasticsearch_utils import ( + add_debugging_to_elasticsearch_aggregation_query, + create_elasticsearch_aggregation_query, + merge_elasticsearch_aggregation_results, + normalize_elasticsearch_aggregation_results, + prune_elasticsearch_aggregation_results, + sort_normalized_aggregation_results, + AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE) +from encoded.endpoints.endpoint_utils import ( + request_arg, request_args, request_arg_bool, request_arg_int, + create_query_string, deconstruct_query_string, + get_date_range_for_month, parse_date_range_related_arguments) +from encoded.endpoints.recent_files_summary.recent_files_summary_fields import ( + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR) +from encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting import ( + add_info_for_troubleshooting, + get_normalized_aggregation_results_as_html_for_troublehshooting) +from snovault.search.search import search as snovault_search +from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq + +QUERY_FILE_TYPES = ["OutputFile"] +QUERY_FILE_STATUSES = ["released"] +QUERY_FILE_CATEGORIES = ["!Quality Control"] +QUERY_RECENT_MONTHS = 3 +QUERY_INCLUDE_CURRENT_MONTH = True +BASE_SEARCH_QUERY = "/search/" 
+ + +def recent_files_summary_endpoint(context, request): + # This text=true support is purely for troubleshooting purposes; it dumps + # terminal-like formatted output for the results returned by the query. + text = request_arg_bool(request, "text") + results = recent_files_summary(request, troubleshooting=text) + if text: + results = get_normalized_aggregation_results_as_html_for_troublehshooting(results) + results = PyramidResponse(f"
<pre>{results}</pre>
", content_type='text/html') + return results + + +def recent_files_summary(request: PyramidRequest, troubleshooting: bool = True) -> dict: + """ + This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, + by default, info for files released withing the past three months grouped by release-date, + cell-line or donor, and file-description. The specific fields used for these groupings are: + + - release-date: file_status_tracking.released + - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code + - donor: donors.display_title + - file-dsecription: release_tracker_description + + Note that release_tracker_description is a newer (2024-12) + calculated property - see PR-298 (branch: sn_file_release_tracker). + + By default the current (assuminging partial) month IS included, so we really return info for + the past FULL three months plus for whatever time has currently elapsed for the current month. + Use pass the include_current_month=false query argument to NOT include the current month. + + The number of months of data can be controlled using the nmonths query argument, e.g. nmonths=6. + + A specific date range can also be passed in e.g. using from_date=2024-08-01 and thru_date=2024-10-31. + + For testing purposes, a date field other than the default file_status_tracking.released can + also be specified using the date_property_name query argument. And file statuses other than + released can be queried for using one or more status query arguments, e.g. status=uploaded. + """ + + date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) + max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) + include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True)) + include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) + nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell", True)) # N.B. default True + nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) + nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor")) + favor_donor = request_arg_bool(request, "favor_donor") + multi = request_arg_bool(request, "multi") + nosort = request_arg_bool(request, "nosort") + legacy = request_arg_bool(request, "legacy") + debug = request_arg_bool(request, "debug") + debug_query = request_arg_bool(request, "debug_query") + troubleshoot = request_arg_bool(request, "troubleshoot") + troubleshoot_elasticsearch = request_arg_bool(request, "troubleshoot_elasticsearch") + raw = request_arg_bool(request, "raw") + willrfix = request_arg_bool(request, "willrfix") + + if troubleshooting is True: + debug = True + troubleshoot = True + troubleshoot_elasticsearch = True + + def get_aggregation_field_grouping_cell_or_donor() -> List[str]: + # This specializes the aggregation query to group first by the cell-line field, + # and then alternatively (if a cell-line field does not exist) by the donor field. + # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively + # look first for the donor field and then secondarily for the cell-line field. 
+ nonlocal nocells, nomixtures, nodonors, favor_donor + aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) + if nocells: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_LINE) + if nomixtures: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_MIXTURE) + if nodonors: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) + if favor_donor: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) + aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR) + return aggregation_field_grouping_cell_or_donor + + def create_base_query_arguments(request: PyramidRequest) -> dict: + + global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES + + types = request_args(request, "type", QUERY_FILE_TYPES) + statuses = request_args(request, "status", QUERY_FILE_STATUSES) + categories = request_args(request, "category", QUERY_FILE_CATEGORIES) + + base_query_arguments = { + "type": types if types else None, + "status": statuses if statuses else None, + "data_category": categories if categories else None + } + + return {key: value for key, value in base_query_arguments.items() if value is not None} + + def create_query_arguments(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: + + global BASE_SEARCH_QUERY, QUERY_RECENT_MONTHS, QUERY_INCLUDE_CURRENT_MONTH + nonlocal date_property_name + + recent_months = request_arg_int(request, "nmonths", request_arg_int(request, "months", QUERY_RECENT_MONTHS)) + from_date = request_arg(request, "from_date") + thru_date = request_arg(request, "thru_date") + include_current_month = request_arg_bool(request, "include_current_month", QUERY_INCLUDE_CURRENT_MONTH) + + from_date, thru_date = parse_date_range_related_arguments(from_date, thru_date, nmonths=recent_months, + include_current_month=include_current_month, + strings=True) + query_arguments = { + f"{date_property_name}.from": from_date if from_date else None, + f"{date_property_name}.to": thru_date if from_date else None + } + + if isinstance(base_query_arguments, dict): + query_arguments = {**base_query_arguments, **query_arguments} + return query_arguments + + def create_query(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: + query_arguments = create_query_arguments(request, base_query_arguments) + query_string = create_query_string(query_arguments) + return f"{BASE_SEARCH_QUERY}?{query_string}" + + def create_aggregation_query(aggregation_fields: List[str]) -> dict: + + nonlocal date_property_name, max_buckets, include_missing, favor_donor, troubleshoot_elasticsearch + + aggregations = [] + if not isinstance(aggregation_fields, list): + aggregation_fields = [aggregation_fields] + for item in aggregation_fields: + if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): + aggregations.append(item) + if not aggregations: + return {} + + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, multi + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + elif field == AGGREGATION_FIELD_CELL_LINE: + # Note how we prefix the result with the aggregation field name; + # this is so later we can tell which grouping/field was matched; + # see 
fixup_names_values_for_normalized_results for this fixup. + script = "" + for aggregation_field_grouping_index in range(len(aggregation_field_grouping_cell_or_donor)): + aggregation_field = aggregation_field_grouping_cell_or_donor[aggregation_field_grouping_index] + if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" + # Note that if there are multiple values for the aggregation field just the "first" one will be chosen; + # where "first" means which was indexed first, which from an application POV is kind of arbitrary. + # If we want to make it more deterministic we could order the results (say) alphabetically like so: + # def value = doc['embedded.{aggregation_field}.raw'].stream().min((a, b) -> a.compareTo(b)).get(); + # return '{aggregation_field}:' + value; + # OR, if we actually want to aggregation on ALL values we could collect the results and return all like so: + # def values = []; + # for (value in doc['embedded.{aggregation_field}.raw']) { + # values.add('{aggregation_field}:' + value); + # } + # return values; + # But then we'd get double counting and so on. We are told in any case that these groups should be distinct. + if not multi: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; + }} + """ + else: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + def values = []; + for (value in doc['embedded.{aggregation_field}.raw']) {{ + values.add('{aggregation_field}:' + value); + }} + return values; + }} + """ + script += f""" + else {{ + return 'unknown'; + }} + """ + return { + "terms": { + "script": { + "source": normalize_spaces(script), + "lang": "painless" + }, + "size": max_buckets + } + } + + def create_field_filter(field: str) -> Optional[dict]: # noqa + nonlocal aggregation_field_grouping_cell_or_donor + if field == AGGREGATION_FIELD_CELL_LINE: + filter = {"bool": {"should": [], "minimum_should_match": 1}} + for aggregation_field in aggregation_field_grouping_cell_or_donor: + filter["bool"]["should"].append({"exists": { "field": f"embedded.{aggregation_field}.raw"}}) + return filter + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation, + create_field_filter=create_field_filter) + + if troubleshoot_elasticsearch: + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + + return aggregation_query[date_property_name] + + def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: + + nonlocal date_property_name, max_buckets, include_missing + + aggregations = [] + if not isinstance(aggregation_fields, list): + aggregation_fields = [aggregation_fields] + for item in aggregation_fields: + if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): + aggregations.append(item) + if not aggregations: + return {} + + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal date_property_name + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + 
missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation) + + if troubleshoot_elasticsearch: + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + + return aggregation_query[date_property_name] + + def execute_aggregation_query(request: PyramidRequest, query: str, aggregation_query: dict) -> str: + query += "&from=0&limit=0" # needed for aggregation query to not return the actual/individual item results. + request = snovault_make_search_subreq(request, path=query, method="GET") + results = snovault_search(None, request, custom_aggregations=aggregation_query) + return results + + def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: + nonlocal aggregation_field_grouping_cell_or_donor + if isinstance(normalized_results, dict): + if isinstance(value := normalized_results.get("value"), str): + if ((separator_index := value.find(":")) > 0) and (value_prefix := value[0:separator_index]): + if value_prefix in aggregation_field_grouping_cell_or_donor: + if value := value[separator_index + 1:]: + normalized_results["name"] = value_prefix + normalized_results["value"] = value + if isinstance(items := normalized_results.get("items"), list): + for element in items: + fixup_names_values_for_normalized_results(element) + + def add_queries_to_normalized_results(normalized_results: dict, base_query_arguments: dict) -> None: + global BASE_SEARCH_QUERY + nonlocal date_property_name, willrfix + if isinstance(normalized_results, dict): + if name := normalized_results.get("name"): + if value := normalized_results.get("value"): + if name == date_property_name: + # Special case for date value which is just year/month (e.g. 2024-12); + # we want to turn this into a date range query for the month; actually + # this is not a special case, this is the NORMAL case we are dealing with. + # from_date, thru_date = parse_date_range_related_arguments(value, None, nmonths=0, strings=True) + from_date, thru_date = get_date_range_for_month(value, strings=True) + if from_date and thru_date: + base_query_arguments = {**base_query_arguments, + f"{name}.from": from_date, f"{name}.to": thru_date} + else: + base_query_arguments = {**base_query_arguments, name: value} + if willrfix: + if name == AGGREGATION_FIELD_CELL_LINE: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + elif name == AGGREGATION_FIELD_DONOR: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + base_query_arguments[AGGREGATION_FIELD_CELL_LINE] = AGGREGATION_NO_VALUE + normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) + if isinstance(items := normalized_results.get("items"), list): + for element in items: + add_queries_to_normalized_results(element, base_query_arguments) + + aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor() + # The base_query_arguments does not contain the from/thru dates as this is used; + # this is used to construct the query-string for the individually grouped items which + # will have the from/thru dates specifically representing their place within the group. 
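For a month-only bucket value like "2024-12", the get_date_range_for_month call above widens it to the whole month, so the per-item query picks up a from/to pair; a small sketch of the expected behavior:

    from_date, thru_date = get_date_range_for_month("2024-12", strings=True)
    # from_date == "2024-12-01" and thru_date == "2024-12-31", which ends up in the
    # item query as: file_status_tracking.released.from=2024-12-01
    #                &file_status_tracking.released.to=2024-12-31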
+ base_query_arguments = create_base_query_arguments(request) + query = create_query(request, base_query_arguments) + + if not legacy: + aggregate_by_cell_line_property_name = "aggregate_by_cell_line" + aggregate_by_cell_line = [ + date_property_name, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregation_query = { + aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line) + } + else: + aggregate_by_cell_line_property_name = "aggregate_by_cell_line" + aggregate_by_cell_line = [ + date_property_name, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregate_by_donor_property_name = "aggregate_by_donor" + aggregate_by_donor = [ + date_property_name, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + aggregation_query = { + aggregate_by_cell_line_property_name: create_aggregation_query_legacy(aggregate_by_cell_line), + aggregate_by_donor_property_name: create_aggregation_query_legacy(aggregate_by_donor) + } + + if debug_query: + return { + "query": query, + "query_arguments": deconstruct_query_string(query), + "aggregation_query_fields": [ + AGGREGATION_FIELD_RELEASE_DATE, + *get_aggregation_field_grouping_cell_or_donor(), + AGGREGATION_FIELD_FILE_DESCRIPTOR + ], + "aggregation_query": aggregation_query + } + + raw_results = execute_aggregation_query(request, query, aggregation_query) + + if raw: + # For debugging/troubleshooting only: if raw=true then return the raw ElasticSearch results. + # And note that unless we remove the @id property we get redirected to the URL in this field, + # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control + # &file_status_tracking.released.from=2024-09-30 + # &file_status_tracking.released.to=2024-12-31&from=0&limit=0 + if "@id" in raw_results: + del raw_results["@id"] + return raw_results + + if not (raw_results := raw_results.get("aggregations")): + return {} + + if debug: + raw_results = deepcopy(raw_results) # otherwise may be overwritten by below + + prune_elasticsearch_aggregation_results(raw_results) + + if not legacy: + aggregation_results = raw_results.get(aggregate_by_cell_line_property_name) + else: + aggregation_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), + raw_results.get(aggregate_by_donor_property_name)) + + # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items, + # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3 + # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005), + # then its doc_count will NOT be counted TWICE. This creates a situation where it might LOOK like the counts + # are WRONG in the MERGED result set (returned via merge_elasticsearch_aggregation_results), where the outer + # item count may be less than the sum of the individual counts within each sub-group. For example, the below result + # shows a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the other + # in the HG005 group; this would be because the same unique file has a cell_line.code of both HG00438 and HG005.
+ # { + # "meta": { "field_name": "file_status_tracking.released" }, + # "buckets": [ + # { + # "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 1, + # "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + # "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, + # "buckets": [ + # { "key": "HG00438", "doc_count": 1, + # "release_tracker_description": { + # "meta": { "field_name": "release_tracker_description" }, + # "buckets": [ + # { "key": "WGS Illumina NovaSeq X bam", "doc_count": 1 }, + # ] + # } + # }, + # { "key": "HG005", "doc_count": 1, + # "release_tracker_description": { + # "meta": { "field_name": "release_tracker_description" }, + # "buckets": [ + # { "key": "Fiber-seq PacBio Revio bam", "doc_count": 1 } + # ] + # } + # } + # ] + # } + # } + # ] + # } + + if debug: + additional_properties = { + "debug": { + "query": query, + "query_arguments": deconstruct_query_string(query), + "aggregation_query_fields": [ + AGGREGATION_FIELD_RELEASE_DATE, + *get_aggregation_field_grouping_cell_or_donor(), + AGGREGATION_FIELD_FILE_DESCRIPTOR + ], + "aggregation_query": aggregation_query, + "raw_results": raw_results, + "aggregation_results": deepcopy(aggregation_results) + } + } + else: + additional_properties = None + + normalized_results = normalize_elasticsearch_aggregation_results(aggregation_results, + additional_properties=additional_properties, + remove_empty_items=not include_missing) + if not legacy: + fixup_names_values_for_normalized_results(normalized_results) + if include_queries: + add_queries_to_normalized_results(normalized_results, base_query_arguments) + normalized_results["query"] = query + + if not nosort: + # We can sort on the aggregations by level; outermost/left to innermost/right. + # In our case the outermost is the date aggregation so sort that by the key value, + # e.g. 2024-12, descending; and the rest of the inner levels by the default + # sorting which is by aggregation count descending and secondarily by the key value. + sort_normalized_aggregation_results(normalized_results, ["-key", "default"]) + + if troubleshoot: + add_info_for_troubleshooting(normalized_results, request) + + return normalized_results diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py new file mode 100644 index 000000000..c7a9e6a16 --- /dev/null +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py @@ -0,0 +1,16 @@ +# These are all the possible fields on which the /recent_files_summary endpoint can aggregate. +# Various flags modify the specifics, for experimentation, troubleshooting, and possible future changes. + +AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released" +# FYI FWIW: there is also file_sets.libraries.analytes.samples.sample_sources.display_title; +# and note that sometimes file_sets.libraries.analytes.samples.sample_sources.code does not exist.
+AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code" +AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" +AGGREGATION_FIELD_DONOR = "donors.display_title" +AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description" + +AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR = [ + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_DONOR +] diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py new file mode 100644 index 000000000..7fbc5d4ea --- /dev/null +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -0,0 +1,504 @@ +from contextlib import contextmanager +from copy import deepcopy +from pyramid.request import Request as PyramidRequest +import re +from termcolor import colored +from typing import Any, Callable, List, Optional, Tuple, Union +from encoded.endpoints.endpoint_utils import parse_datetime_string +from encoded.endpoints.recent_files_summary.recent_files_summary_fields import ( + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR) + + +def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None: + + def get_files(files, property_name, property_value, map_property_value = None): + found = [] + for file in files: + if properties := _get_properties(file, property_name): + if callable(map_property_value): + mapped_properties = [] + for value in properties: + mapped_properties.append(map_property_value(value)) + properties = mapped_properties + if property_value in properties: + found.append(file) + return found + + def map_date_property_value(value): + if date_value := parse_datetime_string(value): + return f"{date_value.year}-{date_value.month:02}" + return value + + def count_uuid(uuid_records: List[dict], uuid: str) -> int: + count = 0 + for uuid_record in uuid_records: + if uuid_record.get("uuid") == uuid: + count += 1 + return count + + def dedup_list(data: list) -> list: # noqa + return list(dict.fromkeys(data)) if isinstance(data, list) else [] + + aggregation_fields_for_troubleshooting = dedup_list([ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_CELL_LINE, + # Store some extra properties for troublehooting (as this whole thing is). 
diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
new file mode 100644
index 000000000..7fbc5d4ea
--- /dev/null
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -0,0 +1,504 @@
+from contextlib import contextmanager
+from copy import deepcopy
+from pyramid.request import Request as PyramidRequest
+import re
+from termcolor import colored
+from typing import Any, Callable, List, Optional, Tuple, Union
+from encoded.endpoints.endpoint_utils import parse_datetime_string
+from encoded.endpoints.recent_files_summary.recent_files_summary_fields import (
+    AGGREGATION_FIELD_RELEASE_DATE,
+    AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR,
+    AGGREGATION_FIELD_CELL_LINE,
+    AGGREGATION_FIELD_CELL_MIXTURE,
+    AGGREGATION_FIELD_DONOR,
+    AGGREGATION_FIELD_FILE_DESCRIPTOR)
+
+
+def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None:
+
+    def get_files(files, property_name, property_value, map_property_value = None):
+        found = []
+        for file in files:
+            if properties := _get_properties(file, property_name):
+                if callable(map_property_value):
+                    mapped_properties = []
+                    for value in properties:
+                        mapped_properties.append(map_property_value(value))
+                    properties = mapped_properties
+                if property_value in properties:
+                    found.append(file)
+        return found
+
+    def map_date_property_value(value):
+        if date_value := parse_datetime_string(value):
+            return f"{date_value.year}-{date_value.month:02}"
+        return value
+
+    def count_uuid(uuid_records: List[dict], uuid: str) -> int:
+        count = 0
+        for uuid_record in uuid_records:
+            if uuid_record.get("uuid") == uuid:
+                count += 1
+        return count
+
+    def dedup_list(data: list) -> list:  # noqa
+        return list(dict.fromkeys(data)) if isinstance(data, list) else []
+
+    aggregation_fields_for_troubleshooting = dedup_list([
+        AGGREGATION_FIELD_RELEASE_DATE,
+        AGGREGATION_FIELD_CELL_MIXTURE,
+        AGGREGATION_FIELD_CELL_LINE,
+        # Store some extra properties for troubleshooting (as this whole thing is).
+        "file_sets.libraries.analytes.samples.sample_sources.display_title",
+        AGGREGATION_FIELD_DONOR,
+        AGGREGATION_FIELD_FILE_DESCRIPTOR
+    ])
+
+    def annotate_with_uuids(normalized_results: dict):
+        nonlocal aggregation_fields_for_troubleshooting
+        uuid_records = []
+        query = normalized_results.get("query")
+        if isinstance(normalized_results.get("debug"), dict):
+            normalized_results["debug"]["aggregation_fields_for_troubleshooting"] = (
+                aggregation_fields_for_troubleshooting)
+        files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"]
+        for first_item in normalized_results["items"]:
+            first_property_name = first_item["name"]
+            first_property_value = first_item["value"]
+            for second_item in first_item["items"]:
+                second_property_name = second_item["name"]
+                second_property_value = second_item["value"]
+                for third_item in second_item["items"]:
+                    third_property_name = third_item["name"]
+                    third_property_value = third_item["value"]
+                    if debug_elasticsearch_hits := third_item.get("debug_elasticsearch_hits"):
+                        if not third_item.get("debug"):
+                            third_item["debug"] = {}
+                        third_item["debug"]["elasticsearch_hits"] = debug_elasticsearch_hits
+                        third_item["debug"]["elasticsearch_hits"].sort()
+                        del third_item["debug_elasticsearch_hits"]
+                    if first_files := get_files(files, first_property_name, first_property_value,
+                                                map_property_value=map_date_property_value):
+                        if second_files := get_files(first_files, second_property_name, second_property_value):
+                            if third_files := get_files(second_files, third_property_name, third_property_value):
+                                for file in third_files:
+                                    if isinstance(uuid := file.get("uuid"), str):
+                                        if not third_item.get("debug"):
+                                            third_item["debug"] = {}
+                                        if not third_item["debug"].get("portal_hits"):
+                                            third_item["debug"]["portal_hits"] = []
+                                        uuid_record = {"uuid": uuid}
+                                        for aggregation_field in aggregation_fields_for_troubleshooting:
+                                            aggregation_values = ", ".join(_get_properties(file, aggregation_field))
+                                            uuid_record[aggregation_field] = aggregation_values or None
+                                        if third_item["debug"].get("elasticsearch_hits"):
+                                            uuid_record["elasticsearch_counted"] = \
+                                                uuid in third_item["debug"]["elasticsearch_hits"]
+                                        third_item["debug"]["portal_hits"].append(uuid_record)
+                                        uuid_records.append(uuid_record)
+                    if third_item.get("debug", {}).get("portal_hits"):
+                        third_item["debug"]["portal_hits"].sort(key=lambda item: item.get("uuid"))
+
+        for uuid_record in uuid_records:
+            if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1:
+                uuid_record["duplicative"] = count
+
+    try:
+        annotate_with_uuids(normalized_results)
+    except Exception:
+        pass
+
+
+def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict):
+    with _capture_output_to_html_string() as captured_output:
+        print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True)
+        return captured_output.html
+
+
+def print_normalized_aggregation_results_for_troubleshooting(normalized_results: dict,
+                                                             title: Optional[str] = None,
+                                                             parent_grouping_name: Optional[str] = None,
+                                                             parent_grouping_value: Optional[str] = None,
+                                                             uuids: bool = False,
+                                                             uuid_details: bool = False,
+                                                             nobold: bool = False,
+                                                             checks: bool = False,
+                                                             query: bool = False,
+                                                             verbose: bool = False) -> None:
+
+    """
+    For development/troubleshooting only ...
+ """ + def get_aggregation_fields(normalized_results: dict) -> List[str]: + # Returns all noted/important aggregation fields which ARE actually being used by the query; + # we only are interested in ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + # which is all of the possible sample-source/cell-line/donor aggregations. + if not isinstance(aggregation_fields := + normalized_results.get("debug", {}).get("aggregation_query_fields"), list): + aggregation_fields = [] + else: + aggregation_fields = deepcopy(aggregation_fields) + for aggregation_field in aggregation_fields: + # Remove the ones we are not interested in reporting on. + if aggregation_field not in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: + aggregation_fields.remove(aggregation_field) + return aggregation_fields + + def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: + aggregation_fields_to_print = get_aggregation_fields(normalized_results) + if isinstance(aggregation_fields_for_troubleshooting := + normalized_results.get("debug", {}).get("aggregation_fields_for_troubleshooting"), list): + for aggregation_field_for_troubleshooting in aggregation_fields_for_troubleshooting: + if aggregation_field_for_troubleshooting not in aggregation_fields_to_print: + aggregation_fields_to_print.append(aggregation_field_for_troubleshooting) + aggregation_fields_to_not_print = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + for aggregation_field_to_not_print in aggregation_fields_to_not_print: + if aggregation_field_to_not_print in aggregation_fields_to_print: + aggregation_fields_to_print.remove(aggregation_field_to_not_print) + return aggregation_fields_to_print + + def get_aggregation_field_labels() -> dict: + # Shorter/nicer names for aggregation fields of interest to print. 
+ return { + AGGREGATION_FIELD_CELL_MIXTURE: "sample-sources", + AGGREGATION_FIELD_CELL_LINE: "cell-lines", + AGGREGATION_FIELD_DONOR: "donors", + "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" + } + + def terminal_color(value: str, + color: Optional[str] = None, + dark: bool = False, + bold: bool = False, + underline: bool = False, + nocolor: bool = False) -> str: + # This is used only for troubleshooting by + if nocolor is True: + return value + attributes = [] + if dark is True: + attributes.append("dark") + if bold is True: + attributes.append("bold") + if underline is True: + attributes.append("underline") + if isinstance(color, str) and color: + return colored(value, color.lower(), attrs=attributes) + return colored(value, attrs=attributes) + + def print_results(data: dict, + parent_grouping_name: Optional[str] = None, + parent_grouping_value: Optional[str] = None, + indent: int = 0) -> None: + + nonlocal title, uuids, uuid_details, nobold, query, verbose + nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green, green_bold, gray, bold + nonlocal aggregation_fields_to_print + + def get_portal_hits(data: dict) -> List[dict]: + hits = [] + if isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): + for portal_hit in portal_hits: + if isinstance(portal_hit, dict) and isinstance(uuid := portal_hit.get("uuid"), str) and uuid: + hits.append(portal_hit) + return hits + + def format_hit_property_values(hit: dict, property_name: str, + color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]: + nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow + counted_elsewhere = [] + if hit.get("elasticsearch_counted") is False: + counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) + else: + counted_grouping_name, counted_grouping_value = (None, None) + if property_value := hit.get(property_name): + if property_name == parent_grouping_name: + property_values = [] + for property_value in property_value.split(","): + if (property_value := property_value.strip()) == parent_grouping_value: + property_value = color(property_value) if callable(color) else green_bold(property_value) + property_values.append(property_value) + else: + if (counted_grouping_name, counted_grouping_value) == (property_name, property_value): + property_values.append(green_bold(f"{property_value} {chars_larrow_hollow}") + + green(" COUNTED HERE")) + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) + else: + property_values.append(property_value) + property_value = ", ".join(property_values) + elif hit.get("elasticsearch_counted") is False: + counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) + if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): + property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE") + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) + return property_value, counted_elsewhere + + def find_where_aggregated_and_counted( + uuid: str, + multiple: bool = False, + ignore: Optional[Union[List[Tuple[str, str]], + Tuple[str, str]]] = None) -> Union[Tuple[str, str], List[Tuple[str, str]]]: + + nonlocal normalized_results + + def find_where(data: dict, uuid: str, + parent_grouping_name: Optional[str] = None, + parent_grouping_value: Optional[str] = None) -> 
List[Tuple[str, str]]: + found_uuid_grouping_names_and_values = set() + if isinstance(data, dict): + grouping_name = data.get("name") + grouping_value = data.get("value") + if isinstance(items := data.get("items"), list): + for item in items: + if found := find_where(item, uuid, + parent_grouping_name=grouping_name, + parent_grouping_value=grouping_value): + found_uuid_grouping_names_and_values.update(found) + elif isinstance(hits := data.get("debug", {}).get("portal_hits"), list): + for hit in hits: + if hit.get("uuid") == uuid: + if hit.get("elasticsearch_counted") is True: + found_uuid_grouping_names_and_values.add((parent_grouping_name, parent_grouping_value)) + return found_uuid_grouping_names_and_values + + if found_uuid_grouping_names_and_values := list(find_where(normalized_results, uuid)): + if isinstance(ignore, tuple) and (len(ignore) == 2) and (ignore in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore) + elif isinstance(ignore, list): + for ignore_item in ignore: + if isinstance(ignore_item, tuple) and (len(ignore_item) == 2) and (ignore_item in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore_item) + if multiple is True: + return found_uuid_grouping_names_and_values + if len(found_uuid_grouping_names_and_values) > 1: + # Normally should only be at most one item with elasticsearch_counted set to True. + pass + return found_uuid_grouping_names_and_values[0] + return [(None, None)] if multiple is True else (None, None) + + def print_hit_property_values(hit: dict, property_name: str, + label: Optional[str] = None, + prefix: Optional[str] = None, + color: Optional[Callable] = None) -> List[Tuple[str, str]]: + nonlocal aggregation_fields, aggregation_field_labels, chars_dot_hollow, chars_null, verbose + if not label: + label = aggregation_field_labels.get(property_name) + if (verbose is True) or (not label): + label = property_name + property_values, counted_elsewhere = format_hit_property_values(hit, property_name, color=color) + if not property_values: + property_values = chars_null + if property_name not in aggregation_fields: + property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" + property_description = gray(property_description) + else: + property_description = f"{prefix or ''}{chars_dot} {label}: {property_values}" + print(property_description) + return counted_elsewhere + + if not (isinstance(data, dict) and data): + return + if not (isinstance(indent, int) and (indent > 0)): + indent = 0 + spaces = (" " * indent) if indent > 0 else "" + grouping_name = data.get("name") + if isinstance(grouping_value := data.get("value"), str) and grouping_value: + grouping = bold(grouping_value) + if (verbose is True) and isinstance(grouping_name, str) and grouping_name: + grouping = f"{grouping_name} {chars_dot} {grouping}" + elif not (isinstance(grouping := title, str) and grouping): + grouping = "RESULTS" + grouping = f"{chars_diamond} {grouping}" + hits = get_portal_hits(data) if (uuids is True) else [] + if isinstance(count := data.get("count"), int): + note = "" + if len(hits) > count: + note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + elif isinstance(items := data.get("items"), list): + subcount = 0 + for item in items: + if isinstance(subcount_item := item.get("count"), int): + subcount += subcount_item + if subcount != count: + note = red(f" {chars_xmark} ACTUAL COUNT: {subcount}") + elif checks is True: + note = f" {chars_check}" + 
elif checks: + note = f" {chars_check}" + print(f"{spaces}{grouping}: {count}{note}") + if (query is True) and (query_string := data.get("query")): + print(f"{spaces} {query_string}") + for hit in hits: + if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: + note = "" + if hit.get("elasticsearch_counted") is False: + print(red(f"{spaces} {chars_dot} {uuid} {chars_xmark} UNCOUNTED")) + color = red_bold + else: + print(f"{spaces} {chars_dot} {uuid} {chars_check}") + color = green_bold + if uuid_details is True: + prefix = f"{spaces} " + counted_elsewhere = [] + # Show property values for troubleshooting (as this whole thing is); + # see add_info_for_troubleshooting.annotate_with_uuids. + for aggregation_field in aggregation_fields_to_print: + hit_counted_elsewhere = \ + print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) + if hit_counted_elsewhere: + counted_elsewhere.extend(hit_counted_elsewhere) + # See if also grouped elsewhere for our FYI. + duplicative = hit.get("duplicative") + duplicates = duplicative - 1 if isinstance(duplicative, int) else 0 + counted_groupings = find_where_aggregated_and_counted( + hit.get("uuid"), multiple=True, + ignore=counted_elsewhere + [(parent_grouping_name, parent_grouping_value)]) + if counted_groupings: + message = f"{spaces} {green(chars_rarrow_hollow)} {green('ALSO COUNTED HERE')}:" + if verbose is True: + if duplicates > 0: + message += f" {duplicates}" + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} vs {len(counted_groupings)}") + print(message) + for counted_grouping in counted_groupings: + print(f"{spaces} - {counted_grouping[0]} {green(counted_grouping[1])}") + else: + counted_grouping_values = [green(counted_grouping[1]) for counted_grouping in counted_groupings] + message = f"{message} {', '.join(counted_grouping_values)}" + if duplicates > 0: + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} {duplicates} vs {len(counted_grouping_values)}") + print(message) + if isinstance(items := data.get("items"), list): + for element in items: + print_results(element, + parent_grouping_name=grouping_name, + parent_grouping_value=grouping_value, + indent=indent + 2) + + aggregation_fields = get_aggregation_fields(normalized_results) + aggregation_fields_to_print = get_aggregation_fields_to_print(normalized_results) + aggregation_field_labels = get_aggregation_field_labels() + + red = lambda text: terminal_color(text, "red") # noqa + red_bold = lambda text: terminal_color(text, "red", bold=True) # noqa + green = lambda text: terminal_color(text, "green") # noqa + green_bold = lambda text: terminal_color(text, "green", bold=True) # noqa + gray = lambda text: terminal_color(text, "grey") # noqa + bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text) + chars_check = "✓" + chars_xmark = "✗" + chars_dot = "•" + chars_dot_hollow = "◦" + chars_diamond = "❖" + chars_rarrow_hollow = "▷" + chars_larrow_hollow = "◁" + chars_null = "∅" + + print_results(normalized_results) + + +def _get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: bool = False) -> List[Any]: + """ + TODO: Move this to dcicutils. Maybe much of the above too. 
+    Returns the values of the given property name within the given dictionary as a list, where the
+    given property name can be a dot-separated list of property names, which indicate a path into
+    nested dictionaries within the given dictionary; and where, if any of the elements within
+    the path are lists, we iterate through each, collecting the values for each and including
+    each within the list of returned values.
+    """
+    if isinstance(data, dict) and isinstance(name, str) and name:
+        if keys := name.split("."):
+            nkeys = len(keys) ; key_index_max = nkeys - 1  # noqa
+            for key_index in range(nkeys):
+                if (value := data.get(keys[key_index], None)) is not None:
+                    if key_index == key_index_max:
+                        return [value] if not isinstance(value, list) else value
+                    elif isinstance(value, dict):
+                        data = value
+                        continue
+                    elif isinstance(value, list) and value and ((sub_key_index := key_index + 1) < nkeys):
+                        sub_key = ".".join(keys[sub_key_index:])
+                        values = []
+                        for element in value:
+                            if isinstance(element_value := _get_properties(element, sub_key), list):
+                                for element_value_item in element_value:
+                                    if (element_value_item is not None) and (element_value_item not in values):
+                                        values.append(element_value_item)
+                            elif (element_value is not None) and (element_value not in values):
+                                values.append(element_value)
+                        return sorted(values) if (sort is True) else values
+                break
+    return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback])
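+
+# Editorial illustration (not part of the original patch) of _get_properties:
+# it fans out across embedded lists and de-duplicates, so for a record like
+#     {"file_sets": [{"libraries": [{"analytes": [{"samples": [
+#         {"sample_sources": [{"code": "HG00438"}]},
+#         {"sample_sources": [{"code": "HG005"}]}]}]}]}]}
+# the call
+#     _get_properties(record, "file_sets.libraries.analytes.samples.sample_sources.code")
+# returns ["HG00438", "HG005"]; a path that resolves to nothing returns the
+# fallback (default []).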
+
+
+@contextmanager
+def _capture_output_to_html_string():
+
+    from io import StringIO
+    from unittest.mock import patch as patch
+
+    def ansi_to_html(text):
+        ANSI_COLOR_MAP = {
+            "30": "black",
+            "31": "red",
+            "32": "green",
+            "33": "yellow",
+            "34": "blue",
+            "35": "magenta",
+            "36": "cyan",
+            "37": "white",
+            "90": "bright_black",
+            "91": "bright_red",
+            "92": "bright_green",
+            "93": "bright_yellow",
+            "94": "bright_blue",
+            "95": "bright_magenta",
+            "96": "bright_cyan",
+            "97": "bright_white",
+        }
+        ANSI_ESCAPE_RE = re.compile(r"\x1b\[([0-9;]*)m")
+        bold_active = False
+        def replace_ansi(match):  # noqa
+            nonlocal bold_active
+            codes = match.group(1).split(";")  # Split multiple codes (e.g., "1;31")
+            html_parts = []
+            for code in codes:
+                if code == "1":  # Bold
+                    if not bold_active:  # Activate bold
+                        html_parts.append("<b>")
+                        bold_active = True
+                elif code in ANSI_COLOR_MAP:  # Colors
+                    color = ANSI_COLOR_MAP[code]
+                    html_parts.append(f"<span style='color:{color}'>")
+                elif code == "0":  # Reset
+                    if bold_active:
+                        html_parts.append("</b>")
+                        bold_active = False
+                    html_parts.append("</span>")  # Close color
+            return "".join(html_parts)
+        text_with_html = ANSI_ESCAPE_RE.sub(replace_ansi, text)
+        if bold_active:
+            text_with_html += "</b>"
+        return f"<pre>{text_with_html}</pre>"
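+    # Editorial illustration (not part of the original patch): given the tag
+    # handling above, a hand-written bold+green ANSI sequence such as
+    #     ansi_to_html("\x1b[1;32mCOUNTED HERE\x1b[0m")
+    # yields "<pre><b><span style='color:green'>COUNTED HERE</b></span></pre>";
+    # note the <b> is closed before the <span>, exactly as the reset branch emits.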
" + + print_original = print + captured_output = StringIO() + class CapturedOutput: # noqa + def __init__(self, captured_output: StringIO): + self._captured_output = captured_output + @property # noqa + def text(self): + return self._captured_output.getvalue() + @property # noqa + def html(self): + return ansi_to_html(self._captured_output.getvalue()) + def captured_print(*args, **kwargs): # noqa + nonlocal captured_output + print_original(*args, **kwargs, file=captured_output) + with patch("builtins.print", captured_print): + yield CapturedOutput(captured_output) diff --git a/src/encoded/recent_files_summary.py b/src/encoded/recent_files_summary.py deleted file mode 100644 index 06ecd3069..000000000 --- a/src/encoded/recent_files_summary.py +++ /dev/null @@ -1,930 +0,0 @@ -from contextlib import contextmanager -from copy import deepcopy -from pyramid.request import Request as PyramidRequest -import re -from typing import Callable, List, Optional, Tuple, Union -from dcicutils.misc_utils import normalize_spaces -from encoded.elasticsearch_utils import add_debugging_to_elasticsearch_aggregation_query -from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query -from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results -from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results -from encoded.elasticsearch_utils import prune_elasticsearch_aggregation_results -from encoded.elasticsearch_utils import sort_normalized_aggregation_results -from encoded.elasticsearch_utils import AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE -from encoded.endpoint_utils import create_query_string, deconstruct_query_string -from encoded.endpoint_utils import get_date_range_for_month, parse_date_range_related_arguments -from encoded.endpoint_utils import get_properties, parse_datetime_string -from encoded.endpoint_utils import request_arg, request_args, request_arg_bool, request_arg_int -from snovault.search.search import search as snovault_search -from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq - -QUERY_FILE_TYPES = ["OutputFile"] -QUERY_FILE_STATUSES = ["released"] -QUERY_FILE_CATEGORIES = ["!Quality Control"] -QUERY_RECENT_MONTHS = 3 -QUERY_INCLUDE_CURRENT_MONTH = True - -AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released" -# FYI FWIW: There is also file_sets.libraries.analytes.samples.sample_sources.display_title; -# and that sometimes file_sets.libraries.analytes.samples.sample_sources.code does not exist. -AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code" -AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" -AGGREGATION_FIELD_DONOR = "donors.display_title" -AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description" - -AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR = [ - AGGREGATION_FIELD_CELL_MIXTURE, - AGGREGATION_FIELD_CELL_LINE, - AGGREGATION_FIELD_DONOR -] - -BASE_SEARCH_QUERY = "/search/" - -def recent_files_summary(request: PyramidRequest, troubleshooting: bool = True) -> dict: - """ - This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, - by default, info for files released withing the past three months grouped by release-date, - cell-line or donor, and file-description. 
The specific fields used for these groupings are: - - - release-date: file_status_tracking.released - - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code - - donor: donors.display_title - - file-dsecription: release_tracker_description - - Note that release_tracker_description is a newer (2024-12) - calculated property - see PR-298 (branch: sn_file_release_tracker). - - By default the current (assuminging partial) month IS included, so we really return info for - the past FULL three months plus for whatever time has currently elapsed for the current month. - Use pass the include_current_month=false query argument to NOT include the current month. - - The number of months of data can be controlled using the nmonths query argument, e.g. nmonths=6. - - A specific date range can also be passed in e.g. using from_date=2024-08-01 and thru_date=2024-10-31. - - For testing purposes, a date field other than the default file_status_tracking.released can - also be specified using the date_property_name query argument. And file statuses other than - released can be queried for using one or more status query arguments, e.g. status=uploaded. - """ - - - global AGGREGATION_FIELD_RELEASE_DATE - - date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) - max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) - include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True)) - include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) - nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell", True)) # N.B. default True - nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) - nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor")) - favor_donor = request_arg_bool(request, "favor_donor") - multi = request_arg_bool(request, "multi") - nosort = request_arg_bool(request, "nosort") - legacy = request_arg_bool(request, "legacy") - debug = request_arg_bool(request, "debug") - debug_query = request_arg_bool(request, "debug_query") - troubleshoot = request_arg_bool(request, "troubleshoot") - troubleshoot_elasticsearch = request_arg_bool(request, "troubleshoot_elasticsearch") - raw = request_arg_bool(request, "raw") - willrfix = request_arg_bool(request, "willrfix") - - if troubleshooting is True: - debug = True - troubleshoot = True - troubleshoot_elasticsearch = True - - def get_aggregation_field_grouping_cell_or_donor() -> List[str]: - # This specializes the aggregation query to group first by the cell-line field, - # and then alternatively (if a cell-line field does not exist) by the donor field. - # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively - # look first for the donor field and then secondarily for the cell-line field. 
- global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR - nonlocal nocells, nomixtures, nodonors, favor_donor - aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) - if nocells: - aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_LINE) - if nomixtures: - aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_MIXTURE) - if nodonors: - aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) - if favor_donor: - aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) - aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR) - return aggregation_field_grouping_cell_or_donor - - def create_base_query_arguments(request: PyramidRequest) -> dict: - - global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES - - types = request_args(request, "type", QUERY_FILE_TYPES) - statuses = request_args(request, "status", QUERY_FILE_STATUSES) - categories = request_args(request, "category", QUERY_FILE_CATEGORIES) - - base_query_arguments = { - "type": types if types else None, - "status": statuses if statuses else None, - "data_category": categories if categories else None - } - - return {key: value for key, value in base_query_arguments.items() if value is not None} - - def create_query_arguments(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: - - global BASE_SEARCH_QUERY, QUERY_RECENT_MONTHS, QUERY_INCLUDE_CURRENT_MONTH - nonlocal date_property_name - - recent_months = request_arg_int(request, "nmonths", request_arg_int(request, "months", QUERY_RECENT_MONTHS)) - from_date = request_arg(request, "from_date") - thru_date = request_arg(request, "thru_date") - include_current_month = request_arg_bool(request, "include_current_month", QUERY_INCLUDE_CURRENT_MONTH) - - from_date, thru_date = parse_date_range_related_arguments(from_date, thru_date, nmonths=recent_months, - include_current_month=include_current_month, - strings=True) - query_arguments = { - f"{date_property_name}.from": from_date if from_date else None, - f"{date_property_name}.to": thru_date if from_date else None - } - - if isinstance(base_query_arguments, dict): - query_arguments = {**base_query_arguments, **query_arguments} - return query_arguments - - def create_query(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: - query_arguments = create_query_arguments(request, base_query_arguments) - query_string = create_query_string(query_arguments) - return f"{BASE_SEARCH_QUERY}?{query_string}" - - def create_aggregation_query(aggregation_fields: List[str]) -> dict: - - nonlocal date_property_name, max_buckets, include_missing, favor_donor, troubleshoot_elasticsearch - - aggregations = [] - if not isinstance(aggregation_fields, list): - aggregation_fields = [aggregation_fields] - for item in aggregation_fields: - if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): - aggregations.append(item) - if not aggregations: - return {} - - def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, multi - if field == date_property_name: - return { - "date_histogram": { - "field": f"embedded.{field}", - "calendar_interval": "month", - "format": "yyyy-MM", - "missing": "1970-01", - "order": {"_key": "desc"} - } - } - elif field == AGGREGATION_FIELD_CELL_LINE: - # Note how we prefix the result with the aggregation field name; - # this is so later we can tell 
which grouping/field was matched; - # see fixup_names_values_for_normalized_results for this fixup. - script = "" - for aggregation_field_grouping_index in range(len(aggregation_field_grouping_cell_or_donor)): - aggregation_field = aggregation_field_grouping_cell_or_donor[aggregation_field_grouping_index] - if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" - # Note that if there are multiple values for the aggregation field just the "first" one will be chosen; - # where "first" means which was indexed first, which from an application POV is kind of arbitrary. - # If we want to make it more deterministic we could order the results (say) alphabetically like so: - # def value = doc['embedded.{aggregation_field}.raw'].stream().min((a, b) -> a.compareTo(b)).get(); - # return '{aggregation_field}:' + value; - # OR, if we actually want to aggregation on ALL values we could collect the results and return all like so: - # def values = []; - # for (value in doc['embedded.{aggregation_field}.raw']) { - # values.add('{aggregation_field}:' + value); - # } - # return values; - # But then we'd get double counting and so on. We are told in any case that these groups should be distinct. - if not multi: - script += f""" - {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ - return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; - }} - """ - else: - script += f""" - {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ - def values = []; - for (value in doc['embedded.{aggregation_field}.raw']) {{ - values.add('{aggregation_field}:' + value); - }} - return values; - }} - """ - script += f""" - else {{ - return 'unknown'; - }} - """ - return { - "terms": { - "script": { - "source": normalize_spaces(script), - "lang": "painless" - }, - "size": max_buckets - } - } - - def create_field_filter(field: str) -> Optional[dict]: # noqa - nonlocal aggregation_field_grouping_cell_or_donor - if field == AGGREGATION_FIELD_CELL_LINE: - filter = {"bool": {"should": [], "minimum_should_match": 1}} - for aggregation_field in aggregation_field_grouping_cell_or_donor: - filter["bool"]["should"].append({"exists": { "field": f"embedded.{aggregation_field}.raw"}}) - return filter - - aggregation_query = create_elasticsearch_aggregation_query( - aggregations, - max_buckets=max_buckets, - missing_value=AGGREGATION_NO_VALUE, - include_missing=include_missing, - create_field_aggregation=create_field_aggregation, - create_field_filter=create_field_filter) - - if troubleshoot_elasticsearch: - add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) - - return aggregation_query[date_property_name] - - def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: - - nonlocal date_property_name, max_buckets, include_missing - - aggregations = [] - if not isinstance(aggregation_fields, list): - aggregation_fields = [aggregation_fields] - for item in aggregation_fields: - if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): - aggregations.append(item) - if not aggregations: - return {} - - def create_field_aggregation(field: str) -> Optional[dict]: # noqa - nonlocal date_property_name - if field == date_property_name: - return { - "date_histogram": { - "field": f"embedded.{field}", - "calendar_interval": "month", - "format": "yyyy-MM", - "missing": "1970-01", - "order": {"_key": "desc"} - } - } - - aggregation_query = create_elasticsearch_aggregation_query( - aggregations, - 
max_buckets=max_buckets, - missing_value=AGGREGATION_NO_VALUE, - include_missing=include_missing, - create_field_aggregation=create_field_aggregation) - - if troubleshoot_elasticsearch: - add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) - - return aggregation_query[date_property_name] - - def execute_aggregation_query(request: PyramidRequest, query: str, aggregation_query: dict) -> str: - query += "&from=0&limit=0" # needed for aggregation query to not return the actual/individual item results. - request = snovault_make_search_subreq(request, path=query, method="GET") - results = snovault_search(None, request, custom_aggregations=aggregation_query) - return results - - def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: - nonlocal aggregation_field_grouping_cell_or_donor - if isinstance(normalized_results, dict): - if isinstance(value := normalized_results.get("value"), str): - if ((separator_index := value.find(":")) > 0) and (value_prefix := value[0:separator_index]): - if value_prefix in aggregation_field_grouping_cell_or_donor: - if value := value[separator_index + 1:]: - normalized_results["name"] = value_prefix - normalized_results["value"] = value - if isinstance(items := normalized_results.get("items"), list): - for element in items: - fixup_names_values_for_normalized_results(element) - - def add_queries_to_normalized_results(normalized_results: dict, base_query_arguments: dict) -> None: - global BASE_SEARCH_QUERY - nonlocal date_property_name, willrfix - if isinstance(normalized_results, dict): - if name := normalized_results.get("name"): - if value := normalized_results.get("value"): - if name == date_property_name: - # Special case for date value which is just year/month (e.g. 2024-12); - # we want to turn this into a date range query for the month; actually - # this is not a special case, this is the NORMAL case we are dealing with. - # from_date, thru_date = parse_date_range_related_arguments(value, None, nmonths=0, strings=True) - from_date, thru_date = get_date_range_for_month(value, strings=True) - if from_date and thru_date: - base_query_arguments = {**base_query_arguments, - f"{name}.from": from_date, f"{name}.to": thru_date} - else: - base_query_arguments = {**base_query_arguments, name: value} - if willrfix: - if name == AGGREGATION_FIELD_CELL_LINE: - base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE - elif name == AGGREGATION_FIELD_DONOR: - base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE - base_query_arguments[AGGREGATION_FIELD_CELL_LINE] = AGGREGATION_NO_VALUE - normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) - if isinstance(items := normalized_results.get("items"), list): - for element in items: - add_queries_to_normalized_results(element, base_query_arguments) - - aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor() - # The base_query_arguments does not contain the from/thru dates as this is used; - # this is used to construct the query-string for the individually grouped items which - # will have the from/thru dates specifically representing their place within the group. 
- base_query_arguments = create_base_query_arguments(request) - query = create_query(request, base_query_arguments) - - if not legacy: - aggregate_by_cell_line_property_name = "aggregate_by_cell_line" - aggregate_by_cell_line = [ - date_property_name, - AGGREGATION_FIELD_CELL_LINE, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - aggregation_query = { - aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line) - } - else: - aggregate_by_cell_line_property_name = "aggregate_by_cell_line" - aggregate_by_cell_line = [ - date_property_name, - AGGREGATION_FIELD_CELL_LINE, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - aggregate_by_donor_property_name = "aggregate_by_donor" - aggregate_by_donor = [ - date_property_name, - AGGREGATION_FIELD_DONOR, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - aggregation_query = { - aggregate_by_cell_line_property_name: create_aggregation_query_legacy(aggregate_by_cell_line), - aggregate_by_donor_property_name: create_aggregation_query_legacy(aggregate_by_donor) - } - - if debug_query: - return { - "query": query, - "query_arguments": deconstruct_query_string(query), - "aggregation_query_fields": [ - AGGREGATION_FIELD_RELEASE_DATE, - *get_aggregation_field_grouping_cell_or_donor(), - AGGREGATION_FIELD_FILE_DESCRIPTOR - ], - "aggregation_query": aggregation_query - } - - raw_results = execute_aggregation_query(request, query, aggregation_query) - - if raw: - # For debugging/troubleshooting only if raw=true then return raw ElasticSearch results. - # And note that unless we remove teh @id property we get redirected to the URL in this field, - # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control - # &file_status_tracking.released.from=2024-09-30 - # &file_status_tracking.released.to=2024-12-31&from=0&limit=0' - if "@id" in raw_results: - del raw_results["@id"] - return raw_results - - if not (raw_results := raw_results.get("aggregations")): - return {} - - if debug: - raw_results = deepcopy(raw_results) # otherwise may be overwritten by below - - prune_elasticsearch_aggregation_results(raw_results) - - if not legacy: - aggregation_results = raw_results.get(aggregate_by_cell_line_property_name) - else: - aggregation_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name), - raw_results.get(aggregate_by_donor_property_name)) - - # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items, - # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3 - # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005), - # then its doc_count will NOT be counted TWICE. This creates a situation where it might LOOK like the counts - # are WRONG in the MERGED (via returned merge_elasticsearch_aggregation_results) result set, where the outer - # item count may be than the sum of the individual counts within each sub-group. For example, the below result - # shows a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the other - # in the HG005 it would be because the same unique file has a cell_line.code of both HG00438 and HG005. 
- # { - # "meta": { "field_name": "file_status_tracking.released" }, - # "buckets": [ - # { - # "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 1, - # "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { - # "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" }, - # "buckets": [ - # { "key": "HG00438", "doc_count": 1, - # "release_tracker_description": { - # "meta": { "field_name": "release_tracker_description" }, - # "buckets": [ - # { "key": "WGS Illumina NovaSeq X bam", "doc_count": 1 }, - # ] - # } - # }, - # { "key": "HG005", "doc_count": 1, - # "release_tracker_description": { - # "meta": { "field_name": "release_tracker_description" }, - # "buckets": [ - # { "key": "Fiber-seq PacBio Revio bam", "doc_count": 1 } - # ] - # } - # } - # ] - # } - # } - # ] - # } - - if debug: - additional_properties = { - "debug": { - "query": query, - "query_arguments": deconstruct_query_string(query), - "aggregation_query_fields": [ - AGGREGATION_FIELD_RELEASE_DATE, - *get_aggregation_field_grouping_cell_or_donor(), - AGGREGATION_FIELD_FILE_DESCRIPTOR - ], - "aggregation_query": aggregation_query, - "raw_results": raw_results, - "aggregation_results": deepcopy(aggregation_results) - } - } - else: - additional_properties = None - - normalized_results = normalize_elasticsearch_aggregation_results(aggregation_results, - additional_properties=additional_properties, - remove_empty_items=not include_missing) - if not legacy: - fixup_names_values_for_normalized_results(normalized_results) - if include_queries: - add_queries_to_normalized_results(normalized_results, base_query_arguments) - normalized_results["query"] = query - - if not nosort: - # We can sort on the aggregations by level; outermost/left to innermost/right. - # In our case the outermost is the date aggregation so sort taht by the key value, - # e.g. 2014-12, descending; and the rest of the inner levels by the default - # sorting which is by aggregation count descending and secondarily by the key value. - sort_normalized_aggregation_results(normalized_results, ["-key", "default"]) - - if troubleshoot: - add_info_for_troubleshooting(normalized_results, request) - - return normalized_results - - -def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None: - - def get_files(files, property_name, property_value, map_property_value = None): - found = [] - for file in files: - if properties := get_properties(file, property_name): - if callable(map_property_value): - mapped_properties = [] - for value in properties: - mapped_properties.append(map_property_value(value)) - properties = mapped_properties - if property_value in properties: - found.append(file) - return found - - def map_date_property_value(value): - if date_value := parse_datetime_string(value): - return f"{date_value.year}-{date_value.month:02}" - return value - - def count_uuid(uuid_records: List[dict], uuid: str) -> int: - count = 0 - for uuid_record in uuid_records: - if uuid_record.get("uuid") == uuid: - count += 1 - return count - - def dedup_list(data: list) -> list: # noqa - return list(dict.fromkeys(data)) if isinstance(data, list) else [] - - aggregation_fields_for_troubleshooting = dedup_list([ - AGGREGATION_FIELD_RELEASE_DATE, - AGGREGATION_FIELD_CELL_MIXTURE, - AGGREGATION_FIELD_CELL_LINE, - # Store some extra properties for troublehooting (as this whole thing is). 
-# "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.display_title", -# "file_sets.libraries.analytes.samples.sample_sources.components.cell_culture.cell_line.code", - "file_sets.libraries.analytes.samples.sample_sources.display_title", - AGGREGATION_FIELD_DONOR, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ]) - - def annotate_with_uuids(normalized_results: dict): - nonlocal aggregation_fields_for_troubleshooting - uuid_records = [] - query = normalized_results.get("query") - if isinstance(normalized_results.get("debug"), dict): - normalized_results["debug"]["aggregation_fields_for_troubleshooting"] = ( - aggregation_fields_for_troubleshooting) - files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] - for first_item in normalized_results["items"]: - first_property_name = first_item["name"] - first_property_value = first_item["value"] - for second_item in first_item["items"]: - second_property_name = second_item["name"] - second_property_value = second_item["value"] - for third_item in second_item["items"]: - third_property_name = third_item["name"] - third_property_value = third_item["value"] - if debug_elasticsearch_hits := third_item.get("debug_elasticsearch_hits"): - if not third_item.get("debug"): - third_item["debug"] = {} - third_item["debug"]["elasticsearch_hits"] = debug_elasticsearch_hits - third_item["debug"]["elasticsearch_hits"].sort() - del third_item["debug_elasticsearch_hits"] - if first_files := get_files(files, first_property_name, first_property_value, - map_property_value=map_date_property_value): - if second_files := get_files(first_files, second_property_name, second_property_value): - if third_files := get_files(second_files, third_property_name, third_property_value): - for file in third_files: - if isinstance(uuid := file.get("uuid"), str): - if not third_item.get("debug"): - third_item["debug"] = {} - if not third_item["debug"].get("portal_hits"): - third_item["debug"]["portal_hits"] = [] - uuid_record = {"uuid": uuid} - for aggregation_field in aggregation_fields_for_troubleshooting: - aggregation_values = ", ".join(get_properties(file, aggregation_field)) - uuid_record[aggregation_field] = aggregation_values or None - if third_item["debug"].get("elasticsearch_hits"): - uuid_record["elasticsearch_counted"] = \ - uuid in third_item["debug"]["elasticsearch_hits"] - third_item["debug"]["portal_hits"].append(uuid_record) - uuid_records.append(uuid_record) - if third_item.get("debug", {}).get("portal_hits"): - third_item["debug"]["portal_hits"].sort(key=lambda item: item.get("uuid")) - - for uuid_record in uuid_records: - if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1: - uuid_record["duplicative"] = count - - try: - annotate_with_uuids(normalized_results) - except Exception: - pass - - -def print_normalized_aggregation_results(normalized_results: dict, - title: Optional[str] = None, - parent_grouping_name: Optional[str] = None, - parent_grouping_value: Optional[str] = None, - uuids: bool = False, - uuid_details: bool = False, - nobold: bool = False, - checks: bool = False, - query: bool = False, - verbose: bool = False) -> None: - - """ - For deveopment/troubleshooting only ... 
- """ - from encoded.endpoint_utils import terminal_color - - def get_aggregation_fields(normalized_results: dict) -> List[str]: - # Returns all noted/important aggregation fields which ARE actually being used by the query; - # we only are interested in ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, - # which is all of the possible sample-source/cell-line/donor aggregations. - global AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR - if not isinstance(aggregation_fields := - normalized_results.get("debug", {}).get("aggregation_query_fields"), list): - aggregation_fields = [] - else: - aggregation_fields = deepcopy(aggregation_fields) - for aggregation_field in aggregation_fields: - # Remove the ones we are not interested in reporting on. - if aggregation_field not in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR: - aggregation_fields.remove(aggregation_field) - return aggregation_fields - - def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]: - aggregation_fields_to_print = get_aggregation_fields(normalized_results) - if isinstance(aggregation_fields_for_troubleshooting := - normalized_results.get("debug", {}).get("aggregation_fields_for_troubleshooting"), list): - for aggregation_field_for_troubleshooting in aggregation_fields_for_troubleshooting: - if aggregation_field_for_troubleshooting not in aggregation_fields_to_print: - aggregation_fields_to_print.append(aggregation_field_for_troubleshooting) - aggregation_fields_to_not_print = [ - AGGREGATION_FIELD_RELEASE_DATE, - AGGREGATION_FIELD_FILE_DESCRIPTOR - ] - for aggregation_field_to_not_print in aggregation_fields_to_not_print: - if aggregation_field_to_not_print in aggregation_fields_to_print: - aggregation_fields_to_print.remove(aggregation_field_to_not_print) - return aggregation_fields_to_print - - def get_aggregation_field_labels() -> dict: - # Shorter/nicer names for aggregation fields of interest to print. 
- global AGGREGATION_FIELD_CELL_MIXTURE, AGGREGATION_FIELD_CELL_LINE, AGGREGATION_FIELD_DONOR - return { - AGGREGATION_FIELD_CELL_MIXTURE: "sample-sources", - AGGREGATION_FIELD_CELL_LINE: "cell-lines", - AGGREGATION_FIELD_DONOR: "donors", - "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" - } - - def print_results(data: dict, - parent_grouping_name: Optional[str] = None, - parent_grouping_value: Optional[str] = None, - indent: int = 0) -> None: - - nonlocal title, uuids, uuid_details, nobold, query, verbose - nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green, green_bold, gray, bold - nonlocal aggregation_fields_to_print - - def get_portal_hits(data: dict) -> List[dict]: - hits = [] - if isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): - for portal_hit in portal_hits: - if isinstance(portal_hit, dict) and isinstance(uuid := portal_hit.get("uuid"), str) and uuid: - hits.append(portal_hit) - return hits - - def format_hit_property_values(hit: dict, property_name: str, - color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]: - nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow - counted_elsewhere = [] - if hit.get("elasticsearch_counted") is False: - counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) - else: - counted_grouping_name, counted_grouping_value = (None, None) - if property_value := hit.get(property_name): - if property_name == parent_grouping_name: - property_values = [] - for property_value in property_value.split(","): - if (property_value := property_value.strip()) == parent_grouping_value: - property_value = color(property_value) if callable(color) else green_bold(property_value) - property_values.append(property_value) - else: - if (counted_grouping_name, counted_grouping_value) == (property_name, property_value): - property_values.append(green_bold(f"{property_value} {chars_larrow_hollow}") + - green(" COUNTED HERE")) - counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) - else: - property_values.append(property_value) - property_value = ", ".join(property_values) - elif hit.get("elasticsearch_counted") is False: - counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) - if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): - property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE") - counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) - return property_value, counted_elsewhere - - def find_where_aggregated_and_counted( - uuid: str, - multiple: bool = False, - ignore: Optional[Union[List[Tuple[str, str]], - Tuple[str, str]]] = None) -> Union[Tuple[str, str], List[Tuple[str, str]]]: - - nonlocal normalized_results - - def find_where(data: dict, uuid: str, - parent_grouping_name: Optional[str] = None, - parent_grouping_value: Optional[str] = None) -> List[Tuple[str, str]]: - found_uuid_grouping_names_and_values = set() - if isinstance(data, dict): - grouping_name = data.get("name") - grouping_value = data.get("value") - if isinstance(items := data.get("items"), list): - for item in items: - if found := find_where(item, uuid, - parent_grouping_name=grouping_name, - parent_grouping_value=grouping_value): - found_uuid_grouping_names_and_values.update(found) - elif isinstance(hits := data.get("debug", 
{}).get("portal_hits"), list): - for hit in hits: - if hit.get("uuid") == uuid: - if hit.get("elasticsearch_counted") is True: - found_uuid_grouping_names_and_values.add((parent_grouping_name, parent_grouping_value)) - return found_uuid_grouping_names_and_values - - if found_uuid_grouping_names_and_values := list(find_where(normalized_results, uuid)): - if isinstance(ignore, tuple) and (len(ignore) == 2) and (ignore in found_uuid_grouping_names_and_values): - found_uuid_grouping_names_and_values.remove(ignore) - elif isinstance(ignore, list): - for ignore_item in ignore: - if isinstance(ignore_item, tuple) and (len(ignore_item) == 2) and (ignore_item in found_uuid_grouping_names_and_values): - found_uuid_grouping_names_and_values.remove(ignore_item) - if multiple is True: - return found_uuid_grouping_names_and_values - if len(found_uuid_grouping_names_and_values) > 1: - # Normally should only be at most one item with elasticsearch_counted set to True. - pass - return found_uuid_grouping_names_and_values[0] - return [(None, None)] if multiple is True else (None, None) - - def print_hit_property_values(hit: dict, property_name: str, - label: Optional[str] = None, - prefix: Optional[str] = None, - color: Optional[Callable] = None) -> List[Tuple[str, str]]: - nonlocal aggregation_fields, aggregation_field_labels, chars_dot_hollow, chars_null, verbose - if not label: - label = aggregation_field_labels.get(property_name) - if (verbose is True) or (not label): - label = property_name - property_values, counted_elsewhere = format_hit_property_values(hit, property_name, color=color) - if not property_values: - property_values = chars_null - if property_name not in aggregation_fields: - property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" - property_description = gray(property_description) - else: - property_description = f"{prefix or ''}{chars_dot} {label}: {property_values}" - print(property_description) - return counted_elsewhere - - if not (isinstance(data, dict) and data): - return - if not (isinstance(indent, int) and (indent > 0)): - indent = 0 - spaces = (" " * indent) if indent > 0 else "" - grouping_name = data.get("name") - if isinstance(grouping_value := data.get("value"), str) and grouping_value: - grouping = bold(grouping_value) - if (verbose is True) and isinstance(grouping_name, str) and grouping_name: - grouping = f"{grouping_name} {chars_dot} {grouping}" - elif not (isinstance(grouping := title, str) and grouping): - grouping = "RESULTS" - grouping = f"{chars_diamond} {grouping}" - hits = get_portal_hits(data) if (uuids is True) else [] - if isinstance(count := data.get("count"), int): - note = "" - if len(hits) > count: - note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") - elif isinstance(items := data.get("items"), list): - subcount = 0 - for item in items: - if isinstance(subcount_item := item.get("count"), int): - subcount += subcount_item - if subcount != count: - note = red(f" {chars_xmark} ACTUAL COUNT: {subcount}") - elif checks is True: - note = f" {chars_check}" - elif checks: - note = f" {chars_check}" - print(f"{spaces}{grouping}: {count}{note}") - if (query is True) and (query_string := data.get("query")): - print(f"{spaces} {query_string}") - for hit in hits: - if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: - note = "" - if hit.get("elasticsearch_counted") is False: - print(red(f"{spaces} {chars_dot} {uuid} {chars_xmark} UNCOUNTED")) - color = red_bold - else: - 
print(f"{spaces} {chars_dot} {uuid} {chars_check}") - color = green_bold - if uuid_details is True: - prefix = f"{spaces} " - counted_elsewhere = [] - # Show property values for troubleshooting (as this whole thing is); - # see add_info_for_troubleshooting.annotate_with_uuids. - for aggregation_field in aggregation_fields_to_print: - hit_counted_elsewhere = \ - print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) - if hit_counted_elsewhere: - counted_elsewhere.extend(hit_counted_elsewhere) - # See if also grouped elsewhere for our FYI. - duplicative = hit.get("duplicative") - duplicates = duplicative - 1 if isinstance(duplicative, int) else 0 - counted_groupings = find_where_aggregated_and_counted( - hit.get("uuid"), multiple=True, - ignore=counted_elsewhere + [(parent_grouping_name, parent_grouping_value)]) - if counted_groupings: - message = f"{spaces} {green(chars_rarrow_hollow)} {green('ALSO COUNTED HERE')}:" - if verbose is True: - if duplicates > 0: - message += f" {duplicates}" - if duplicates != len(counted_groupings): - message += red_bold(f" {chars_xmark} vs {len(counted_groupings)}") - print(message) - for counted_grouping in counted_groupings: - print(f"{spaces} - {counted_grouping[0]} {green(counted_grouping[1])}") - else: - counted_grouping_values = [green(counted_grouping[1]) for counted_grouping in counted_groupings] - message = f"{message} {', '.join(counted_grouping_values)}" - if duplicates > 0: - if duplicates != len(counted_groupings): - message += red_bold(f" {chars_xmark} {duplicates} vs {len(counted_grouping_values)}") - print(message) - if isinstance(items := data.get("items"), list): - for element in items: - print_results(element, - parent_grouping_name=grouping_name, - parent_grouping_value=grouping_value, - indent=indent + 2) - - aggregation_fields = get_aggregation_fields(normalized_results) - aggregation_fields_to_print = get_aggregation_fields_to_print(normalized_results) - aggregation_field_labels = get_aggregation_field_labels() - - red = lambda text: terminal_color(text, "red") # noqa - red_bold = lambda text: terminal_color(text, "red", bold=True) # noqa - green = lambda text: terminal_color(text, "green") # noqa - green_bold = lambda text: terminal_color(text, "green", bold=True) # noqa - gray = lambda text: terminal_color(text, "grey") # noqa - bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text) - chars_check = "✓" - chars_xmark = "✗" - chars_dot = "•" - chars_dot_hollow = "◦" - chars_diamond = "❖" - chars_rarrow_hollow = "▷" - chars_larrow_hollow = "◁" - chars_null = "∅" - - print_results(normalized_results) - - -def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict): - with capture_output_to_html_string() as captured_output: - print_normalized_aggregation_results(normalized_results, uuids=True, uuid_details=True) - return captured_output.html - return - - -@contextmanager -def capture_output_to_html_string(): - from io import StringIO - from unittest.mock import patch as patch - print_original = print - captured_output = StringIO() - class CapturedOutput: # noqa - def __init__(self, captured_output: StringIO): - self._captured_output = captured_output - @property # noqa - def text(self): - return self._captured_output.getvalue() - @property # noqa - def html(self): - return ansi_to_html(self._captured_output.getvalue()) - def captured_print(*args, **kwargs): # noqa - nonlocal captured_output - print_original(*args, **kwargs, 
-        print_original(*args, **kwargs, file=captured_output)
-    with patch("builtins.print", captured_print):
-        yield CapturedOutput(captured_output)
-
-
-def ansi_to_html(text):
-    ANSI_COLOR_MAP = {
-        '30': 'black',
-        '31': 'red',
-        '32': 'green',
-        '33': 'yellow',
-        '34': 'blue',
-        '35': 'magenta',
-        '36': 'cyan',
-        '37': 'white',
-        '90': 'bright_black',
-        '91': 'bright_red',
-        '92': 'bright_green',
-        '93': 'bright_yellow',
-        '94': 'bright_blue',
-        '95': 'bright_magenta',
-        '96': 'bright_cyan',
-        '97': 'bright_white',
-    }
-    ANSI_ESCAPE_RE = re.compile(r'\x1b\[([0-9;]*)m')
-    bold_active = False
-    def replace_ansi(match):  # noqa
-        nonlocal bold_active
-        codes = match.group(1).split(';')  # Split multiple codes (e.g., "1;31")
-        html_parts = []
-        for code in codes:
-            if code == '1':  # Bold
-                if not bold_active:  # Activate bold
-                    html_parts.append('<b>')
-                    bold_active = True
-            elif code in ANSI_COLOR_MAP:  # Colors
-                color = ANSI_COLOR_MAP[code]
-                html_parts.append(f'<span style="color:{color}">')
-            elif code == '0':  # Reset
-                if bold_active:
-                    html_parts.append('</b>')
-                    bold_active = False
-                html_parts.append('</span>')  # Close color
-        return ''.join(html_parts)
-    text_with_html = ANSI_ESCAPE_RE.sub(replace_ansi, text)
-    if bold_active:
-        text_with_html += '</b>'
-    return f'<pre>{text_with_html}</pre>'
diff --git a/src/encoded/tests/test_elasticsearch_utils.py b/src/encoded/tests/test_elasticsearch_utils.py
index 97d690500..979d13272 100644
--- a/src/encoded/tests/test_elasticsearch_utils.py
+++ b/src/encoded/tests/test_elasticsearch_utils.py
@@ -1,11 +1,13 @@
 import pytest
 from typing import Optional
-from encoded.elasticsearch_utils import create_elasticsearch_aggregation_query
-from encoded.elasticsearch_utils import merge_elasticsearch_aggregation_results
-from encoded.elasticsearch_utils import normalize_elasticsearch_aggregation_results
-from encoded.recent_files_summary import AGGREGATION_FIELD_RELEASE_DATE
-from encoded.recent_files_summary import AGGREGATION_FIELD_CELL_LINE
-from encoded.recent_files_summary import AGGREGATION_FIELD_FILE_DESCRIPTOR
+from encoded.endpoints.elasticsearch_utils import (
+    create_elasticsearch_aggregation_query,
+    merge_elasticsearch_aggregation_results,
+    normalize_elasticsearch_aggregation_results)
+from encoded.endpoints.recent_files_summary.recent_files_summary import (
+    AGGREGATION_FIELD_RELEASE_DATE,
+    AGGREGATION_FIELD_CELL_LINE,
+    AGGREGATION_FIELD_FILE_DESCRIPTOR)
 
 
 def test_create_elasticsearch_aggregation_query_a():
diff --git a/src/encoded/tests/test_endpoint_utils.py b/src/encoded/tests/test_endpoint_utils.py
index b877a8b62..8b30634d1 100644
--- a/src/encoded/tests/test_endpoint_utils.py
+++ b/src/encoded/tests/test_endpoint_utils.py
@@ -2,7 +2,7 @@
 import datetime
 from typing import Optional, Union
 from unittest.mock import patch as mock_patch
-from encoded.endpoint_utils import parse_date_range_related_arguments, parse_datetime_string
+from encoded.endpoints.endpoint_utils import parse_date_range_related_arguments, parse_datetime_string
 
 DEFAULT_MOCK_DATETIME_TODAY_VALUE = "2024-11-06 07:54:16"
 
@@ -74,5 +74,6 @@ class MockDateTime(datetime.datetime):  # noqa
     @classmethod
     def today(cls):
         nonlocal value ; return value  # noqa
-    with (mock_patch("encoded.endpoint_utils.datetime", MockDateTime), mock_patch("datetime.datetime", MockDateTime)):
+    with (mock_patch("encoded.endpoints.endpoint_utils.datetime", MockDateTime),
+          mock_patch("datetime.datetime", MockDateTime)):
         yield

From db2cc99259bab5f0dcc9c618a3684549c19e2b13 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 15 Dec 2024 17:43:57 -0500
Subject: [PATCH 62/78] refactoring /recent_files_summary endpoint

---
 .../endpoints/recent_files_summary/recent_files_summary.py | 3 ++-
 .../recent_files_summary_troubleshooting.py                | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
index f90d577c6..74903ef62 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
@@ -41,7 +41,8 @@ def recent_files_summary_endpoint(context, request):
     text = request_arg_bool(request, "text")
     results = recent_files_summary(request, troubleshooting=text)
     if text:
-        results = get_normalized_aggregation_results_as_html_for_troublehshooting(results)
+        text_debug = request_arg_bool(request, "text_debug")
+        results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, debug=text_debug)
{results}
", content_type='text/html') return results diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index 7fbc5d4ea..111e68f3e 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -109,9 +109,11 @@ def annotate_with_uuids(normalized_results: dict): pass -def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict): +def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, debug: bool = False): with _capture_output_to_html_string() as captured_output: print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True) + if debug is True: + return captured_output.text return captured_output.html From e8280d6d25fee5c1cd16d0bd709a74907d691b46 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 18:09:28 -0500 Subject: [PATCH 63/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index 111e68f3e..f1df59ff3 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -1,9 +1,12 @@ +import builtins from contextlib import contextmanager from copy import deepcopy -from pyramid.request import Request as PyramidRequest import re +from pyramid.request import Request as PyramidRequest +from io import StringIO from termcolor import colored from typing import Any, Callable, List, Optional, Tuple, Union +from unittest.mock import patch as patch from encoded.endpoints.endpoint_utils import parse_datetime_string from encoded.endpoints.recent_files_summary.recent_files_summary_fields import ( AGGREGATION_FIELD_RELEASE_DATE, @@ -13,7 +16,6 @@ AGGREGATION_FIELD_DONOR, AGGREGATION_FIELD_FILE_DESCRIPTOR) - def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None: def get_files(files, property_name, property_value, map_property_value = None): @@ -441,9 +443,6 @@ def _get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: @contextmanager def _capture_output_to_html_string(): - from io import StringIO - from unittest.mock import patch as patch - def ansi_to_html(text): ANSI_COLOR_MAP = { "30": "black", @@ -488,7 +487,6 @@ def replace_ansi(match): # noqa text_with_html += "" return f"
         return f"<pre>{text_with_html}</pre>"
" - print_original = print captured_output = StringIO() class CapturedOutput: # noqa def __init__(self, captured_output: StringIO): @@ -502,5 +500,9 @@ def html(self): def captured_print(*args, **kwargs): # noqa nonlocal captured_output print_original(*args, **kwargs, file=captured_output) - with patch("builtins.print", captured_print): - yield CapturedOutput(captured_output) + print_original = builtins.print + try: + with patch("builtins.print", captured_print): + yield CapturedOutput(captured_output) + finally: + print = print_original From 3a1f19f8aacd94a988b3290228d643f4f107b5df Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 18:14:23 -0500 Subject: [PATCH 64/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index f1df59ff3..d3c9d4db4 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -488,6 +488,7 @@ def replace_ansi(match): # noqa return f"
         return f"<pre>{text_with_html}</pre>"
" captured_output = StringIO() + print_original = builtins.print class CapturedOutput: # noqa def __init__(self, captured_output: StringIO): self._captured_output = captured_output @@ -498,11 +499,7 @@ def text(self): def html(self): return ansi_to_html(self._captured_output.getvalue()) def captured_print(*args, **kwargs): # noqa - nonlocal captured_output + nonlocal captured_output, print_original print_original(*args, **kwargs, file=captured_output) - print_original = builtins.print - try: - with patch("builtins.print", captured_print): - yield CapturedOutput(captured_output) - finally: - print = print_original + with patch("encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting.print", captured_print): + yield CapturedOutput(captured_output) From a383ceb02a0e1e6f30d0f708e4a3a9666abf726a Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 19:08:43 -0500 Subject: [PATCH 65/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index d3c9d4db4..d796a1527 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -487,19 +487,25 @@ def replace_ansi(match): # noqa text_with_html += "" return f"
         return f"<pre>{text_with_html}</pre>"
" - captured_output = StringIO() - print_original = builtins.print + #captured_output = StringIO() + captured_output = "" + # print_original = builtins.print class CapturedOutput: # noqa def __init__(self, captured_output: StringIO): self._captured_output = captured_output @property # noqa def text(self): - return self._captured_output.getvalue() + return captured_output + # return self._captured_output.getvalue() @property # noqa def html(self): - return ansi_to_html(self._captured_output.getvalue()) + return ansi_to_html(captured_output) + # return ansi_to_html(self._captured_output.getvalue()) def captured_print(*args, **kwargs): # noqa - nonlocal captured_output, print_original - print_original(*args, **kwargs, file=captured_output) + # nonlocal captured_output, print_original + # print_original(*args, **kwargs, file=captured_output) + nonlocal captured_output + captured_output += str(args[0]) + captured_output += "\n" with patch("encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting.print", captured_print): yield CapturedOutput(captured_output) From 847aaa5f500ca2bd8659442984ad9c422de82301 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 19:10:12 -0500 Subject: [PATCH 66/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index d796a1527..17b9e763c 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -495,10 +495,12 @@ def __init__(self, captured_output: StringIO): self._captured_output = captured_output @property # noqa def text(self): + nonlocal captured_output return captured_output # return self._captured_output.getvalue() @property # noqa def html(self): + nonlocal captured_output return ansi_to_html(captured_output) # return ansi_to_html(self._captured_output.getvalue()) def captured_print(*args, **kwargs): # noqa From a092124fc3f74edf27a8dc80134952af165c0526 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 19:24:04 -0500 Subject: [PATCH 67/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index 17b9e763c..3c28ae648 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -173,26 +173,6 @@ def get_aggregation_field_labels() -> dict: "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" } - def terminal_color(value: str, - color: Optional[str] = None, - dark: bool = False, - bold: bool = False, - underline: bool = False, - nocolor: bool = False) -> str: - # This is used only for troubleshooting by - if nocolor is True: - return value - attributes = [] - if dark is True: - attributes.append("dark") - if bold is True: - attributes.append("bold") - if underline is True: - attributes.append("underline") - if isinstance(color, str) and color: - return colored(value, 
-            return colored(value, color.lower(), attrs=attributes)
-        return colored(value, attrs=attributes)
-
     def print_results(data: dict,
                       parent_grouping_name: Optional[str] = None,
                       parent_grouping_value: Optional[str] = None,
@@ -388,12 +368,12 @@ def print_hit_property_values(hit: dict, property_name: str,
     aggregation_fields = get_aggregation_fields(normalized_results)
     aggregation_fields_to_print = get_aggregation_fields_to_print(normalized_results)
     aggregation_field_labels = get_aggregation_field_labels()
 
-    red = lambda text: terminal_color(text, "red")  # noqa
-    red_bold = lambda text: terminal_color(text, "red", bold=True)  # noqa
-    green = lambda text: terminal_color(text, "green")  # noqa
-    green_bold = lambda text: terminal_color(text, "green", bold=True)  # noqa
-    gray = lambda text: terminal_color(text, "grey")  # noqa
-    bold = (lambda text: terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text)
+    red = lambda text: _terminal_color(text, "red")  # noqa
+    red_bold = lambda text: _terminal_color(text, "red", bold=True)  # noqa
+    green = lambda text: _terminal_color(text, "green")  # noqa
+    green_bold = lambda text: _terminal_color(text, "green", bold=True)  # noqa
+    gray = lambda text: _terminal_color(text, "grey")  # noqa
+    bold = (lambda text: _terminal_color(text, bold=True)) if (nobold is not True) else (lambda text: text)
     chars_check = "✓"
     chars_xmark = "✗"
     chars_dot = "•"
@@ -440,6 +420,27 @@ def _get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort:
     return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback])
 
 
+def _terminal_color(value: str,
+                    color: Optional[str] = None,
+                    dark: bool = False,
+                    bold: bool = False,
+                    underline: bool = False,
+                    nocolor: bool = False) -> str:
+    # This is used only for troubleshooting by
+    if nocolor is True:
+        return value
+    attributes = []
+    if dark is True:
+        attributes.append("dark")
+    if bold is True:
+        attributes.append("bold")
+    if underline is True:
+        attributes.append("underline")
+    if isinstance(color, str) and color:
+        return colored(value, color.lower(), attrs=attributes)
+    return colored(value, attrs=attributes)
+
+
 @contextmanager
 def _capture_output_to_html_string():
@@ -508,7 +508,7 @@ def captured_print(*args, **kwargs):  # noqa
         nonlocal captured_output
-        captured_output += str(args[0])
+        captured_output += str(args[0]) + "[" + _terminal_color("DEBUG", "red") + "]"
         captured_output += "\n"
     with patch("encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting.print", captured_print):
         yield CapturedOutput(captured_output)

From 8e534f4a2bf4429d87680c65a72cddc808f0426d Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 15 Dec 2024 19:42:30 -0500
Subject: [PATCH 68/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 3c28ae648..2c73c40c2 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -508,7 +508,7 @@ def captured_print(*args, **kwargs):  # noqa
         # nonlocal captured_output, print_original
         # print_original(*args, **kwargs, file=captured_output)
         nonlocal captured_output
-        captured_output += str(args[0]) + "[" + _terminal_color("DEBUG", "red") + "]"
+        captured_output += str(args[0]) + "[" + colored("DEBUG", "red") + "]"
         captured_output += "\n"
     with patch("encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting.print", captured_print):
         yield CapturedOutput(captured_output)

From fb7ba9be4514ec5f87ff4a38b4fb1227889488b5 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 15 Dec 2024 19:59:49 -0500
Subject: [PATCH 69/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 2c73c40c2..1292a5142 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -4,6 +4,7 @@
 import re
 from pyramid.request import Request as PyramidRequest
 from io import StringIO
+import os
 from termcolor import colored
 from typing import Any, Callable, List, Optional, Tuple, Union
 from unittest.mock import patch as patch
@@ -112,6 +113,7 @@ def annotate_with_uuids(normalized_results: dict):
 
 
 def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, debug: bool = False):
+    os.environ["TERM"] = "xterm-256color"
     with _capture_output_to_html_string() as captured_output:
         print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True)
         if debug is True:
@@ -508,6 +510,7 @@ def captured_print(*args, **kwargs):  # noqa
         # nonlocal captured_output, print_original
         # print_original(*args, **kwargs, file=captured_output)
         nonlocal captured_output
+        os.environ["TERM"] = "xterm-256color"
         captured_output += str(args[0]) + "[" + colored("DEBUG", "red") + "]"
         captured_output += "\n"

From 57af5b1e01df7559fcfcc6185c37a34e4f348426 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 15 Dec 2024 20:20:57 -0500
Subject: [PATCH 70/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 1292a5142..32e729635 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -422,12 +422,23 @@ def _get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort:
     return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback])
 
 
+def colored_html(value: str, color: Optional[str] = None, attrs: Optional[list] = None) -> str:
+    if isinstance(value, str):
+        if isinstance(color, str) and color:
+            value = f"<span style='color:{color}'>{value}</span>"
+        if isinstance(attrs, list):
+            if "bold" in attrs:
+                value = f"<b>{value}</b>"
+    return value
+
+
 def _terminal_color(value: str,
                     color: Optional[str] = None,
                     dark: bool = False,
                     bold: bool = False,
                     underline: bool = False,
                     nocolor: bool = False) -> str:
+    colored = colored_html
     # This is used only for troubleshooting by
     if nocolor is True:
         return value

From 4a212d1384a5c0d5d4dad341062b1adc00267949 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Sun, 15 Dec 2024 21:15:25 -0500
Subject: [PATCH 71/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary/recent_files_summary.py |   3 +-
 .../recent_files_summary_troubleshooting.py      | 101 +++++-------------
 2 files changed, 28 insertions(+), 76 deletions(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
index 74903ef62..f90d577c6 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py
@@ -41,8 +41,7 @@ def recent_files_summary_endpoint(context, request):
     text = request_arg_bool(request, "text")
     results = recent_files_summary(request, troubleshooting=text)
     if text:
-        text_debug = request_arg_bool(request, "text_debug")
-        results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, debug=text_debug)
+        results = get_normalized_aggregation_results_as_html_for_troublehshooting(results)
{results}
", content_type='text/html') return results diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index 32e729635..31df69c37 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -1,10 +1,6 @@ -import builtins from contextlib import contextmanager from copy import deepcopy -import re from pyramid.request import Request as PyramidRequest -from io import StringIO -import os from termcolor import colored from typing import Any, Callable, List, Optional, Tuple, Union from unittest.mock import patch as patch @@ -112,13 +108,10 @@ def annotate_with_uuids(normalized_results: dict): pass -def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, debug: bool = False): - os.environ["TERM"] = "xterm-256color" - with _capture_output_to_html_string() as captured_output: +def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict): + with _capture_output_to_html() as captured_output: print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True) - if debug is True: - return captured_output.text - return captured_output.html + return captured_output.text def print_normalized_aggregation_results_for_troubleshooting(normalized_results: dict, @@ -438,7 +431,6 @@ def _terminal_color(value: str, bold: bool = False, underline: bool = False, nocolor: bool = False) -> str: - colored = colored_html # This is used only for troubleshooting by if nocolor is True: return value @@ -455,74 +447,35 @@ def _terminal_color(value: str, @contextmanager -def _capture_output_to_html_string(): - - def ansi_to_html(text): - ANSI_COLOR_MAP = { - "30": "black", - "31": "red", - "32": "green", - "33": "yellow", - "34": "blue", - "35": "magenta", - "36": "cyan", - "37": "white", - "90": "bright_black", - "91": "bright_red", - "92": "bright_green", - "93": "bright_yellow", - "94": "bright_blue", - "95": "bright_magenta", - "96": "bright_cyan", - "97": "bright_white", - } - ANSI_ESCAPE_RE = re.compile(r"\x1b\[([0-9;]*)m") - bold_active = False - def replace_ansi(match): # noqa - nonlocal bold_active - codes = match.group(1).split(";") # Split multiple codes (e.g., "1;31") - html_parts = [] - for code in codes: - if code == "1": # Bold - if not bold_active: # Activate bold - html_parts.append("") - bold_active = True - elif code in ANSI_COLOR_MAP: # Colors - color = ANSI_COLOR_MAP[code] - html_parts.append(f"") - elif code == "0": # Reset - if bold_active: - html_parts.append("") - bold_active = False - html_parts.append("") # Close color - return "".join(html_parts) - text_with_html = ANSI_ESCAPE_RE.sub(replace_ansi, text) - if bold_active: - text_with_html += "" - return f"
-        return f"<pre>{text_with_html}</pre>"
" - - #captured_output = StringIO() +def _capture_output_to_html(): + + def html_color(value: str, + color: Optional[str] = None, + dark: bool = False, + bold: bool = False, + underline: bool = False, + nocolor: bool = False) -> str: + if (nocolor is not True) and isinstance(value, str): + if isinstance(color, str) and color: + if dark is True: + value = f"{value}" + else: + value = f"{value}" + if bold is True: + value = f"{value}" + if underline is True: + value = f"{value}" + return value + captured_output = "" - # print_original = builtins.print class CapturedOutput: # noqa - def __init__(self, captured_output: StringIO): - self._captured_output = captured_output @property # noqa def text(self): nonlocal captured_output return captured_output - # return self._captured_output.getvalue() - @property # noqa - def html(self): - nonlocal captured_output - return ansi_to_html(captured_output) - # return ansi_to_html(self._captured_output.getvalue()) def captured_print(*args, **kwargs): # noqa - # nonlocal captured_output, print_original - # print_original(*args, **kwargs, file=captured_output) nonlocal captured_output - os.environ["TERM"] = "xterm-256color" - captured_output += str(args[0]) + "[" + colored("DEBUG", "red") + "]" - captured_output += "\n" - with patch("encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting.print", captured_print): - yield CapturedOutput(captured_output) + captured_output += str(args[0]) + "\n" + this_module = "encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting" + with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", html_color)): + yield CapturedOutput() From bbddc5ce77c6a98abe223ea9a2842a1a0a38447e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 21:24:01 -0500 Subject: [PATCH 72/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary/recent_files_summary.py | 3 ++- .../recent_files_summary_troubleshooting.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py index f90d577c6..74903ef62 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py @@ -41,7 +41,8 @@ def recent_files_summary_endpoint(context, request): text = request_arg_bool(request, "text") results = recent_files_summary(request, troubleshooting=text) if text: - results = get_normalized_aggregation_results_as_html_for_troublehshooting(results) + text_debug = request_arg_bool(request, "text_debug") + results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, debug=text_debug) results = PyramidResponse(f"
{results}
", content_type='text/html') return results diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index 31df69c37..f4796f0a0 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -108,8 +108,8 @@ def annotate_with_uuids(normalized_results: dict): pass -def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict): - with _capture_output_to_html() as captured_output: +def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, debug: bool = False): + with _capture_output_to_html(debug=debug) as captured_output: print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True) return captured_output.text @@ -447,7 +447,7 @@ def _terminal_color(value: str, @contextmanager -def _capture_output_to_html(): +def _capture_output_to_html(debug: bool = False): def html_color(value: str, color: Optional[str] = None, @@ -477,5 +477,9 @@ def captured_print(*args, **kwargs): # noqa nonlocal captured_output captured_output += str(args[0]) + "\n" this_module = "encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting" - with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", html_color)): - yield CapturedOutput() + if debug is True: + with patch(f"{this_module}.print", captured_print): + yield CapturedOutput() + else: + with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", html_color)): + yield CapturedOutput() From 142626bd6c4e00aaa701195f8cd1fbc34df07d83 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 15 Dec 2024 21:32:02 -0500 Subject: [PATCH 73/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary/recent_files_summary.py | 9 ++++++++- .../recent_files_summary_troubleshooting.py | 11 +++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py index 74903ef62..7b9d712ee 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py @@ -41,8 +41,15 @@ def recent_files_summary_endpoint(context, request): text = request_arg_bool(request, "text") results = recent_files_summary(request, troubleshooting=text) if text: + text_verbose = request_arg_bool(request, "text_verbose") + text_uuids = request_arg_bool(request, "text_uuids", True) + text_uuid_details = request_arg_bool(request, "text_uuid_details", True) text_debug = request_arg_bool(request, "text_debug") - results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, debug=text_debug) + results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, + uuids=text_uuids, + uuid_details=text_uuid_details, + verbose=text_verbose, + debug=text_debug) results = PyramidResponse(f"
{results}
", content_type='text/html') return results diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index f4796f0a0..dfe0d06f4 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -108,9 +108,16 @@ def annotate_with_uuids(normalized_results: dict): pass -def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, debug: bool = False): +def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, + uuids: bool = True, + uuid_details: bool = True, + verbose: bool = False, + debug: bool = False): with _capture_output_to_html(debug=debug) as captured_output: - print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=True, uuid_details=True) + print_normalized_aggregation_results_for_troubleshooting(normalized_results, + uuids=uuids, + uuid_details=uuid_details, + verbose=verbose) return captured_output.text From dde4db5fc2acc77c0d2bffa90ad019bca70e7537 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 16 Dec 2024 12:14:51 -0500 Subject: [PATCH 74/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary.py | 4 +- .../recent_files_summary_troubleshooting.py | 84 ++++++++++++++----- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py index 7b9d712ee..4c72af2ab 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py @@ -41,13 +41,15 @@ def recent_files_summary_endpoint(context, request): text = request_arg_bool(request, "text") results = recent_files_summary(request, troubleshooting=text) if text: - text_verbose = request_arg_bool(request, "text_verbose") text_uuids = request_arg_bool(request, "text_uuids", True) text_uuid_details = request_arg_bool(request, "text_uuid_details", True) + text_query = request_arg_bool(request, "text_query") + text_verbose = request_arg_bool(request, "text_verbose") text_debug = request_arg_bool(request, "text_debug") results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, uuids=text_uuids, uuid_details=text_uuid_details, + query=text_query, verbose=text_verbose, debug=text_debug) results = PyramidResponse(f"
{results}
", content_type='text/html') diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index dfe0d06f4..3ed42724e 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -54,7 +54,12 @@ def dedup_list(data: list) -> list: # noqa ]) def annotate_with_uuids(normalized_results: dict): + + def get_unique_release_tracker_description_values(normalized_results: dict) -> List[str]: + return _get_properties(normalized_results, "items.items.items.value") + nonlocal aggregation_fields_for_troubleshooting + unique_release_tracker_description_values = get_unique_release_tracker_description_values(normalized_results) uuid_records = [] query = normalized_results.get("query") if isinstance(normalized_results.get("debug"), dict): @@ -67,7 +72,19 @@ def annotate_with_uuids(normalized_results: dict): for second_item in first_item["items"]: second_property_name = second_item["name"] second_property_value = second_item["value"] - for third_item in second_item["items"]: + second_item_items = second_item["items"] + # Put dummy elements in for AGGREGATION_FIELD_FILE_DESCRIPTOR items values which do not exist. + third_item_values = [third_item["value"] for third_item in second_item_items] + for unique_release_tracker_description_value in unique_release_tracker_description_values: + if unique_release_tracker_description_value not in third_item_values: + second_item["items"].append({ + "name": AGGREGATION_FIELD_FILE_DESCRIPTOR, + "value": unique_release_tracker_description_value, + "count": 0, + "debug_placeholder": True + }) + third_items_to_delete = [] + for third_item in second_item_items: third_property_name = third_item["name"] third_property_value = third_item["value"] if debug_elasticsearch_hits := third_item.get("debug_elasticsearch_hits"): @@ -97,6 +114,12 @@ def annotate_with_uuids(normalized_results: dict): uuid_records.append(uuid_record) if third_item.get("debug", {}).get("portal_hits"): third_item["debug"]["portal_hits"].sort(key=lambda item: item.get("uuid")) + if ((third_item.get("count") == 0) and (third_item.get("debug_placeholder") is True) and + (not third_item.get("debug", {}).get("elasticsearch_hits")) and (not third_item.get("debug", {}).get("portal_hits"))): + third_items_to_delete.append(third_item) + if third_items_to_delete: + for third_item in third_items_to_delete: + second_item_items.remove(third_item) for uuid_record in uuid_records: if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1: @@ -111,12 +134,14 @@ def annotate_with_uuids(normalized_results: dict): def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict, uuids: bool = True, uuid_details: bool = True, + query: bool = False, verbose: bool = False, debug: bool = False): with _capture_output_to_html(debug=debug) as captured_output: print_normalized_aggregation_results_for_troubleshooting(normalized_results, uuids=uuids, uuid_details=uuid_details, + query=query, verbose=verbose) return captured_output.text @@ -186,7 +211,7 @@ def print_results(data: dict, def get_portal_hits(data: dict) -> List[dict]: hits = [] - if isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): + if isinstance(data, dict) and isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): for portal_hit in portal_hits: if 
                 if isinstance(portal_hit, dict) and isinstance(uuid := portal_hit.get("uuid"), str) and uuid:
                     hits.append(portal_hit)
         return hits
 
@@ -314,9 +339,21 @@ def print_hit_property_values(hit: dict, property_name: str,
             note = f" {chars_check}"
         elif checks:
             note = f" {chars_check}"
+        if not ((count == 0) and (len(hits) == 0) and (not note)):
+            if ((grouping_name in aggregation_fields) and
+                (len(hits) == 0) and isinstance(items := data.get("items"), list)):
+                # Count the actual hits for this noted aggregation field group.
+                items_nhits = 0
+                for item in items:
+                    items_nhits += len(get_portal_hits(item))
+                if items_nhits > count:
+                    note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}")
         print(f"{spaces}{grouping}: {count}{note}")
-    if (query is True) and (query_string := data.get("query")):
-        print(f"{spaces}  {query_string}")
+        if (query is True) and (query_string := data.get("query")):
+            if _terminal_color == _html_color:
+                print(f"{spaces}  {query_string}")
+            else:
+                print(f"{spaces}  {query_string}")
 
+def _html_color(value: str,
+                color: Optional[str] = None,
+                dark: bool = False,
+                bold: bool = False,
+                underline: bool = False,
+                nocolor: bool = False) -> str:
+    if (nocolor is not True) and isinstance(value, str):
+        if isinstance(color, str) and color:
+            if dark is True:
+                value = f"<span style='color:dark{color}'>{value}</span>"
+            else:
+                value = f"<span style='color:{color}'>{value}</span>"
+        if bold is True:
+            value = f"<b>{value}</b>"
+        if underline is True:
+            value = f"<u>{value}</u>"
+    return value
+
+
 @contextmanager
 def _capture_output_to_html(debug: bool = False):
 
-    def html_color(value: str,
-                   color: Optional[str] = None,
-                   dark: bool = False,
-                   bold: bool = False,
-                   underline: bool = False,
-                   nocolor: bool = False) -> str:
-        if (nocolor is not True) and isinstance(value, str):
-            if isinstance(color, str) and color:
-                if dark is True:
-                    value = f"<span style='color:dark{color}'>{value}</span>"
-                else:
-                    value = f"<span style='color:{color}'>{value}</span>"
-            if bold is True:
-                value = f"<b>{value}</b>"
-            if underline is True:
-                value = f"<u>{value}</u>"
-        return value
-
     captured_output = ""
     class CapturedOutput:  # noqa
         @property  # noqa
         def text(self):
             nonlocal captured_output
             return captured_output
     def captured_print(*args, **kwargs):  # noqa
         nonlocal captured_output
         captured_output += str(args[0]) + "\n"
     this_module = "encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting"
     if debug is True:
         with patch(f"{this_module}.print", captured_print):
             yield CapturedOutput()
     else:
-        with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", html_color)):
+        with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", _html_color)):
             yield CapturedOutput()

From 397aeb1fecc5641982651f3e871481f074ee7f7a Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Mon, 16 Dec 2024 12:47:43 -0500
Subject: [PATCH 75/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 39 +++++++++++++------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 3ed42724e..2ff86d486 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -114,8 +114,10 @@ def annotate_with_uuids(normalized_results: dict):
                         if third_item.get("debug", {}).get("portal_hits"):
item.get("uuid")) - if ((third_item.get("count") == 0) and (third_item.get("debug_placeholder") is True) and - (not third_item.get("debug", {}).get("elasticsearch_hits")) and (not third_item.get("debug", {}).get("portal_hits"))): + if ((third_item.get("count") == 0) and + (third_item.get("debug_placeholder") is True) and + (not third_item.get("debug", {}).get("elasticsearch_hits")) and + (not third_item.get("debug", {}).get("portal_hits"))): # noqa third_items_to_delete.append(third_item) if third_items_to_delete: for third_item in third_items_to_delete: @@ -217,6 +219,24 @@ def get_portal_hits(data: dict) -> List[dict]: hits.append(portal_hit) return hits + def count_unique_portal_hits_recursively(data: dict) -> int: + def get_portal_hits_recursively(data: dict) -> List[dict]: # noqa + hits = [] + if isinstance(data, dict): + for key in data: + if key == "portal_hits": + if isinstance(data[key], list): + hits.extend(data[key]) + else: + hits.extend(get_portal_hits_recursively(data[key])) + elif isinstance(data, list): + for element in data: + hits.extend(get_portal_hits_recursively(element)) + return hits + hits = get_portal_hits_recursively(data) + hits = [hit.get("uuid") for hit in hits] + return len(set(hits)) + def format_hit_property_values(hit: dict, property_name: str, color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]: nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow @@ -324,8 +344,8 @@ def print_hit_property_values(hit: dict, property_name: str, grouping = "RESULTS" grouping = f"{chars_diamond} {grouping}" hits = get_portal_hits(data) if (uuids is True) else [] + note = "" if isinstance(count := data.get("count"), int): - note = "" if len(hits) > count: note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") elif isinstance(items := data.get("items"), list): @@ -340,14 +360,10 @@ def print_hit_property_values(hit: dict, property_name: str, elif checks: note = f" {chars_check}" if not ((count == 0) and (len(hits) == 0) and (not note)): - if ((grouping_name in aggregation_fields) and - (len(hits) == 0) and isinstance(items := data.get("items"), list)): - # Count the actual hits for this noted aggregation field group. - items_nhits = 0 - for item in items: - items_nhits += len(get_portal_hits(item)) - if items_nhits > count: - note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}") + if (len(hits) == 0) and isinstance(items := data.get("items"), list): + # Count the actual hits for this non-terminal group. 
+                if (items_nhits := count_unique_portal_hits_recursively(items)) > count:
+                    note += red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}")
         print(f"{spaces}{grouping}: {count}{note}")
         if (query is True) and (query_string := data.get("query")):
@@ -372,7 +389,6 @@ def print_hit_property_values(hit: dict, property_name: str,
     for hit in hits:
         if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid:
-            note = ""
             if hit.get("elasticsearch_counted") is False:
                 print(red(f"{spaces}  {chars_dot} {uuid} {chars_xmark} UNCOUNTED"))
                 color = red_bold
             else:

From c27f1a5e55efad282773f05aa944e16e9bcd2f16 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Mon, 16 Dec 2024 16:12:45 -0500
Subject: [PATCH 76/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 9 +++++----
 src/encoded/metadata.py                     | 5 +++++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 2ff86d486..0a6dec01a 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -81,6 +81,7 @@ def get_unique_release_tracker_description_values(normalized_results: dict) -> List[str]:
                                 "name": AGGREGATION_FIELD_FILE_DESCRIPTOR,
                                 "value": unique_release_tracker_description_value,
                                 "count": 0,
+                                "elasticsearch_counted": False,
                                 "debug_placeholder": True
                             })
@@ -241,7 +242,7 @@ def format_hit_property_values(hit: dict, property_name: str,
                                    color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]:
         nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow
         counted_elsewhere = []
-        if hit.get("elasticsearch_counted") is False:
+        if hit.get("elasticsearch_counted", False) is False:
             counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid"))
         else:
             counted_grouping_name, counted_grouping_value = (None, None)
@@ -260,7 +261,7 @@ def format_hit_property_values(hit: dict, property_name: str,
                 else:
                     property_values.append(property_value)
             property_value = ", ".join(property_values)
-        elif hit.get("elasticsearch_counted") is False:
+        elif hit.get("elasticsearch_counted", False) is False:
             counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid"))
             if (counted_grouping_name == property_name) and (counted_grouping_value == property_value):
                 property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE")
@@ -292,7 +293,7 @@ def find_where(data: dict, uuid: str,
         elif isinstance(hits := data.get("debug", {}).get("portal_hits"), list):
             for hit in hits:
                 if hit.get("uuid") == uuid:
-                    if hit.get("elasticsearch_counted") is True:
+                    if hit.get("elasticsearch_counted", False) is True:
                         found_uuid_grouping_names_and_values.add((parent_grouping_name, parent_grouping_value))
         return found_uuid_grouping_names_and_values
 
@@ -373,7 +374,7 @@ def print_hit_property_values(hit: dict, property_name: str,
     for hit in hits:
         if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid:
-            if hit.get("elasticsearch_counted") is False:
+            if hit.get("elasticsearch_counted", False) is False:
                 print(red(f"{spaces}  {chars_dot} {uuid} {chars_xmark} UNCOUNTED"))
                 color = red_bold
             else:
diff --git a/src/encoded/metadata.py b/src/encoded/metadata.py
index 6a4badf79..4eba39721 100644
--- a/src/encoded/metadata.py
+++ b/src/encoded/metadata.py
@@ -287,6 +287,11 @@ def peek_metadata(context, request):
     """ Helper for the UI that will retrieve faceting information about data retrieved from /metadata """
     # get arguments from helper
     args = handle_metadata_arguments(context, request)
+    if isinstance(args, Response):
+        # dmichaels/2024-12-16: Hackish fix for now; handle_metadata_arguments not returning MetadataArgs for ...
+        subreq = make_search_subreq(request, '{}?{}'.format('/search', urlencode(request.params, True)), inherit_user=True)
+        result = search(context, subreq)
+        return result['facets']
 
     # Generate search
     search_param = {}

From c5e275a57b3ac46a235458214f08842aa1e671b4 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Mon, 16 Dec 2024 16:23:52 -0500
Subject: [PATCH 77/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index 0a6dec01a..d73c3acd9 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -347,7 +347,7 @@ def print_hit_property_values(hit: dict, property_name: str,
     hits = get_portal_hits(data) if (uuids is True) else []
     note = ""
     if isinstance(count := data.get("count"), int):
-        if len(hits) > count:
+        if (len(hits) > count) and (uuids is True):
             note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}")
         elif isinstance(items := data.get("items"), list):
             subcount = 0
@@ -363,7 +363,7 @@ def print_hit_property_values(hit: dict, property_name: str,
         if not ((count == 0) and (len(hits) == 0) and (not note)):
             if (len(hits) == 0) and isinstance(items := data.get("items"), list):
                 # Count the actual hits for this non-terminal group.
-                if (items_nhits := count_unique_portal_hits_recursively(items)) > count:
+                if ((items_nhits := count_unique_portal_hits_recursively(items)) > count) and (uuids is True):
                     note += red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}")

From 56ec43525941fecf50b3ec1d18dc1f3a168c3b07 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Mon, 16 Dec 2024 19:15:39 -0500
Subject: [PATCH 78/78] refactoring /recent_files_summary endpoint

---
 .../recent_files_summary_troubleshooting.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
index d73c3acd9..7c71391fe 100644
--- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -349,6 +349,8 @@ def print_hit_property_values(hit: dict, property_name: str,
         if (len(hits) > count) and (uuids is True):
             note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}")
+        if count == 0:
+            note = red(f' {chars_rarrow_hollow} UNCOUNTED') + note
         elif isinstance(items := data.get("items"), list):
- if (items_nhits := count_unique_portal_hits_recursively(items)) > count: + if ((items_nhits := count_unique_portal_hits_recursively(items)) > count) and (uuids is True): note += red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}") print(f"{spaces}{grouping}: {count}{note}") if (query is True) and (query_string := data.get("query")): From 56ec43525941fecf50b3ec1d18dc1f3a168c3b07 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 16 Dec 2024 19:15:39 -0500 Subject: [PATCH 78/78] refactoring /recent_files_summary endpoint --- .../recent_files_summary_troubleshooting.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py index d73c3acd9..7c71391fe 100644 --- a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py @@ -349,6 +349,8 @@ def print_hit_property_values(hit: dict, property_name: str, if isinstance(count := data.get("count"), int): if (len(hits) > count) and (uuids is True): note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + if count == 0: + note = red(f' {chars_rarrow_hollow} UNCOUNTED') + note elif isinstance(items := data.get("items"), list): subcount = 0 for item in items: @@ -365,6 +367,8 @@ def print_hit_property_values(hit: dict, property_name: str, # Count the actual hits for this non-terminal group. if ((items_nhits := count_unique_portal_hits_recursively(items)) > count) and (uuids is True): note += red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}") + if count == 0: + note = red(f' {chars_rarrow_hollow} UNCOUNTED') + note print(f"{spaces}{grouping}: {count}{note}") if (query is True) and (query_string := data.get("query")): if _terminal_color == _html_color: