From 5bde105606d1ff987bb583a5cf81d0ba39aeaac4 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Wed, 14 Feb 2024 14:54:58 -0300 Subject: [PATCH 1/2] feat:rank and filter table/column neighbors --- bd_api/apps/api/v1/models.py | 149 +++++++++++++++++++++++++++-------- bd_api/custom/logger.py | 2 +- 2 files changed, 117 insertions(+), 34 deletions(-) diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 569cfda8..9f403882 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -10,6 +10,7 @@ from django.core.exceptions import ValidationError from django.db import models from django.urls import reverse +from loguru import logger from ordered_model.models import OrderedModel from bd_api.apps.account.models import Account @@ -217,6 +218,24 @@ def coverage_type(self): coverage_type.short_description = "Coverage Type" + def has_area_intersection(self, other: "Coverage"): + if not self.area: + return False + if not other.area: + return False + if self.area.name.startswith(other.area.name): + return True + if other.area.name.startswith(self.area.name): + return True + return False + + def has_datetime_intersection(self, other: "Coverage"): + for dt_self in self.datetime_ranges.all(): + for dt_other in other.datetime_ranges.all(): + if dt_self.has_datetime_intersection(dt_other): + return True + return False + def clean(self) -> None: """ Assert that only one of "table", "raw_data_source", @@ -893,10 +912,8 @@ def clean(self) -> None: "One and only one of 'table', " "'raw_data_source', or 'information_request' must be set." ) - if self.entity.category.slug != "datetime": raise ValidationError("Entity's category is not in category.slug = `datetime`.") - return super().clean() @@ -1122,17 +1139,19 @@ def get_graphql_full_coverage(self): def neighbors(self): """Similiar tables and columns - Tables and columns with similar directories - - Tables and columns with similar coverages or tags (WIP) + - Tables and columns with similar coverages or tags """ all_neighbors = [] for column in self.columns.all(): for neighbor in column.neighbors: all_neighbors.append(neighbor) - return all_neighbors + if len(all_neighbors) >= 20: + break + return sorted(all_neighbors, key=lambda item: item[3])[::-1] def get_graphql_neighbors(self) -> list[dict]: all_neighbors = [] - for column, table, dataset in self.neighbors: + for column, table, dataset, _ in self.neighbors: all_neighbors.append( { "column_id": str(column.id), @@ -1153,6 +1172,30 @@ def last_updated_at(self): def get_graphql_last_updated_at(self): return self.last_updated_at + def has_area_intersection(self, other: "Table"): + for cov_self in self.coverages.all(): + for cov_other in other.coverages.all(): + if cov_self.has_area_intersection(cov_other): + logger.debug(f"[table_neighbor_by_area] {self.name} => {other.name}") + return True + return False + + def has_datetime_intersection(self, other: "Table"): + for cov_self in self.coverages.all(): + for cov_other in other.coverages.all(): + if cov_self.has_datetime_intersection(cov_other): + logger.debug(f"[table_neighbor_by_date] {self.name} => {other.name}") + return True + return False + + def has_directory_intersection(self, other: "Table"): + for col_self in self.columns.all(): + for col_other in other.columns.all(): + if col_self.has_directory_intersection(col_other): + logger.debug(f"[table_neighbor_by_dire] {self.name} => {other.name}") + return True + return False + def clean(self): """ Clean method for Table model @@ -1275,24 +1318,6 @@ class Meta: verbose_name_plural = "Columns" ordering = ["name"] - def clean(self) -> None: - """Clean method for Column model""" - errors = {} - if self.observation_level and self.observation_level.table != self.table: - errors[ - "observation_level" - ] = "Observation level is not in the same table as the column." - - if self.directory_primary_key and self.directory_primary_key.table.is_directory is False: - errors[ - "directory_primary_key" - ] = "Column indicated as a directory's primary key is not in a directory." - - if errors: - raise ValidationError(errors) - - return super().clean() - @property def full_coverage(self) -> str: """ @@ -1349,7 +1374,7 @@ def get_graphql_full_coverage(self): def neighbors(self): """Similiar tables and columns - Columns with similar directories - - Columns with similar coverages or tags (WIP) + - Columns with similar coverages or tags """ if not self.directory_primary_key: return [] @@ -1360,18 +1385,23 @@ def neighbors(self): ) all_neighbors = [] for column in all_columns: - all_neighbors.append( - ( - column, - column.table, - column.table.dataset, - ) - ) - return all_neighbors + if self.table.has_area_intersection(column.table): + if self.table.has_datetime_intersection(column.table): + all_neighbors.append( + ( + column, + column.table, + column.table.dataset, + column.table.dataset.page_views, + ) + ) + if len(all_neighbors) >= 20: + break + return sorted(all_neighbors, key=lambda item: item[3])[::-1] def get_graphql_neighbors(self) -> list[dict]: all_neighbors = [] - for column, table, dataset in self.neighbors: + for column, table, dataset, _ in self.neighbors: all_neighbors.append( { "column_id": str(column.id), @@ -1384,6 +1414,26 @@ def get_graphql_neighbors(self) -> list[dict]: ) return get_unique_list(all_neighbors) + def has_directory_intersection(self, other: "Column"): + if self.directory_primary_key == other.directory_primary_key: + return True + return False + + def clean(self) -> None: + """Clean method for Column model""" + errors = {} + if self.observation_level and self.observation_level.table != self.table: + errors[ + "observation_level" + ] = "Observation level is not in the same table as the column." + if self.directory_primary_key and self.directory_primary_key.table.is_directory is False: + errors[ + "directory_primary_key" + ] = "Column indicated as a directory's primary key is not in a directory." + if errors: + raise ValidationError(errors) + return super().clean() + class ColumnOriginalName(BaseModel): """Model definition for ColumnOriginalName.""" @@ -1806,6 +1856,39 @@ class Meta: verbose_name_plural = "DateTime Ranges" ordering = ["id"] + @property + def since(self): + if self.start_year: + return datetime( + self.start_year, + self.start_month or 1, + self.start_day or 1, + self.start_hour or 0, + self.start_minute or 0, + self.start_second or 0, + ) + + @property + def until(self): + if self.end_year: + return datetime( + self.end_year, + self.end_month or 1, + self.end_day or 1, + self.end_hour or 0, + self.end_minute or 0, + self.end_second or 0, + ) + + def has_datetime_intersection(self, other: "DateTimeRange"): + if not self.since: + return False + if not other.until: + return False + if self.until >= other.since: + return True + return False + def clean(self) -> None: """ Assert that start_year <= end_year and start_month <= end_month diff --git a/bd_api/custom/logger.py b/bd_api/custom/logger.py index 5aadc9c7..5d3ebe1d 100644 --- a/bd_api/custom/logger.py +++ b/bd_api/custom/logger.py @@ -5,7 +5,7 @@ from loguru import logger -LOGGER_LEVEL = getenv("LOGGER_LEVEL", "INFO") +LOGGER_LEVEL = getenv("LOGGER_LEVEL", "DEBUG") LOGGER_IGNORE = getenv("LOGGER_IGNORE", "").split(",") LOGGER_SERIALIZE = bool(getenv("LOGGER_SERIALIZE", False)) LOGGER_FORMAT = "[{time:YYYY-MM-DD HH:mm:ss}] {message}" From bcfcfd1393c03a940541ba7286df948056964cfb Mon Sep 17 00:00:00 2001 From: Vinicius Date: Fri, 16 Feb 2024 09:57:16 -0300 Subject: [PATCH 2/2] feat: turn into similarity ranking --- bd_api/apps/api/v1/models.py | 172 +++++++++++++++-------------------- 1 file changed, 75 insertions(+), 97 deletions(-) diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 9f403882..2a05cd76 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -66,11 +66,6 @@ def get_date_time(date_times): ) -def get_unique_list(lst: list[dict]): - """Get unique list of dicts""" - return [dict(t) for t in {tuple(d.items()) for d in lst}] - - class UUIDHiddenIdForm(forms.ModelForm): """Form to include UUID in inline formes (Table, Column and Coverage)""" @@ -218,23 +213,23 @@ def coverage_type(self): coverage_type.short_description = "Coverage Type" - def has_area_intersection(self, other: "Coverage"): + def similarity_of_area(self, other: "Coverage"): if not self.area: - return False + return 0 if not other.area: - return False + return 0 if self.area.name.startswith(other.area.name): - return True + return 1 if other.area.name.startswith(self.area.name): - return True - return False + return 1 + return 0 - def has_datetime_intersection(self, other: "Coverage"): + def similarity_of_datetime(self, other: "Coverage"): for dt_self in self.datetime_ranges.all(): for dt_other in other.datetime_ranges.all(): - if dt_self.has_datetime_intersection(dt_other): - return True - return False + if dt_self.similarity_of_datetime(dt_other): + return 1 + return 0 def clean(self) -> None: """ @@ -1141,28 +1136,54 @@ def neighbors(self): - Tables and columns with similar directories - Tables and columns with similar coverages or tags """ + all_tables = ( + Table.objects.exclude(id=self.id) + .exclude(is_directory=True) + .exclude(status__slug__in=["under_review"]) + .filter(columns__directory_primary_key__isnull=False) + .distinct() + .all() + ) all_neighbors = [] - for column in self.columns.all(): - for neighbor in column.neighbors: - all_neighbors.append(neighbor) - if len(all_neighbors) >= 20: - break - return sorted(all_neighbors, key=lambda item: item[3])[::-1] + for table in all_tables: + score_area = self.similarity_of_area(table) + score_datetime = self.similarity_of_datetime(table) + score_directory, cols = self.similarity_of_directory(table) + if score_directory: + all_neighbors.append( + ( + cols, + table, + table.dataset, + score_area + score_datetime + score_directory, + ) + ) + logger.debug(f"[similarity_area] {self} {table} {score_area}") + logger.debug(f"[similarity_datetime] {self} {table} {score_datetime}") + logger.debug(f"[similarity_directory] {self} {table} {score_directory}") + + return sorted(all_neighbors, key=lambda item: item[-1])[::-1][:20] def get_graphql_neighbors(self) -> list[dict]: all_neighbors = [] - for column, table, dataset, _ in self.neighbors: + for columns, table, dataset, score in self.neighbors: + column_id = [] + column_name = [] + for column in columns: + column_id.append(str(column.id)) + column_name.append(column.name) all_neighbors.append( { - "column_id": str(column.id), - "column_name": column.name, + "column_id": column_id, + "column_name": column_name, "table_id": str(table.id), "table_name": table.name, "dataset_id": str(dataset.id), "dataset_name": dataset.name, + "score": score, } ) - return get_unique_list(all_neighbors) + return all_neighbors @property def last_updated_at(self): @@ -1172,29 +1193,33 @@ def last_updated_at(self): def get_graphql_last_updated_at(self): return self.last_updated_at - def has_area_intersection(self, other: "Table"): + def similarity_of_area(self, other: "Table"): + count_all = 0 + count_yes = 0 for cov_self in self.coverages.all(): for cov_other in other.coverages.all(): - if cov_self.has_area_intersection(cov_other): - logger.debug(f"[table_neighbor_by_area] {self.name} => {other.name}") - return True - return False + count_all += 1 + count_yes += cov_self.similarity_of_area(cov_other) + return count_yes / count_all if count_all else 0 - def has_datetime_intersection(self, other: "Table"): + def similarity_of_datetime(self, other: "Table"): + count_all = 0 + count_yes = 0 for cov_self in self.coverages.all(): for cov_other in other.coverages.all(): - if cov_self.has_datetime_intersection(cov_other): - logger.debug(f"[table_neighbor_by_date] {self.name} => {other.name}") - return True - return False - - def has_directory_intersection(self, other: "Table"): - for col_self in self.columns.all(): - for col_other in other.columns.all(): - if col_self.has_directory_intersection(col_other): - logger.debug(f"[table_neighbor_by_dire] {self.name} => {other.name}") - return True - return False + count_all += 1 + count_yes += cov_self.similarity_of_datetime(cov_other) + return count_yes / count_all if count_all else 0 + + def similarity_of_directory(self, other: "Table"): + self_cols = self.columns.all() + self_dirs = self.columns.filter(directory_primary_key__isnull=False).all() + other_cols = other.columns.all() + other_dirs = other.columns.filter(directory_primary_key__isnull=False).all() + intersection = set([*self_dirs, *other_dirs]) + intersection_size = len(intersection) + intersection_max_size = min(len(self_cols), len(other_cols)) + return intersection_size / intersection_max_size, intersection def clean(self): """ @@ -1370,55 +1395,6 @@ def full_coverage(self) -> str: def get_graphql_full_coverage(self): return self.full_coverage - @property - def neighbors(self): - """Similiar tables and columns - - Columns with similar directories - - Columns with similar coverages or tags - """ - if not self.directory_primary_key: - return [] - all_columns = ( - Column.objects.filter(directory_primary_key=self.directory_primary_key) - .exclude(id=self.id) - .all() - ) - all_neighbors = [] - for column in all_columns: - if self.table.has_area_intersection(column.table): - if self.table.has_datetime_intersection(column.table): - all_neighbors.append( - ( - column, - column.table, - column.table.dataset, - column.table.dataset.page_views, - ) - ) - if len(all_neighbors) >= 20: - break - return sorted(all_neighbors, key=lambda item: item[3])[::-1] - - def get_graphql_neighbors(self) -> list[dict]: - all_neighbors = [] - for column, table, dataset, _ in self.neighbors: - all_neighbors.append( - { - "column_id": str(column.id), - "column_name": column.name, - "table_id": str(table.id), - "table_name": table.name, - "dataset_id": str(dataset.id), - "dataset_name": dataset.name, - } - ) - return get_unique_list(all_neighbors) - - def has_directory_intersection(self, other: "Column"): - if self.directory_primary_key == other.directory_primary_key: - return True - return False - def clean(self) -> None: """Clean method for Column model""" errors = {} @@ -1880,14 +1856,16 @@ def until(self): self.end_second or 0, ) - def has_datetime_intersection(self, other: "DateTimeRange"): + def similarity_of_datetime(self, other: "DateTimeRange"): if not self.since: - return False + return 0 if not other.until: - return False + return 0 + if self.since <= other.until: + return 1 if self.until >= other.since: - return True - return False + return 1 + return 0 def clean(self) -> None: """