diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py
index 733e04e4..0aad7cae 100644
--- a/bd_api/apps/api/v1/models.py
+++ b/bd_api/apps/api/v1/models.py
@@ -3,6 +3,7 @@
 import json
 from collections import defaultdict
 from datetime import datetime
+from math import log10
 from uuid import uuid4
 
 from django.core.exceptions import ValidationError
@@ -551,6 +552,14 @@ def full_slug(self):
             return f"{self.organization.area.slug}_{self.organization.slug}_{self.slug}"
         return f"{self.organization.slug}_{self.slug}"
 
+    @property
+    def popularity(self):
+        if not self.page_views:
+            return 0.0
+        if self.page_views < 1:
+            return 0.0
+        return log10(self.page_views)
+
     @property
     def coverage(self):
         """Get the temporal coverage of the dataset in the format YYYY-MM-DD - YYYY-MM-DD"""
@@ -692,9 +701,16 @@ def full_coverage(self) -> str:
         return json.dumps(full_coverage_dict)
 
     @property
-    def contains_tables(self):
-        """Returns true if there are tables in the dataset"""
-        return len(self.tables.all()) > 0
+    def contains_open_data(self):
+        """Returns true if there are tables or columns with open coverages"""
+        open_data = False
+        tables = self.tables.all()
+        for table in tables:
+            table_coverages = table.coverages.filter(is_closed=False)
+            if table_coverages:
+                open_data = True
+                break
+        return open_data
 
     @property
     def contains_closed_data(self):
@@ -714,17 +730,9 @@ def contains_closed_data(self):
         return closed_data
 
     @property
-    def contains_open_data(self):
-        """Returns true if there are tables or columns with open coverages"""
-        open_data = False
-        tables = self.tables.all()
-        for table in tables:
-            table_coverages = table.coverages.filter(is_closed=False)
-            if table_coverages:
-                open_data = True
-                break
-
-        return open_data
+    def contains_tables(self):
+        """Returns true if there are tables in the dataset"""
+        return len(self.tables.all()) > 0
 
     @property
     def contains_closed_tables(self):
@@ -1042,8 +1050,7 @@ def neighbors(self) -> list[dict]:
         )
         all_neighbors = []
         for table in all_tables:
-            score_area = self.get_similarity_of_area(table)
-            score_datetime = self.get_similarity_of_datetime(table)
+            score_popularity = table.dataset.popularity
             score_directory, columns = self.get_similarity_of_directory(table)
             if not score_directory:
                 continue
@@ -1060,7 +1067,7 @@ def neighbors(self) -> list[dict]:
                     "table_name": table.name,
                     "dataset_id": str(table.dataset.id),
                     "dataset_name": table.dataset.name,
-                    "score": round(score_area + score_datetime + score_directory, 2),
+                    "score": round(score_directory, 2) + score_popularity,
                 }
             )
         return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
@@ -1089,13 +1096,21 @@ def get_similarity_of_datetime(self, other: "Table"):
         return count_yes / count_all if count_all else 0
 
     def get_similarity_of_directory(self, other: "Table"):
-        self_cols = self.columns.all()
-        self_dirs = self.columns.filter(directory_primary_key__isnull=False).all()
-        other_cols = other.columns.all()
-        other_dirs = other.columns.filter(directory_primary_key__isnull=False).all()
+        self_dirs = (
+            self.columns
+            .filter(directory_primary_key__isnull=False)
+            .exclude(table__dataset__slug="diretorios_data_tempo")
+            .all()
+        )  # fmt: skip
+        other_dirs = (
+            other.columns
+            .filter(directory_primary_key__isnull=False)
+            .exclude(table__dataset__slug="diretorios_data_tempo")
+            .all()
+        )  # fmt: skip
         intersection = set([*self_dirs, *other_dirs])
         intersection_size = len(intersection)
-        intersection_max_size = min(len(self_cols), len(other_cols))
+        intersection_max_size = len(self_dirs) or 1
         return intersection_size / intersection_max_size, intersection
 
     def clean(self):
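
Not part of the patch: a minimal standalone sketch of how the reworked neighbor scoring behaves once this diff is applied, with all Django/ORM details stripped out. The helper names below (`popularity`, `neighbor_score`) are illustrative only, chosen to mirror the new `Dataset.popularity` property and the new `"score"` expression in `neighbors`: directory similarity is rounded to two decimals and the dataset's log10 page-view popularity is added on top, so heavily viewed datasets rank higher among tables that share directory columns.

```python
from math import log10


def popularity(page_views):
    """Mirrors the new Dataset.popularity: log10 of page views, 0.0 when missing or < 1."""
    if not page_views or page_views < 1:
        return 0.0
    return log10(page_views)


def neighbor_score(score_directory, page_views):
    """Mirrors the new neighbor score: rounded directory similarity plus popularity."""
    return round(score_directory, 2) + popularity(page_views)


# With equal directory overlap, the more-viewed dataset now ranks higher.
assert neighbor_score(0.5, 10_000) > neighbor_score(0.5, 100)
```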