Skip to content

Commit

Permalink
fix: improve neighbors score heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
vncsna committed Feb 27, 2024
1 parent d094d2f commit 15894d1
Showing 1 changed file with 37 additions and 22 deletions.
59 changes: 37 additions & 22 deletions bd_api/apps/api/v1/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
from collections import defaultdict
from datetime import datetime
from math import log10
from uuid import uuid4

from django.core.exceptions import ValidationError
Expand Down Expand Up @@ -551,6 +552,14 @@ def full_slug(self):
return f"{self.organization.area.slug}_{self.organization.slug}_{self.slug}"
return f"{self.organization.slug}_{self.slug}"

@property
def popularity(self) -> float:
    """Return a log-scaled popularity score derived from ``page_views``.

    The score is ``log10(page_views)``. A missing, zero, or below-one view
    count yields ``0.0``, so the logarithm is never taken of a non-positive
    number and the score is never negative.
    """
    views = self.page_views
    if not views or views < 1:
        return 0.0
    return log10(views)

@property
def coverage(self):
"""Get the temporal coverage of the dataset in the format YYYY-MM-DD - YYYY-MM-DD"""
Expand Down Expand Up @@ -692,9 +701,16 @@ def full_coverage(self) -> str:
return json.dumps(full_coverage_dict)

@property
def contains_tables(self) -> bool:
    """Return True if the dataset has at least one table."""
    # exists() answers the emptiness question directly instead of loading
    # every table row just to count them with len().
    return self.tables.all().exists()
def contains_open_data(self) -> bool:
    """Return True if any table of the dataset has a coverage that is not closed.

    Only table-level coverages are inspected; column coverages are not
    consulted by this method.
    """
    # exists() avoids fetching every open coverage row, and any() stops at
    # the first table that has one.
    return any(
        table.coverages.filter(is_closed=False).exists()
        for table in self.tables.all()
    )

@property
def contains_closed_data(self):
Expand All @@ -714,17 +730,9 @@ def contains_closed_data(self):
return closed_data

@property
def contains_open_data(self) -> bool:
    """Return True if any table of the dataset has a coverage that is not closed.

    Only table-level coverages are inspected; column coverages are not
    consulted by this method.
    """
    # exists() avoids fetching every open coverage row, and any() stops at
    # the first table that has one.
    return any(
        table.coverages.filter(is_closed=False).exists()
        for table in self.tables.all()
    )
def contains_tables(self) -> bool:
    """Return True if the dataset has at least one table."""
    # exists() answers the emptiness question directly instead of loading
    # every table row just to count them with len().
    return self.tables.all().exists()

@property
def contains_closed_tables(self):
Expand Down Expand Up @@ -1042,8 +1050,7 @@ def neighbors(self) -> list[dict]:
)
all_neighbors = []
for table in all_tables:
score_area = self.get_similarity_of_area(table)
score_datetime = self.get_similarity_of_datetime(table)
score_popularity = table.dataset.popularity
score_directory, columns = self.get_similarity_of_directory(table)
if not score_directory:
continue
Expand All @@ -1060,7 +1067,7 @@ def neighbors(self) -> list[dict]:
"table_name": table.name,
"dataset_id": str(table.dataset.id),
"dataset_name": table.dataset.name,
"score": round(score_area + score_datetime + score_directory, 2),
"score": round(score_directory, 2) + score_popularity,
}
)
return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
Expand Down Expand Up @@ -1089,13 +1096,21 @@ def get_similarity_of_datetime(self, other: "Table"):
return count_yes / count_all if count_all else 0

def get_similarity_of_directory(self, other: "Table"):
    """Score how strongly ``other`` relates to this table via directory columns.

    A "directory column" here is a column whose ``directory_primary_key`` is
    set, after applying the ``diretorios_data_tempo`` exclusion below.

    Args:
        other: The table to compare against.

    Returns:
        A ``(score, columns)`` tuple. ``columns`` is the set of directory
        columns collected from both tables. ``score`` is ``len(columns)``
        divided by the number of this table's directory columns, or by 1
        when this table has none.
    """

    def directory_columns(table):
        # NOTE(review): the exclude follows the column's *own* table to its
        # dataset slug; presumably the intent was to skip columns that point
        # at the date/time directory. Confirm the lookup path.
        return (
            table.columns
            .filter(directory_primary_key__isnull=False)
            .exclude(table__dataset__slug="diretorios_data_tempo")
        )  # fmt: skip

    # Materialize each queryset once so iteration and len() share one fetch.
    self_dirs = list(directory_columns(self))
    other_dirs = list(directory_columns(other))

    # NOTE(review): this combines the columns of both tables (a union), so the
    # ratio below can exceed 1. If columns sharing the same directory key were
    # meant to be matched, this needs a real intersection. Confirm intent.
    columns = {*self_dirs, *other_dirs}
    denominator = len(self_dirs) or 1  # avoid dividing by zero
    return len(columns) / denominator, columns

def clean(self):
Expand Down

0 comments on commit 15894d1

Please sign in to comment.