Skip to content

Commit

Permalink
fix: improve neighbors score heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
vncsna committed Feb 27, 2024
1 parent 6809f2c commit 5f84006
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 26 deletions.
1 change: 1 addition & 0 deletions bd_api/apps/api/v1/forms/admin_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class Meta(UUIDHiddenIdForm.Meta):
"is_primary_key",
"table",
"observation_level",
"directory_primary_key",
]
readonly_fields = [
"order",
Expand Down
79 changes: 53 additions & 26 deletions bd_api/apps/api/v1/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
from collections import defaultdict
from datetime import datetime
from math import log10
from uuid import uuid4

from django.core.exceptions import ValidationError
Expand Down Expand Up @@ -551,6 +552,14 @@ def full_slug(self):
return f"{self.organization.area.slug}_{self.organization.slug}_{self.slug}"
return f"{self.organization.slug}_{self.slug}"

@property
def popularity(self):
    """Return the base-10 logarithm of ``page_views`` as a popularity score.

    Yields 0.0 when ``page_views`` is falsy (unset or zero) or below one,
    so the score is never negative.
    """
    views = self.page_views
    if not views or views < 1:
        return 0.0
    return log10(views)

@property
def coverage(self):
"""Get the temporal coverage of the dataset in the format YYYY-MM-DD - YYYY-MM-DD"""
Expand Down Expand Up @@ -692,9 +701,16 @@ def full_coverage(self) -> str:
return json.dumps(full_coverage_dict)

@property
def contains_tables(self):
    """Return True if the dataset has at least one table."""
    # exists() only asks whether a row is present (or reuses an already
    # populated result cache) instead of loading every table to count them.
    return self.tables.all().exists()
def contains_open_data(self):
    """Return True if any table of the dataset has a coverage that is not closed.

    NOTE(review): only table-level coverages are inspected here; column
    coverages are not looked at. Confirm whether they should count too.
    """
    # exists() avoids fetching every matching coverage just to test for
    # presence, and any() stops at the first table that has one.
    return any(
        table.coverages.filter(is_closed=False).exists()
        for table in self.tables.all()
    )

@property
def contains_closed_data(self):
Expand All @@ -714,17 +730,9 @@ def contains_closed_data(self):
return closed_data

@property
def contains_open_data(self):
    """Return True if any table of the dataset has a coverage that is not closed.

    NOTE(review): only table-level coverages are inspected here; column
    coverages are not looked at. Confirm whether they should count too.
    """
    for table in self.tables.all():
        # exists() answers the presence question without fetching every
        # matching coverage row.
        if table.coverages.filter(is_closed=False).exists():
            return True
    return False
def contains_tables(self):
    """Return True if the dataset has at least one table."""
    # exists() only asks whether a row is present (or reuses an already
    # populated result cache) instead of loading every table to count them.
    return self.tables.all().exists()

@property
def contains_raw_data_sources(self):
Expand Down Expand Up @@ -1020,20 +1028,31 @@ def neighbors(self) -> list[dict]:
- Tables and columns with similar directories
- Tables and columns with similar coverages or tags
"""
self_columns = (
self.columns
.filter(directory_primary_key__isnull=False)
.exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
.all()
) # fmt: skip
self_directories = set(c.directory_primary_key for c in self_columns)
if not self_directories:
return []
all_tables = (
Table.objects.exclude(id=self.id)
Table.objects
.exclude(id=self.id)
.exclude(is_directory=True)
.exclude(status__slug__in=["under_review"])
.filter(columns__directory_primary_key__isnull=False)
.distinct()
.all()
)
) # fmt: skip
all_neighbors = []
for table in all_tables:
score_area = self.get_similarity_of_area(table)
score_datetime = self.get_similarity_of_datetime(table)
score_directory, columns = self.get_similarity_of_directory(table)
if not score_directory:
score_popularity = table.dataset.popularity
if not score_area or not score_datetime or not score_directory:
continue
column_id = []
column_name = []
Expand All @@ -1048,7 +1067,7 @@ def neighbors(self) -> list[dict]:
"table_name": table.name,
"dataset_id": str(table.dataset.id),
"dataset_name": table.dataset.name,
"score": round(score_area + score_datetime + score_directory, 2),
"score": round(score_directory, 2) + score_popularity,
}
)
return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
Expand Down Expand Up @@ -1077,14 +1096,22 @@ def get_similarity_of_datetime(self, other: "Table"):
return count_yes / count_all if count_all else 0

def get_similarity_of_directory(self, other: "Table"):
    """Score how many of this table's directories are also used by ``other``.

    A table's directories are the ``directory_primary_key`` values of its
    columns, ignoring columns without one and those whose directory belongs
    to the ``diretorios_data_tempo`` dataset.

    Returns a ``(score, shared)`` tuple: ``shared`` is the set of directories
    present in both tables and ``score`` is ``len(shared)`` divided by the
    number of this table's directories. When this table has no directories
    the result is ``(0.0, set())`` rather than a division by zero.
    """

    def directories_of(table):
        # Collect the distinct directory keys referenced by the table's columns.
        columns = (
            table.columns
            .filter(directory_primary_key__isnull=False)
            .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
        )  # fmt: skip
        return {column.directory_primary_key for column in columns}

    self_directories = directories_of(self)
    if not self_directories:
        # Nothing to compare against; also avoids dividing by zero below.
        return 0.0, set()
    shared = self_directories & directories_of(other)
    return len(shared) / len(self_directories), shared

def clean(self):
"""
Expand Down

0 comments on commit 5f84006

Please sign in to comment.