feat: turn into similarity ranking
vncsna committed Feb 16, 2024
1 parent 5bde105 commit 3da9cb6
Showing 1 changed file with 74 additions and 96 deletions.
bd_api/apps/api/v1/models.py (170 changes: 74 additions & 96 deletions)
@@ -66,11 +66,6 @@ def get_date_time(date_times):
     )


-def get_unique_list(lst: list[dict]):
-    """Get unique list of dicts"""
-    return [dict(t) for t in {tuple(d.items()) for d in lst}]
-
-
 class UUIDHiddenIdForm(forms.ModelForm):
     """Form to include UUID in inline formes (Table, Column and Coverage)"""

@@ -218,21 +213,21 @@ def coverage_type(self):

     coverage_type.short_description = "Coverage Type"

-    def has_area_intersection(self, other: "Coverage"):
+    def similarity_of_area(self, other: "Coverage"):
         if not self.area:
-            return False
+            return 0
         if not other.area:
-            return False
+            return 0
         if self.area.name.startswith(other.area.name):
-            return True
+            return 1
         if other.area.name.startswith(self.area.name):
-            return True
-        return False
+            return 1
+        return 0

-    def has_datetime_intersection(self, other: "Coverage"):
+    def similarity_of_datetime(self, other: "Coverage"):
         for dt_self in self.datetime_ranges.all():
             for dt_other in other.datetime_ranges.all():
-                if dt_self.has_datetime_intersection(dt_other):
+                if dt_self.similarity_of_datetime(dt_other):
                     return True
         return False
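For context, a minimal standalone sketch of the new 0/1 area scoring. The `SimpleNamespace` stand-ins and the area names below are illustrative only, not objects or slugs from the codebase:

from types import SimpleNamespace

def similarity_of_area(cov_a, cov_b):
    # Mirrors Coverage.similarity_of_area: score 1 when either area name
    # is a prefix of the other, 0 when an area is missing or unrelated.
    if not cov_a.area or not cov_b.area:
        return 0
    if cov_a.area.name.startswith(cov_b.area.name):
        return 1
    if cov_b.area.name.startswith(cov_a.area.name):
        return 1
    return 0

br = SimpleNamespace(area=SimpleNamespace(name="br"))
br_sp = SimpleNamespace(area=SimpleNamespace(name="br_sp"))
world = SimpleNamespace(area=SimpleNamespace(name="world"))

assert similarity_of_area(br, br_sp) == 1   # "br_sp" starts with "br"
assert similarity_of_area(br_sp, world) == 0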

@@ -1141,28 +1136,54 @@ def neighbors(self):
         - Tables and columns with similar directories
         - Tables and columns with similar coverages or tags
         """
+        all_tables = (
+            Table.objects.exclude(id=self.id)
+            .exclude(is_directory=True)
+            .exclude(status__slug__in=["under_review"])
+            .filter(columns__directory_primary_key__isnull=False)
+            .distinct()
+            .all()
+        )
         all_neighbors = []
-        for column in self.columns.all():
-            for neighbor in column.neighbors:
-                all_neighbors.append(neighbor)
-                if len(all_neighbors) >= 20:
-                    break
-        return sorted(all_neighbors, key=lambda item: item[3])[::-1]
+        for table in all_tables:
+            score_area = self.similarity_of_area(table)
+            score_datetime = self.similarity_of_datetime(table)
+            score_directory, cols = self.similarity_of_directory(table)
+            if score_directory:
+                all_neighbors.append(
+                    (
+                        cols,
+                        table,
+                        table.dataset,
+                        score_area + score_datetime + score_directory,
+                    )
+                )
+            logger.debug(f"[similarity_area] {self} {table} {score_area}")
+            logger.debug(f"[similarity_datetime] {self} {table} {score_datetime}")
+            logger.debug(f"[similarity_directory] {self} {table} {score_directory}")
+
+        return sorted(all_neighbors, key=lambda item: item[-1])[::-1][:20]

     def get_graphql_neighbors(self) -> list[dict]:
         all_neighbors = []
-        for column, table, dataset, _ in self.neighbors:
+        for columns, table, dataset, score in self.neighbors:
+            column_id = []
+            column_name = []
+            for column in columns:
+                column_id.append(str(column.id))
+                column_name.append(column.name)
             all_neighbors.append(
                 {
-                    "column_id": str(column.id),
-                    "column_name": column.name,
+                    "column_id": column_id,
+                    "column_name": column_name,
                     "table_id": str(table.id),
                     "table_name": table.name,
                     "dataset_id": str(dataset.id),
                     "dataset_name": dataset.name,
+                    "score": score,
                 }
             )
-        return get_unique_list(all_neighbors)
+        return all_neighbors

     @property
     def last_updated_at(self):
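A compact illustration of the ranking step at the end of `neighbors`, with hypothetical pre-computed tuples standing in for the Django objects (the names and scores below are made up):

# Each entry mirrors the tuple appended above: (columns, table, dataset, summed score).
entries = [
    (["state_id"], "table_a", "dataset_a", 1 + 1 + 0.50),
    (["city_id"], "table_b", "dataset_b", 0 + 1 + 0.10),
    (["year_id"], "table_c", "dataset_c", 0 + 0 + 0.40),
]

# Same idiom as the property: sort ascending by the last element (the score),
# reverse to get the highest first, then keep at most 20 neighbors.
top = sorted(entries, key=lambda item: item[-1])[::-1][:20]
assert [t[1] for t in top] == ["table_a", "table_b", "table_c"]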
@@ -1172,29 +1193,35 @@ def last_updated_at(self):
     def get_graphql_last_updated_at(self):
         return self.last_updated_at

-    def has_area_intersection(self, other: "Table"):
+    def similarity_of_area(self, other: "Table"):
+        count_all = 0
+        count_yes = 0
         for cov_self in self.coverages.all():
             for cov_other in other.coverages.all():
-                if cov_self.has_area_intersection(cov_other):
-                    logger.debug(f"[table_neighbor_by_area] {self.name} => {other.name}")
-                    return True
-        return False
-
-    def has_datetime_intersection(self, other: "Table"):
+                count_all += 1
+                if cov_self.similarity_of_area(cov_other):
+                    count_yes += 1
+        return count_yes / count_all if count_all else 0
+
+    def similarity_of_datetime(self, other: "Table"):
+        count_all = 0
+        count_yes = 0
         for cov_self in self.coverages.all():
             for cov_other in other.coverages.all():
-                if cov_self.has_datetime_intersection(cov_other):
-                    logger.debug(f"[table_neighbor_by_date] {self.name} => {other.name}")
-                    return True
-        return False
-
-    def has_directory_intersection(self, other: "Table"):
-        for col_self in self.columns.all():
-            for col_other in other.columns.all():
-                if col_self.has_directory_intersection(col_other):
-                    logger.debug(f"[table_neighbor_by_dire] {self.name} => {other.name}")
-                    return True
-        return False
+                count_all += 1
+                if cov_self.similarity_of_datetime(cov_other):
+                    count_yes = +1
+        return count_yes / count_all if count_all else 0
+
+    def similarity_of_directory(self, other: "Table"):
+        self_cols = self.columns.all()
+        self_dirs = self.columns.filter(directory_primary_key__isnull=False).all()
+        other_cols = other.columns.all()
+        other_dirs = other.columns.filter(directory_primary_key__isnull=False).all()
+        intersection = set([*self_dirs, *other_dirs])
+        intersection_size = len(intersection)
+        intersection_max_size = min(len(self_cols), len(other_cols))
+        return intersection_size / intersection_max_size, intersection

     def clean(self):
         """
@@ -1370,55 +1397,6 @@ def full_coverage(self) -> str:
     def get_graphql_full_coverage(self):
         return self.full_coverage

-    @property
-    def neighbors(self):
-        """Similiar tables and columns
-        - Columns with similar directories
-        - Columns with similar coverages or tags
-        """
-        if not self.directory_primary_key:
-            return []
-        all_columns = (
-            Column.objects.filter(directory_primary_key=self.directory_primary_key)
-            .exclude(id=self.id)
-            .all()
-        )
-        all_neighbors = []
-        for column in all_columns:
-            if self.table.has_area_intersection(column.table):
-                if self.table.has_datetime_intersection(column.table):
-                    all_neighbors.append(
-                        (
-                            column,
-                            column.table,
-                            column.table.dataset,
-                            column.table.dataset.page_views,
-                        )
-                    )
-            if len(all_neighbors) >= 20:
-                break
-        return sorted(all_neighbors, key=lambda item: item[3])[::-1]
-
-    def get_graphql_neighbors(self) -> list[dict]:
-        all_neighbors = []
-        for column, table, dataset, _ in self.neighbors:
-            all_neighbors.append(
-                {
-                    "column_id": str(column.id),
-                    "column_name": column.name,
-                    "table_id": str(table.id),
-                    "table_name": table.name,
-                    "dataset_id": str(dataset.id),
-                    "dataset_name": dataset.name,
-                }
-            )
-        return get_unique_list(all_neighbors)
-
-    def has_directory_intersection(self, other: "Column"):
-        if self.directory_primary_key == other.directory_primary_key:
-            return True
-        return False
-
     def clean(self) -> None:
         """Clean method for Column model"""
         errors = {}
@@ -1880,14 +1858,14 @@ def until(self):
             self.end_second or 0,
         )

-    def has_datetime_intersection(self, other: "DateTimeRange"):
+    def similarity_of_datetime(self, other: "DateTimeRange"):
         if not self.since:
-            return False
+            return 0
         if not other.until:
-            return False
+            return 0
         if self.until >= other.since:
-            return True
-        return False
+            return 1
+        return 0

     def clean(self) -> None:
         """
