Skip to content

Commit

Permalink
feat: rank and filter table/column neighbors (#565)
Browse files Browse the repository at this point in the history
  • Loading branch information
vncsna authored Feb 16, 2024
1 parent ef12b03 commit 10f79b1
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 73 deletions.
205 changes: 133 additions & 72 deletions bd_api/apps/api/v1/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from django.core.exceptions import ValidationError
from django.db import models
from django.urls import reverse
from loguru import logger
from ordered_model.models import OrderedModel

from bd_api.apps.account.models import Account
Expand Down Expand Up @@ -65,11 +66,6 @@ def get_date_time(date_times):
)


def get_unique_list(lst: list[dict]):
    """Return the distinct dicts of ``lst``, keeping first-seen order.

    Two dicts are duplicates when they hold the same key/value pairs,
    regardless of the order in which the keys were inserted. Every value
    must be hashable, because the pairs are used as a set member.

    Args:
        lst: Dicts to deduplicate.

    Returns:
        A new list with a shallow copy of the first occurrence of each
        distinct dict.
    """
    seen = set()
    unique = []
    for item in lst:
        # frozenset makes the fingerprint independent of key order
        fingerprint = frozenset(item.items())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(dict(item))
    return unique


class UUIDHiddenIdForm(forms.ModelForm):
"""Form to include UUID in inline formes (Table, Column and Coverage)"""

Expand Down Expand Up @@ -217,6 +213,24 @@ def coverage_type(self):

coverage_type.short_description = "Coverage Type"

def similarity_of_area(self, other: "Coverage"):
    """Score whether this coverage and ``other`` refer to related areas.

    Returns 1 when both coverages have an area and one area's ``name``
    starts with the other's, and 0 otherwise (including when either
    coverage has no area).
    """
    if not self.area or not other.area:
        return 0
    own_name = self.area.name
    other_name = other.area.name
    related = own_name.startswith(other_name) or other_name.startswith(own_name)
    return int(related)

def similarity_of_datetime(self, other: "Coverage"):
    """Score whether any datetime range here matches one of ``other``'s.

    Every range in ``self.datetime_ranges`` is compared with every range in
    ``other.datetime_ranges`` through the range's own
    ``similarity_of_datetime``. Returns 1 as soon as one pair reports a
    truthy similarity, and 0 when no pair does or either side has no ranges.
    """
    has_match = any(
        dt_self.similarity_of_datetime(dt_other)
        for dt_self in self.datetime_ranges.all()
        for dt_other in other.datetime_ranges.all()
    )
    return 1 if has_match else 0

def clean(self) -> None:
"""
Assert that only one of "table", "raw_data_source",
Expand Down Expand Up @@ -893,10 +907,8 @@ def clean(self) -> None:
"One and only one of 'table', "
"'raw_data_source', or 'information_request' must be set."
)

if self.entity.category.slug != "datetime":
raise ValidationError("Entity's category is not in category.slug = `datetime`.")

return super().clean()


Expand Down Expand Up @@ -1122,28 +1134,56 @@ def get_graphql_full_coverage(self):
def neighbors(self):
    """Similar tables and columns

    - Tables and columns with similar directories
    - Tables and columns with similar coverages or tags

    Every other table that is not a directory, is not under review and has
    at least one column linked to a directory primary key is scored against
    this table. A candidate is kept only when its directory similarity is
    non-zero; its final score is the sum of the area, datetime and
    directory similarities.

    Returns:
        Up to 20 ``(columns, table, dataset, score)`` tuples, highest score
        first.
    """
    all_tables = (
        Table.objects.exclude(id=self.id)
        .exclude(is_directory=True)
        .exclude(status__slug__in=["under_review"])
        .filter(columns__directory_primary_key__isnull=False)
        .distinct()
        .all()
    )
    all_neighbors = []
    for table in all_tables:
        score_area = self.similarity_of_area(table)
        score_datetime = self.similarity_of_datetime(table)
        score_directory, cols = self.similarity_of_directory(table)
        if score_directory:
            all_neighbors.append(
                (
                    cols,
                    table,
                    table.dataset,
                    score_area + score_datetime + score_directory,
                )
            )
        logger.debug(f"[similarity_area] {self} {table} {score_area}")
        logger.debug(f"[similarity_datetime] {self} {table} {score_datetime}")
        logger.debug(f"[similarity_directory] {self} {table} {score_directory}")

    # Sort ascending by score, flip to descending, keep the 20 best.
    return sorted(all_neighbors, key=lambda item: item[-1])[::-1][:20]

def get_graphql_neighbors(self) -> list[dict]:
    """Serialize ``self.neighbors`` into plain dicts for the GraphQL layer.

    Each neighbor is a ``(columns, table, dataset, score)`` tuple. The
    columns are flattened into two parallel lists, ``column_id`` and
    ``column_name``; ids are rendered as strings.

    Returns:
        One dict per neighbor, in the order given by ``self.neighbors``.
    """
    all_neighbors = []
    for columns, table, dataset, score in self.neighbors:
        # Materialize once so ids and names are read from the same sequence.
        columns = list(columns)
        all_neighbors.append(
            {
                "column_id": [str(column.id) for column in columns],
                "column_name": [column.name for column in columns],
                "table_id": str(table.id),
                "table_name": table.name,
                "dataset_id": str(dataset.id),
                "dataset_name": dataset.name,
                "score": score,
            }
        )
    return all_neighbors

@property
def last_updated_at(self):
Expand All @@ -1153,6 +1193,34 @@ def last_updated_at(self):
def get_graphql_last_updated_at(self):
    """Return ``self.last_updated_at`` unchanged."""
    last_updated_at = self.last_updated_at
    return last_updated_at

def similarity_of_area(self, other: "Table"):
    """Average area similarity over all coverage pairs of the two tables.

    Each coverage of this table is compared with each coverage of ``other``
    through the coverage's own ``similarity_of_area``. The result is the sum
    of those scores divided by the number of pairs, or 0 when there are no
    pairs.
    """
    pair_scores = [
        cov_self.similarity_of_area(cov_other)
        for cov_self in self.coverages.all()
        for cov_other in other.coverages.all()
    ]
    if not pair_scores:
        return 0
    return sum(pair_scores) / len(pair_scores)

def similarity_of_datetime(self, other: "Table"):
    """Average datetime similarity over all coverage pairs of the two tables.

    Each coverage of this table is compared with each coverage of ``other``
    through the coverage's own ``similarity_of_datetime``. The result is the
    sum of those scores divided by the number of pairs, or 0 when there are
    no pairs.
    """
    pair_scores = [
        cov_self.similarity_of_datetime(cov_other)
        for cov_self in self.coverages.all()
        for cov_other in other.coverages.all()
    ]
    if not pair_scores:
        return 0
    return sum(pair_scores) / len(pair_scores)

def similarity_of_directory(self, other: "Table"):
    """Score how strongly the two tables are tied to directories.

    The score is the number of distinct directory-linked columns (columns
    whose ``directory_primary_key`` is set) found across both tables,
    divided by the column count of the smaller table.

    Returns:
        A ``(score, columns)`` tuple, where ``columns`` is the set of
        directory-linked columns of both tables. When either table has no
        columns the ratio is undefined, so ``(0, set())`` is returned
        instead of dividing by zero.
    """
    self_cols = self.columns.all()
    other_cols = other.columns.all()
    intersection_max_size = min(len(self_cols), len(other_cols))
    if not intersection_max_size:
        return 0, set()

    self_dirs = self.columns.filter(directory_primary_key__isnull=False).all()
    other_dirs = other.columns.filter(directory_primary_key__isnull=False).all()
    # NOTE(review): despite the name this is the union of both tables'
    # directory-linked columns; it never checks that the columns point to the
    # same directory, so the score can exceed 1. Confirm the intended
    # semantics before changing it, since callers serialize this set.
    intersection = {*self_dirs, *other_dirs}
    return len(intersection) / intersection_max_size, intersection

def clean(self):
"""
Clean method for Table model
Expand Down Expand Up @@ -1275,24 +1343,6 @@ class Meta:
verbose_name_plural = "Columns"
ordering = ["name"]

def clean(self) -> None:
    """Validate the column's references before saving.

    Collects one error per offending field and raises them together:

    - ``observation_level``: set, but attached to a different table than
      this column.
    - ``directory_primary_key``: set, but the table of the referenced column
      has ``is_directory`` equal to ``False``.

    Raises:
        ValidationError: with a field -> message mapping when any check fails.
    """
    errors = {}

    observation_level = self.observation_level
    if observation_level and observation_level.table != self.table:
        errors["observation_level"] = (
            "Observation level is not in the same table as the column."
        )

    directory_key = self.directory_primary_key
    if directory_key and directory_key.table.is_directory is False:
        errors["directory_primary_key"] = (
            "Column indicated as a directory's primary key is not in a directory."
        )

    if errors:
        raise ValidationError(errors)

    return super().clean()

@property
def full_coverage(self) -> str:
"""
Expand Down Expand Up @@ -1345,44 +1395,20 @@ def full_coverage(self) -> str:
def get_graphql_full_coverage(self):
    """Return ``self.full_coverage`` unchanged."""
    full_coverage = self.full_coverage
    return full_coverage

@property
def neighbors(self):
    """Columns that point to the same directory primary key as this one.

    Returns an empty list when this column has no ``directory_primary_key``.
    Otherwise returns one ``(column, table, dataset)`` tuple for every other
    column sharing that key, where ``table`` is the column's table and
    ``dataset`` is that table's dataset.
    """
    if not self.directory_primary_key:
        return []
    siblings = (
        Column.objects.filter(directory_primary_key=self.directory_primary_key)
        .exclude(id=self.id)
        .all()
    )
    return [(sibling, sibling.table, sibling.table.dataset) for sibling in siblings]

def get_graphql_neighbors(self) -> list[dict]:
    """Serialize ``self.neighbors`` into plain dicts for the GraphQL layer.

    Each ``(column, table, dataset)`` tuple becomes a dict of string ids and
    names; the resulting list is passed through ``get_unique_list`` to drop
    duplicates.
    """
    serialized = [
        {
            "column_id": str(column.id),
            "column_name": column.name,
            "table_id": str(table.id),
            "table_name": table.name,
            "dataset_id": str(dataset.id),
            "dataset_name": dataset.name,
        }
        for column, table, dataset in self.neighbors
    ]
    return get_unique_list(serialized)
def clean(self) -> None:
    """Validate the column's references before saving.

    Two checks are run and their failures reported together, keyed by field:

    - ``observation_level`` must belong to the same table as the column.
    - ``directory_primary_key`` must not reference a column whose table has
      ``is_directory`` equal to ``False``.

    Raises:
        ValidationError: with a field -> message mapping when any check fails.
    """
    errors = {}

    level = self.observation_level
    if level and level.table != self.table:
        errors["observation_level"] = (
            "Observation level is not in the same table as the column."
        )

    key_column = self.directory_primary_key
    if key_column and key_column.table.is_directory is False:
        errors["directory_primary_key"] = (
            "Column indicated as a directory's primary key is not in a directory."
        )

    if errors:
        raise ValidationError(errors)
    return super().clean()


class ColumnOriginalName(BaseModel):
Expand Down Expand Up @@ -1806,6 +1832,41 @@ class Meta:
verbose_name_plural = "DateTime Ranges"
ordering = ["id"]

@property
def since(self):
    """Start of the range as a ``datetime``, or ``None`` without a start year.

    Missing month and day default to 1; missing hour, minute and second
    default to 0.
    """
    if not self.start_year:
        return None
    return datetime(
        year=self.start_year,
        month=self.start_month or 1,
        day=self.start_day or 1,
        hour=self.start_hour or 0,
        minute=self.start_minute or 0,
        second=self.start_second or 0,
    )

@property
def until(self):
    """End of the range as a ``datetime``, or ``None`` without an end year.

    Missing month and day default to 1; missing hour, minute and second
    default to 0.

    NOTE(review): a missing end component therefore resolves to the start of
    its period (e.g. only ``end_year`` gives January 1st) - confirm that is
    the intended upper bound.
    """
    if not self.end_year:
        return None
    return datetime(
        year=self.end_year,
        month=self.end_month or 1,
        day=self.end_day or 1,
        hour=self.end_hour or 0,
        minute=self.end_minute or 0,
        second=self.end_second or 0,
    )

def similarity_of_datetime(self, other: "DateTimeRange"):
    """Return 1 when this range overlaps ``other`` in time, else 0.

    Two ranges overlap when each one starts no later than the other ends.
    Returns 0 when ``self.since`` or ``other.until`` is unknown. A missing
    ``self.until`` or ``other.since`` is treated as unbounded on that side,
    so it never rules an overlap out (and never raises on a ``None``
    comparison).
    """
    self_since = self.since
    if not self_since:
        return 0
    other_until = other.until
    if not other_until:
        return 0
    # This range starts after the other one has already ended.
    if self_since > other_until:
        return 0
    self_until = self.until
    other_since = other.since
    # This range ends before the other one starts.
    if self_until and other_since and self_until < other_since:
        return 0
    return 1

def clean(self) -> None:
"""
Assert that start_year <= end_year and start_month <= end_month
Expand Down
2 changes: 1 addition & 1 deletion bd_api/custom/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from loguru import logger

# Logger level, taken from the LOGGER_LEVEL environment variable ("DEBUG" when unset).
LOGGER_LEVEL = getenv("LOGGER_LEVEL", "DEBUG")
# Comma-separated LOGGER_IGNORE value split into a list ([""] when unset).
LOGGER_IGNORE = getenv("LOGGER_IGNORE", "").split(",")
# Any value enables serialization except the explicit "off" spellings below;
# bool() on the raw string would treat "false" or "0" as enabled.
LOGGER_SERIALIZE = getenv("LOGGER_SERIALIZE", "").strip().lower() not in {
    "",
    "0",
    "false",
    "no",
    "off",
}
LOGGER_FORMAT = "[{time:YYYY-MM-DD HH:mm:ss}] <lvl>{message}</>"
Expand Down

0 comments on commit 10f79b1

Please sign in to comment.