feat: add table neighbor model (#571)
vncsna authored Mar 21, 2024
1 parent 7b18aad commit 44f20d6
Showing 6 changed files with 245 additions and 48 deletions.
32 changes: 32 additions & 0 deletions bd_api/apps/api/v1/admin.py
@@ -57,6 +57,7 @@
RawDataSource,
Status,
Table,
TableNeighbor,
Tag,
Theme,
Update,
@@ -66,6 +67,7 @@
update_page_views_task,
update_search_index_task,
update_table_metadata_task,
update_table_neighbors_task,
)
from bd_api.custom.client import get_gbq_client

@@ -262,6 +264,14 @@ def update_table_metadata(modeladmin: ModelAdmin, request: HttpRequest, queryset
update_table_metadata.short_description = "Atualizar metadados das tabelas"


def update_table_neighbors(modeladmin: ModelAdmin, request: HttpRequest, queryset: QuerySet):
"""Update all table neighbors"""
update_table_neighbors_task()


update_table_neighbors.short_description = "Atualizar os vizinhos das tabelas"


def reorder_tables(modeladmin, request, queryset):
"""Reorder tables in respect to dataset"""

@@ -513,6 +523,7 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
reorder_columns,
reset_column_order,
update_table_metadata,
update_table_neighbors,
update_page_views,
]
inlines = [
@@ -635,6 +646,26 @@ def add_view(self, request, *args, **kwargs):
return super().add_view(request, *args, **kwargs)


class TableNeighborAdmin(admin.ModelAdmin):
search_fields = [
"table_a__name",
"table_b__name",
]
list_filter = [
"table_a",
"table_b",
]
list_display = [
"table_a",
"table_b",
"similarity",
"similarity_of_area",
"similarity_of_datetime",
"similarity_of_directory",
]
ordering = ["table_a", "table_b"]


class ColumnForm(forms.ModelForm):
class Meta:
model = Column
@@ -1043,6 +1074,7 @@ class QualityCheckAdmin(TabbedTranslationAdmin):
admin.site.register(RawDataSource, RawDataSourceAdmin)
admin.site.register(Status, StatusAdmin)
admin.site.register(Table, TableAdmin)
admin.site.register(TableNeighbor, TableNeighborAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(Theme, ThemeAdmin)
admin.site.register(Update, UpdateAdmin)
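The new bulk action delegates straight to update_table_neighbors_task, so it can also be triggered outside the admin UI. A minimal sketch, assuming a Django shell session and that the queryset argument is intentionally unused (the task always rebuilds neighbors for every table):

# Sketch only, not part of the commit: trigger the rebuild from a shell.
from bd_api.apps.api.v1.admin import update_table_neighbors
from bd_api.apps.api.v1.models import Table

# The action ignores modeladmin, request and queryset and simply calls the task.
update_table_neighbors(modeladmin=None, request=None, queryset=Table.objects.none())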
50 changes: 50 additions & 0 deletions bd_api/apps/api/v1/graphql.py
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-

from graphene import UUID, Boolean, Float, List, ObjectType, String
from graphene_django import DjangoObjectType

from bd_api.apps.api.v1.models import TableNeighbor
from bd_api.custom.graphql_base import PlainTextNode


class TableNeighborNode(DjangoObjectType):
"""Similiar tables and columns with filters"""

table_id = String()
table_name = String()
dataset_id = String()
dataset_name = String()
score = Float()

class Meta:
model = TableNeighbor
fields = ("id",)
filter_fields = ("id",)
interfaces = (PlainTextNode,)

def resolve_table_id(root, info):
return root.table_b.pk

def resolve_table_name(root, info):
return root.table_b.name

def resolve_dataset_id(root, info):
return root.table_b.dataset.pk

def resolve_dataset_name(root, info):
return root.table_b.dataset.name

def resolve_score(root, info):
return root.score


class APIQuery(ObjectType):
get_table_neighbor = List(
TableNeighborNode,
table_id=UUID(required=True),
theme=String(),
share_theme=Boolean(),
)

def resolve_get_table_neighbor(root, info, table_id, **kwargs):
return TableNeighbor.objects.filter(table_a__pk=table_id).all()
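
For context, a hedged sketch of how this query might be issued once the schema change below is in place. It assumes build_schema returns a standard graphene Schema, that graphene's default camelCase conversion applies to field names, and the UUID is a placeholder:

# Illustrative only: executes the new query against the project schema.
from bd_api.apps.schema import schema

result = schema.execute(
    """
    query {
      getTableNeighbor(tableId: "00000000-0000-0000-0000-000000000000") {
        tableId
        tableName
        datasetId
        datasetName
        score
      }
    }
    """
)
print(result.errors or result.data)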
58 changes: 58 additions & 0 deletions bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# Generated by Django 4.2.10 on 2024-03-20 11:53

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("v1", "0027_dataset_page_views_table_page_views"),
]

operations = [
migrations.CreateModel(
name="TableNeighbor",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("similarity", models.FloatField(default=0)),
("similarity_of_area", models.FloatField(default=0)),
("similarity_of_datetime", models.FloatField(default=0)),
("similarity_of_directory", models.FloatField(default=0)),
("similarity_of_popularity", models.FloatField(default=0)),
(
"table_a",
models.ForeignKey(
on_delete=django.db.models.deletion.DO_NOTHING,
related_name="tableneighbor_a_set",
to="v1.table",
),
),
(
"table_b",
models.ForeignKey(
on_delete=django.db.models.deletion.DO_NOTHING,
related_name="tableneighbor_b_set",
to="v1.table",
),
),
],
options={
"db_table": "table_neighbor",
},
),
migrations.AddConstraint(
model_name="tableneighbor",
constraint=models.UniqueConstraint(
fields=("table_a", "table_b"), name="table_neighbor_unique_constraint"
),
),
]
141 changes: 94 additions & 47 deletions bd_api/apps/api/v1/models.py
@@ -1024,53 +1024,10 @@ def full_coverage(self) -> str:

@property
def neighbors(self) -> list[dict]:
"""Similiar tables and columns
- Tables and columns with similar directories
- Tables and columns with similar coverages or tags
"""
self_columns = (
self.columns
.filter(directory_primary_key__isnull=False)
.exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
.all()
) # fmt: skip
self_directories = set(c.directory_primary_key for c in self_columns)
if not self_directories:
return []
all_tables = (
Table.objects
.exclude(id=self.id)
.exclude(is_directory=True)
.exclude(status__slug__in=["under_review"])
.filter(columns__directory_primary_key__isnull=False)
.distinct()
.all()
) # fmt: skip
all_neighbors = []
for table in all_tables:
score_area = self.get_similarity_of_area(table)
score_datetime = self.get_similarity_of_datetime(table)
score_directory, columns = self.get_similarity_of_directory(table)
score_popularity = table.dataset.popularity
if not score_area or not score_datetime or not score_directory:
continue
column_id = []
column_name = []
for column in columns:
column_id.append(str(column.id))
column_name.append(column.name)
all_neighbors.append(
{
"column_id": column_id,
"column_name": column_name,
"table_id": str(table.id),
"table_name": table.name,
"dataset_id": str(table.dataset.id),
"dataset_name": table.dataset.name,
"score": round(score_directory, 2) + score_popularity,
}
)
return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20]
"""Similiar tables and columns without filters"""
all_neighbors = [t.as_dict for t in TableNeighbor.objects.filter(table_a=self)]
all_neighbors = sorted(all_neighbors, key=lambda item: item["score"], reverse=True)
return all_neighbors

@property
def last_updated_at(self):
@@ -1113,6 +1070,45 @@ def get_similarity_of_directory(self, other: "Table"):
intersection = self_directories.intersection(other_directories)
return len(intersection) / len(self_directories), intersection

def gen_neighbors(self) -> list[dict]:
self_columns = (
self.columns
.filter(directory_primary_key__isnull=False)
.exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
.all()
) # fmt: skip
self_directories = set(c.directory_primary_key for c in self_columns)
if not self_directories:
return []
all_tables = (
Table.objects
.exclude(id=self.id)
.exclude(is_directory=True)
.exclude(status__slug__in=["under_review"])
.filter(columns__directory_primary_key__isnull=False)
.distinct()
.all()
) # fmt: skip
all_neighbors = []
for table in all_tables:
similarity_of_area = self.get_similarity_of_area(table)
similarity_of_datetime = self.get_similarity_of_datetime(table)
similarity_of_directory, columns = self.get_similarity_of_directory(table)
similarity_of_popularity = table.dataset.popularity
if not similarity_of_area or not similarity_of_datetime or not similarity_of_directory:
continue
all_neighbors.append(
{
"table_a": self,
"table_b": table,
"similarity_of_area": similarity_of_area,
"similarity_of_datetime": similarity_of_datetime,
"similarity_of_directory": similarity_of_directory,
"similarity_of_popularity": similarity_of_popularity,
}
)
return all_neighbors

def clean(self):
"""
Clean method for Table model
@@ -1157,6 +1153,57 @@ def clean(self):
raise ValidationError(errors)


class TableNeighbor(BaseModel):
table_a = models.ForeignKey(
Table,
on_delete=models.DO_NOTHING,
related_name="tableneighbor_a_set",
)
table_b = models.ForeignKey(
Table,
on_delete=models.DO_NOTHING,
related_name="tableneighbor_b_set",
)

similarity = models.FloatField(default=0)
similarity_of_area = models.FloatField(default=0)
similarity_of_datetime = models.FloatField(default=0)
similarity_of_directory = models.FloatField(default=0)
similarity_of_popularity = models.FloatField(default=0)

class Meta:
db_table = "table_neighbor"
constraints = [
models.UniqueConstraint(
fields=["table_a", "table_b"],
name="table_neighbor_unique_constraint",
),
]

@property
def score(self):
return round(self.similarity_of_directory, 2) + round(self.similarity_of_popularity, 2)

@property
def as_dict(self):
return {
"table_id": str(self.table_b.pk),
"table_name": self.table_b.name,
"dataset_id": str(self.table_b.dataset.pk),
"dataset_name": self.table_b.dataset.name,
"score": self.score,
}

def clean(self) -> None:
errors = {}
if self.table_a.pk == self.table_b.pk:
errors["table_a"] = "Table neighbors A & B shouldn't be the same"
errors["table_b"] = "Table neighbors A & B shouldn't be the same"
if errors:
raise ValidationError(errors)
return super().clean()


class BigQueryType(BaseModel):
"""Model definition for BigQueryType."""

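A minimal sketch of the new model in use from a Django shell, assuming at least two persisted tables; the similarity values below are made-up placeholders:

# Sketch only: create one neighbor pair and read the derived score.
from bd_api.apps.api.v1.models import Table, TableNeighbor

table_a, table_b = Table.objects.all()[:2]   # any two distinct tables

pair, _ = TableNeighbor.objects.get_or_create(
    table_a=table_a,
    table_b=table_b,
    defaults={
        "similarity_of_directory": 0.75,
        "similarity_of_popularity": 0.10,
    },
)
pair.full_clean()           # raises ValidationError when table_a == table_b
print(pair.score)           # round(0.75, 2) + round(0.10, 2) == 0.85
print(table_a.neighbors)    # TableNeighbor.as_dict entries sorted by score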
10 changes: 9 additions & 1 deletion bd_api/apps/api/v1/tasks.py
@@ -10,7 +10,7 @@
from pandas import read_gbq
from requests import get

from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table
from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table, TableNeighbor
from bd_api.custom.client import Messenger, get_gbq_client, get_gcs_client
from bd_api.custom.environment import production_task

@@ -117,6 +117,14 @@ def get_uncompressed_file_size(table: Table, bq_table: GBQTable) -> int | None:
messenger.send()


@periodic_task(crontab(day_of_week="0", hour="6", minute="0"))
@production_task
def update_table_neighbors_task():
for table in Table.objects.all():
for neighbor in table.gen_neighbors():
TableNeighbor.objects.update_or_create(**neighbor)


@periodic_task(crontab(day_of_week="1-5", hour="7", minute="0"))
@production_task
def update_page_views_task(backfill: bool = False):
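One design note on the weekly task: gen_neighbors() returns every field of the pair, so passing the whole dict as lookup kwargs to update_or_create means a pair whose similarity changed between runs will not match the lookup and a second insert will be attempted against the (table_a, table_b) unique constraint. A defensive variant is sketched below as an assumption about intent, not as the shipped code; it keeps the pair as the lookup and moves the scores into defaults:

# Sketch only: idempotent refresh of stored neighbor pairs.
from bd_api.apps.api.v1.models import Table, TableNeighbor

def refresh_table_neighbors():
    for table in Table.objects.all():
        for neighbor in table.gen_neighbors():
            TableNeighbor.objects.update_or_create(
                table_a=neighbor.pop("table_a"),
                table_b=neighbor.pop("table_b"),
                defaults=neighbor,   # the similarity_of_* values
            )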
2 changes: 2 additions & 0 deletions bd_api/apps/schema.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from bd_api.apps.account.graphql import AccountMutation
from bd_api.apps.api.v1.graphql import APIQuery
from bd_api.apps.payment.graphql import (
StripeCustomerMutation,
StripePriceQuery,
@@ -11,6 +12,7 @@
schema = build_schema(
applications=["account", "v1"],
extra_queries=[
APIQuery,
StripePriceQuery,
],
extra_mutations=[
