Skip to content

Commit

Permalink
feat: add table neighbor model
Browse files Browse the repository at this point in the history
  • Loading branch information
vncsna committed Mar 15, 2024
1 parent d5d28f9 commit 3361657
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 38 deletions.
10 changes: 10 additions & 0 deletions bd_api/apps/api/v1/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
update_page_views_task,
update_search_index_task,
update_table_metadata_task,
update_table_neighbors_task,
)
from bd_api.custom.client import get_gbq_client

Expand Down Expand Up @@ -262,6 +263,14 @@ def update_table_metadata(modeladmin: ModelAdmin, request: HttpRequest, queryset
update_table_metadata.short_description = "Atualizar metadados das tabelas"


def update_table_neighbors(modeladmin: ModelAdmin, request: HttpRequest, queryset: QuerySet):
    """Admin action that triggers the table-neighbors update task.

    The task is called without arguments, so the selected ``queryset`` is
    ignored; ``modeladmin`` and ``request`` are likewise unused and are only
    present because of the admin action signature.
    """
    update_table_neighbors_task()


# Label attached to the action function (Portuguese, user-facing).
update_table_neighbors.short_description = "Atualizar os vizinhos das tabelas"


def reorder_tables(modeladmin, request, queryset):
"""Reorder tables in respect to dataset"""

Expand Down Expand Up @@ -513,6 +522,7 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
reorder_columns,
reset_column_order,
update_table_metadata,
update_table_neighbors,
update_page_views,
]
inlines = [
Expand Down
54 changes: 54 additions & 0 deletions bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
# Generated by Django 4.2.10 on 2024-03-15 18:55

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the ``TableNeighbor`` model and its uniqueness constraint."""

    # Must run after the migration that added the page-view counters.
    dependencies = [
        ("v1", "0027_dataset_page_views_table_page_views"),
    ]

    operations = [
        # One row per (table_a, table_b) pair, carrying the similarity scores.
        migrations.CreateModel(
            name="TableNeighbor",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # All similarity scores default to 0.
                ("similarity", models.FloatField(default=0)),
                ("similarity_of_area", models.FloatField(default=0)),
                ("similarity_of_datetime", models.FloatField(default=0)),
                ("similarity_of_directory", models.FloatField(default=0)),
                (
                    "table_a",
                    # DO_NOTHING: Django takes no action on these rows when
                    # the referenced table is deleted.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.DO_NOTHING,
                        related_name="tableneighbor_a_set",
                        to="v1.table",
                    ),
                ),
                (
                    "table_b",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.DO_NOTHING,
                        related_name="tableneighbor_b_set",
                        to="v1.table",
                    ),
                ),
            ],
        ),
        # At most one row per ordered (table_a, table_b) pair.
        migrations.AddConstraint(
            model_name="tableneighbor",
            constraint=models.UniqueConstraint(
                fields=("table_a", "table_b"), name="table_neighbor_unique_constraint"
            ),
        ),
    ]
131 changes: 94 additions & 37 deletions bd_api/apps/api/v1/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import Q
from django.urls import reverse
from ordered_model.models import OrderedModel

Expand Down Expand Up @@ -1024,53 +1025,25 @@ def full_coverage(self) -> str:

@property
def neighbors(self) -> list[dict]:
    """Return the stored neighbors of this table, highest score first.

    Reads the ``TableNeighbor`` rows in which this table is either side of
    the pair. No similarity filter and no result limit are applied here.

    Returns:
        A list of dicts with the keys ``table_id``, ``table_name``,
        ``dataset_id``, ``dataset_name`` and ``score``, where ``score`` is
        the stored directory similarity rounded to two decimals plus the
        popularity of the neighbor's dataset.
    """
    pairs = (
        TableNeighbor.objects
        .filter(Q(table_a=self) | Q(table_b=self))
        # Fetch both tables and their datasets in the same query instead of
        # issuing extra queries for every row in the loop below.
        .select_related("table_a__dataset", "table_b__dataset")
    )  # fmt: skip
    all_neighbors = []
    for neighbor in pairs:
        # The filter guarantees this table is one side of the pair; take the
        # other side. A single expression keeps ``table`` always bound.
        table = neighbor.table_a if neighbor.table_b == self else neighbor.table_b
        score_directory = neighbor.similarity_of_directory
        score_popularity = table.dataset.popularity
        all_neighbors.append(
            {
                "table_id": str(table.pk),
                "table_name": table.name,
                "dataset_id": str(table.dataset.id),
                "dataset_name": table.dataset.name,
                "score": round(score_directory, 2) + score_popularity,
            }
        )
    # Ascending sort followed by a reversal (rather than ``reverse=True``)
    # keeps the existing ordering of entries with equal scores.
    return sorted(all_neighbors, key=lambda item: item["score"])[::-1]

@property
def last_updated_at(self):
Expand Down Expand Up @@ -1113,6 +1086,45 @@ def get_similarity_of_directory(self, other: "Table"):
intersection = self_directories.intersection(other_directories)
return len(intersection) / len(self_directories), intersection

def get_neighbors(self) -> list[dict]:
    """Compute similarity scores between this table and its candidates.

    Candidates are all other tables that are not directories, are not in
    the ``under_review`` status and have at least one column pointing at a
    directory primary key. A candidate is kept only when its area, datetime
    and directory similarities are all truthy.

    Returns:
        One dict per kept candidate with the keys ``table_a`` (this table),
        ``table_b`` (the candidate), ``score_area``, ``score_datetime``,
        ``score_directory`` and ``score_popularity``. An empty list when
        this table has no directory columns outside the
        ``diretorios_data_tempo`` dataset.
    """
    own_columns = (
        self.columns
        .filter(directory_primary_key__isnull=False)
        .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo")
        .all()
    )  # fmt: skip
    own_directories = {column.directory_primary_key for column in own_columns}
    if not own_directories:
        return []

    candidates = (
        Table.objects
        .exclude(id=self.id)
        .exclude(is_directory=True)
        .exclude(status__slug__in=["under_review"])
        .filter(columns__directory_primary_key__isnull=False)
        .distinct()
        .all()
    )  # fmt: skip

    found = []
    for candidate in candidates:
        area = self.get_similarity_of_area(candidate)
        period = self.get_similarity_of_datetime(candidate)
        # The shared columns returned alongside the score are not needed here.
        directory, _ = self.get_similarity_of_directory(candidate)
        popularity = candidate.dataset.popularity
        if area and period and directory:
            found.append(
                {
                    "table_a": self,
                    "table_b": candidate,
                    "score_area": area,
                    "score_datetime": period,
                    "score_directory": directory,
                    "score_popularity": popularity,
                }
            )
    return found

def clean(self):
"""
Clean method for Table model
Expand Down Expand Up @@ -1157,6 +1169,51 @@ def clean(self):
raise ValidationError(errors)


class TableNeighbor(BaseModel):
    """Stored similarity scores between a pair of tables."""

    table_a = models.ForeignKey(
        Table,
        on_delete=models.DO_NOTHING,
        related_name="tableneighbor_a_set",
    )
    table_b = models.ForeignKey(
        Table,
        on_delete=models.DO_NOTHING,
        related_name="tableneighbor_b_set",
    )

    similarity = models.FloatField(default=0)
    similarity_of_area = models.FloatField(default=0)
    similarity_of_datetime = models.FloatField(default=0)
    similarity_of_directory = models.FloatField(default=0)

    class Meta:
        constraints = [
            # At most one row per ordered (table_a, table_b) pair.
            models.UniqueConstraint(
                fields=["table_a", "table_b"],
                name="table_neighbor_unique_constraint",
            ),
        ]

    @property
    def dict(self) -> dict:
        """Return the four similarity scores as a plain dictionary."""
        return {
            "similarity": self.similarity,
            "similarity_of_area": self.similarity_of_area,
            "similarity_of_datetime": self.similarity_of_datetime,
            "similarity_of_directory": self.similarity_of_directory,
        }

    def clean(self) -> None:
        """Validate that the pair is ordered and made of two distinct tables.

        Raises:
            ValidationError: when ``table_a`` has a greater primary key than
                ``table_b``, or when both sides are the same table.
        """
        errors = {}
        # Compare the raw foreign-key values: dereferencing ``self.table_a``
        # on an instance whose relation is unset raises
        # RelatedObjectDoesNotExist instead of a validation error.
        pk_a, pk_b = self.table_a_id, self.table_b_id
        if pk_a is not None and pk_b is not None:
            if pk_a > pk_b:
                errors["order"] = "Table primary keys should be ordered"
            if pk_a == pk_b:
                errors["unique"] = "Table neighbors A & B shouldn't be the same"
        # NOTE(review): "order" and "unique" are not field names; a ModelForm
        # may reject such keys when collecting errors - confirm before
        # exposing this model through a form or the admin.
        if errors:
            raise ValidationError(errors)
        return super().clean()


class BigQueryType(BaseModel):
"""Model definition for BigQueryType."""

Expand Down
10 changes: 9 additions & 1 deletion bd_api/apps/api/v1/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas import read_gbq
from requests import get

from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table
from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table, TableNeighbor
from bd_api.custom.client import Messenger, get_gbq_client, get_gcs_client
from bd_api.custom.environment import production_task

Expand Down Expand Up @@ -117,6 +117,14 @@ def get_uncompressed_file_size(table: Table, bq_table: GBQTable) -> int | None:
messenger.send()


@periodic_task(crontab(day_of_week="0", hour="6", minute="0"))
@production_task
def update_table_neighbors_task():
    """Recompute the neighbors of every table and store them.

    For each pair returned by ``Table.get_neighbors`` the matching
    ``TableNeighbor`` row is created or updated. The pair itself is the
    lookup; the scores go through ``defaults`` so that a changed score
    updates the existing row instead of attempting to insert a second row
    for the same pair.
    """
    for table in Table.objects.all():
        for neighbor in table.get_neighbors():
            TableNeighbor.objects.update_or_create(
                table_a=neighbor["table_a"],
                table_b=neighbor["table_b"],
                # Translate the ``score_*`` keys into the model's field
                # names. ``score_popularity`` has no column on the model and
                # ``similarity`` is left untouched.
                defaults={
                    "similarity_of_area": neighbor["score_area"],
                    "similarity_of_datetime": neighbor["score_datetime"],
                    "similarity_of_directory": neighbor["score_directory"],
                },
            )


@periodic_task(crontab(day_of_week="1-5", hour="7", minute="0"))
@production_task
def update_page_views_task(backfill: bool = False):
Expand Down

0 comments on commit 3361657

Please sign in to comment.