From 33616577eb7e5a77b9fca48aafc07aad3f5cec66 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Fri, 15 Mar 2024 15:56:00 -0300 Subject: [PATCH] feat: add table neighbor model --- bd_api/apps/api/v1/admin.py | 10 ++ .../migrations/0028_tableneighbor_and_more.py | 54 ++++++++ bd_api/apps/api/v1/models.py | 131 +++++++++++++----- bd_api/apps/api/v1/tasks.py | 10 +- 4 files changed, 167 insertions(+), 38 deletions(-) create mode 100644 bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py diff --git a/bd_api/apps/api/v1/admin.py b/bd_api/apps/api/v1/admin.py index fdcb16ae..1f16a4fc 100644 --- a/bd_api/apps/api/v1/admin.py +++ b/bd_api/apps/api/v1/admin.py @@ -66,6 +66,7 @@ update_page_views_task, update_search_index_task, update_table_metadata_task, + update_table_neighbors_task, ) from bd_api.custom.client import get_gbq_client @@ -262,6 +263,14 @@ def update_table_metadata(modeladmin: ModelAdmin, request: HttpRequest, queryset update_table_metadata.short_description = "Atualizar metadados das tabelas" +def update_table_neighbors(modeladmin: ModelAdmin, request: HttpRequest, queryset: QuerySet): + """Update all table neighbors""" + update_table_neighbors_task() + + +update_table_neighbors.short_description = "Atualizar os vizinhos das tabelas" + + def reorder_tables(modeladmin, request, queryset): """Reorder tables in respect to dataset""" @@ -513,6 +522,7 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin): reorder_columns, reset_column_order, update_table_metadata, + update_table_neighbors, update_page_views, ] inlines = [ diff --git a/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py b/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py new file mode 100644 index 00000000..8a05864e --- /dev/null +++ b/bd_api/apps/api/v1/migrations/0028_tableneighbor_and_more.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Generated by Django 4.2.10 on 2024-03-15 18:55 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("v1", "0027_dataset_page_views_table_page_views"), + ] + + operations = [ + migrations.CreateModel( + name="TableNeighbor", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("similarity", models.FloatField(default=0)), + ("similarity_of_area", models.FloatField(default=0)), + ("similarity_of_datetime", models.FloatField(default=0)), + ("similarity_of_directory", models.FloatField(default=0)), + ( + "table_a", + models.ForeignKey( + on_delete=django.db.models.deletion.DO_NOTHING, + related_name="tableneighbor_a_set", + to="v1.table", + ), + ), + ( + "table_b", + models.ForeignKey( + on_delete=django.db.models.deletion.DO_NOTHING, + related_name="tableneighbor_b_set", + to="v1.table", + ), + ), + ], + ), + migrations.AddConstraint( + model_name="tableneighbor", + constraint=models.UniqueConstraint( + fields=("table_a", "table_b"), name="table_neighbor_unique_constraint" + ), + ), + ] diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 68ad935b..541a92f1 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -8,6 +8,7 @@ from django.core.exceptions import ValidationError from django.db import models +from django.db.models import Q from django.urls import reverse from ordered_model.models import OrderedModel @@ -1024,53 +1025,25 @@ def full_coverage(self) -> str: @property def neighbors(self) -> list[dict]: - """Similiar tables and columns - - Tables and columns with similar directories - - Tables and columns with similar coverages or tags - """ - self_columns = ( - self.columns - .filter(directory_primary_key__isnull=False) - .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo") - .all() - ) # fmt: skip - self_directories = set(c.directory_primary_key for c in self_columns) - if not self_directories: - return [] - all_tables = ( - Table.objects - .exclude(id=self.id) - .exclude(is_directory=True) - .exclude(status__slug__in=["under_review"]) - .filter(columns__directory_primary_key__isnull=False) - .distinct() - .all() - ) # fmt: skip + """Similiar tables and columns without filters""" all_neighbors = [] - for table in all_tables: - score_area = self.get_similarity_of_area(table) - score_datetime = self.get_similarity_of_datetime(table) - score_directory, columns = self.get_similarity_of_directory(table) + for neighbor in TableNeighbor.objects.filter(Q(table_a=self) | Q(table_b=self)).all(): + if neighbor.table_a == self: + table = neighbor.table_b + if neighbor.table_b == self: + table = neighbor.table_a + score_directory = neighbor.similarity_of_directory score_popularity = table.dataset.popularity - if not score_area or not score_datetime or not score_directory: - continue - column_id = [] - column_name = [] - for column in columns: - column_id.append(str(column.id)) - column_name.append(column.name) all_neighbors.append( { - "column_id": column_id, - "column_name": column_name, - "table_id": str(table.id), + "table_id": str(table.pk), "table_name": table.name, "dataset_id": str(table.dataset.id), "dataset_name": table.dataset.name, "score": round(score_directory, 2) + score_popularity, } ) - return sorted(all_neighbors, key=lambda item: item["score"])[::-1][:20] + return sorted(all_neighbors, key=lambda item: item["score"])[::-1] @property def last_updated_at(self): @@ -1113,6 +1086,45 @@ def get_similarity_of_directory(self, other: "Table"): intersection = self_directories.intersection(other_directories) return len(intersection) / len(self_directories), intersection + def get_neighbors(self) -> list[dict]: + self_columns = ( + self.columns + .filter(directory_primary_key__isnull=False) + .exclude(directory_primary_key__table__dataset__slug="diretorios_data_tempo") + .all() + ) # fmt: skip + self_directories = set(c.directory_primary_key for c in self_columns) + if not self_directories: + return [] + all_tables = ( + Table.objects + .exclude(id=self.id) + .exclude(is_directory=True) + .exclude(status__slug__in=["under_review"]) + .filter(columns__directory_primary_key__isnull=False) + .distinct() + .all() + ) # fmt: skip + all_neighbors = [] + for table in all_tables: + score_area = self.get_similarity_of_area(table) + score_datetime = self.get_similarity_of_datetime(table) + score_directory, columns = self.get_similarity_of_directory(table) + score_popularity = table.dataset.popularity + if not score_area or not score_datetime or not score_directory: + continue + all_neighbors.append( + { + "table_a": self, + "table_b": table, + "score_area": score_area, + "score_datetime": score_datetime, + "score_directory": score_directory, + "score_popularity": score_popularity, + } + ) + return all_neighbors + def clean(self): """ Clean method for Table model @@ -1157,6 +1169,51 @@ def clean(self): raise ValidationError(errors) +class TableNeighbor(BaseModel): + table_a = models.ForeignKey( + Table, + on_delete=models.DO_NOTHING, + related_name="tableneighbor_a_set", + ) + table_b = models.ForeignKey( + Table, + on_delete=models.DO_NOTHING, + related_name="tableneighbor_b_set", + ) + + similarity = models.FloatField(default=0) + similarity_of_area = models.FloatField(default=0) + similarity_of_datetime = models.FloatField(default=0) + similarity_of_directory = models.FloatField(default=0) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["table_a", "table_b"], + name="table_neighbor_unique_constraint", + ), + ] + + @property + def dict(self) -> dict: + return { + "similarity": self.similarity, + "similarity_of_area": self.similarity_of_area, + "similarity_of_datetime": self.similarity_of_datetime, + "similarity_of_directory": self.similarity_of_directory, + } + + def clean(self) -> None: + errors = {} + if self.table_a.pk > self.table_b.pk: + errors["order"] = "Table primary keys should be ordered" + if self.table_a.pk == self.table_b.pk: + errors["unique"] = "Table neighbors A & B shouldn't be the same" + if errors: + raise ValidationError(errors) + return super().clean() + + class BigQueryType(BaseModel): """Model definition for BigQueryType.""" diff --git a/bd_api/apps/api/v1/tasks.py b/bd_api/apps/api/v1/tasks.py index 3886df34..5e68c72e 100644 --- a/bd_api/apps/api/v1/tasks.py +++ b/bd_api/apps/api/v1/tasks.py @@ -10,7 +10,7 @@ from pandas import read_gbq from requests import get -from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table +from bd_api.apps.api.v1.models import Dataset, RawDataSource, Table, TableNeighbor from bd_api.custom.client import Messenger, get_gbq_client, get_gcs_client from bd_api.custom.environment import production_task @@ -117,6 +117,14 @@ def get_uncompressed_file_size(table: Table, bq_table: GBQTable) -> int | None: messenger.send() +@periodic_task(crontab(day_of_week="0", hour="6", minute="0")) +@production_task +def update_table_neighbors_task(): + for table in Table.objects.all(): + for neighbor in table.get_neighbors(): + TableNeighbor.objects.update_or_create(**neighbor) + + @periodic_task(crontab(day_of_week="1-5", hour="7", minute="0")) @production_task def update_page_views_task(backfill: bool = False):