feat: turn into similarity ranking
vncsna committed Feb 16, 2024
1 parent 5bde105 commit 3da9cb6
Showing 1 changed file with 74 additions and 96 deletions.
bd_api/apps/api/v1/models.py (170 changes: 74 additions & 96 deletions)
@@ -66,11 +66,6 @@ def get_date_time(date_times):
     )


-def get_unique_list(lst: list[dict]):
-    """Get unique list of dicts"""
-    return [dict(t) for t in {tuple(d.items()) for d in lst}]
-
-
 class UUIDHiddenIdForm(forms.ModelForm):
     """Form to include UUID in inline formes (Table, Column and Coverage)"""

@@ -218,21 +213,21 @@ def coverage_type(self):

     coverage_type.short_description = "Coverage Type"

-    def has_area_intersection(self, other: "Coverage"):
+    def similarity_of_area(self, other: "Coverage"):
         if not self.area:
-            return False
+            return 0
         if not other.area:
-            return False
+            return 0
         if self.area.name.startswith(other.area.name):
-            return True
+            return 1
         if other.area.name.startswith(self.area.name):
-            return True
-        return False
+            return 1
+        return 0

-    def has_datetime_intersection(self, other: "Coverage"):
+    def similarity_of_datetime(self, other: "Coverage"):
         for dt_self in self.datetime_ranges.all():
             for dt_other in other.datetime_ranges.all():
-                if dt_self.has_datetime_intersection(dt_other):
+                if dt_self.similarity_of_datetime(dt_other):
                     return True
         return False
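For context, a minimal standalone sketch of the new 0/1 area scoring. The `SimpleNamespace` stand-ins and the area names below are illustrative only, not objects or slugs from the codebase:

from types import SimpleNamespace

def similarity_of_area(cov_a, cov_b):
    # Mirrors Coverage.similarity_of_area: score 1 when either area name
    # is a prefix of the other, 0 when an area is missing or unrelated.
    if not cov_a.area or not cov_b.area:
        return 0
    if cov_a.area.name.startswith(cov_b.area.name):
        return 1
    if cov_b.area.name.startswith(cov_a.area.name):
        return 1
    return 0

br = SimpleNamespace(area=SimpleNamespace(name="br"))
br_sp = SimpleNamespace(area=SimpleNamespace(name="br_sp"))
world = SimpleNamespace(area=SimpleNamespace(name="world"))

assert similarity_of_area(br, br_sp) == 1   # "br_sp" starts with "br"
assert similarity_of_area(br_sp, world) == 0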

@@ -1141,28 +1136,54 @@ def neighbors(self):
         - Tables and columns with similar directories
         - Tables and columns with similar coverages or tags
         """
+        all_tables = (
+            Table.objects.exclude(id=self.id)
+            .exclude(is_directory=True)
+            .exclude(status__slug__in=["under_review"])
+            .filter(columns__directory_primary_key__isnull=False)
+            .distinct()
+            .all()
+        )
         all_neighbors = []
-        for column in self.columns.all():
-            for neighbor in column.neighbors:
-                all_neighbors.append(neighbor)
-                if len(all_neighbors) >= 20:
-                    break
-        return sorted(all_neighbors, key=lambda item: item[3])[::-1]
+        for table in all_tables:
+            score_area = self.similarity_of_area(table)
+            score_datetime = self.similarity_of_datetime(table)
+            score_directory, cols = self.similarity_of_directory(table)
+            if score_directory:
+                all_neighbors.append(
+                    (
+                        cols,
+                        table,
+                        table.dataset,
+                        score_area + score_datetime + score_directory,
+                    )
+                )
+            logger.debug(f"[similarity_area] {self} {table} {score_area}")
+            logger.debug(f"[similarity_datetime] {self} {table} {score_datetime}")
+            logger.debug(f"[similarity_directory] {self} {table} {score_directory}")
+
+        return sorted(all_neighbors, key=lambda item: item[-1])[::-1][:20]

     def get_graphql_neighbors(self) -> list[dict]:
         all_neighbors = []
-        for column, table, dataset, _ in self.neighbors:
+        for columns, table, dataset, score in self.neighbors:
+            column_id = []
+            column_name = []
+            for column in columns:
+                column_id.append(str(column.id))
+                column_name.append(column.name)
             all_neighbors.append(
                 {
-                    "column_id": str(column.id),
-                    "column_name": column.name,
+                    "column_id": column_id,
+                    "column_name": column_name,
                     "table_id": str(table.id),
                     "table_name": table.name,
                     "dataset_id": str(dataset.id),
                     "dataset_name": dataset.name,
+                    "score": score,
                 }
             )
-        return get_unique_list(all_neighbors)
+        return all_neighbors

     @property
     def last_updated_at(self):
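A compact illustration of the ranking step at the end of `neighbors`, with hypothetical pre-computed tuples standing in for the Django objects (the names and scores below are made up):

# Each entry mirrors the tuple appended above: (columns, table, dataset, summed score).
entries = [
    (["state_id"], "table_a", "dataset_a", 1 + 1 + 0.50),
    (["city_id"], "table_b", "dataset_b", 0 + 1 + 0.10),
    (["year_id"], "table_c", "dataset_c", 0 + 0 + 0.40),
]

# Same idiom as the property: sort ascending by the last element (the score),
# reverse to get the highest first, then keep at most 20 neighbors.
top = sorted(entries, key=lambda item: item[-1])[::-1][:20]
assert [t[1] for t in top] == ["table_a", "table_b", "table_c"]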
@@ -1172,29 +1193,35 @@ def last_updated_at(self):
     def get_graphql_last_updated_at(self):
         return self.last_updated_at

-    def has_area_intersection(self, other: "Table"):
+    def similarity_of_area(self, other: "Table"):
+        count_all = 0
+        count_yes = 0
         for cov_self in self.coverages.all():
             for cov_other in other.coverages.all():
-                if cov_self.has_area_intersection(cov_other):
-                    logger.debug(f"[table_neighbor_by_area] {self.name} => {other.name}")
-                    return True
-        return False
-
-    def has_datetime_intersection(self, other: "Table"):
+                count_all += 1
+                if cov_self.similarity_of_area(cov_other):
+                    count_yes += 1
+        return count_yes / count_all if count_all else 0
+
+    def similarity_of_datetime(self, other: "Table"):
+        count_all = 0
+        count_yes = 0
         for cov_self in self.coverages.all():
             for cov_other in other.coverages.all():
-                if cov_self.has_datetime_intersection(cov_other):
-                    logger.debug(f"[table_neighbor_by_date] {self.name} => {other.name}")
-                    return True
-        return False
-
-    def has_directory_intersection(self, other: "Table"):
-        for col_self in self.columns.all():
-            for col_other in other.columns.all():
-                if col_self.has_directory_intersection(col_other):
-                    logger.debug(f"[table_neighbor_by_dire] {self.name} => {other.name}")
-                    return True
-        return False
+                count_all += 1
+                if cov_self.similarity_of_datetime(cov_other):
+                    count_yes = +1
+        return count_yes / count_all if count_all else 0
+
+    def similarity_of_directory(self, other: "Table"):
+        self_cols = self.columns.all()
+        self_dirs = self.columns.filter(directory_primary_key__isnull=False).all()
+        other_cols = other.columns.all()
+        other_dirs = other.columns.filter(directory_primary_key__isnull=False).all()
+        intersection = set([*self_dirs, *other_dirs])
+        intersection_size = len(intersection)
+        intersection_max_size = min(len(self_cols), len(other_cols))
+        return intersection_size / intersection_max_size, intersection

     def clean(self):
         """
@@ -1370,55 +1397,6 @@ def full_coverage(self) -> str:
     def get_graphql_full_coverage(self):
         return self.full_coverage

-    @property
-    def neighbors(self):
-        """Similiar tables and columns
-        - Columns with similar directories
-        - Columns with similar coverages or tags
-        """
-        if not self.directory_primary_key:
-            return []
-        all_columns = (
-            Column.objects.filter(directory_primary_key=self.directory_primary_key)
-            .exclude(id=self.id)
-            .all()
-        )
-        all_neighbors = []
-        for column in all_columns:
-            if self.table.has_area_intersection(column.table):
-                if self.table.has_datetime_intersection(column.table):
-                    all_neighbors.append(
-                        (
-                            column,
-                            column.table,
-                            column.table.dataset,
-                            column.table.dataset.page_views,
-                        )
-                    )
-            if len(all_neighbors) >= 20:
-                break
-        return sorted(all_neighbors, key=lambda item: item[3])[::-1]
-
-    def get_graphql_neighbors(self) -> list[dict]:
-        all_neighbors = []
-        for column, table, dataset, _ in self.neighbors:
-            all_neighbors.append(
-                {
-                    "column_id": str(column.id),
-                    "column_name": column.name,
-                    "table_id": str(table.id),
-                    "table_name": table.name,
-                    "dataset_id": str(dataset.id),
-                    "dataset_name": dataset.name,
-                }
-            )
-        return get_unique_list(all_neighbors)
-
-    def has_directory_intersection(self, other: "Column"):
-        if self.directory_primary_key == other.directory_primary_key:
-            return True
-        return False
-
     def clean(self) -> None:
         """Clean method for Column model"""
         errors = {}
@@ -1880,14 +1858,14 @@ def until(self):
             self.end_second or 0,
         )

-    def has_datetime_intersection(self, other: "DateTimeRange"):
+    def similarity_of_datetime(self, other: "DateTimeRange"):
         if not self.since:
-            return False
+            return 0
         if not other.until:
-            return False
+            return 0
         if self.until >= other.since:
-            return True
-        return False
+            return 1
+        return 0

     def clean(self) -> None:
         """
