Merge pull request #47 from VIDA-NYU/value-mapping-algorithms

Adding AutoFuzzyJoin Algorithm for value mapping
VIDA-NYU · Jun 4, 2024 · 04c62b0 · 04c62b0
2 parents e87c17e + 9fa32ba
commit 04c62b0
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 0 deletions.
diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py
@@ -3,9 +3,13 @@
 from polyfuzz import PolyFuzz
 from polyfuzz.models import EditDistance, TFIDF, Embeddings
 from flair.embeddings import TransformerWordEmbeddings
+from autofj import AutoFJ
+from Levenshtein import ratio
+import pandas as pd
 
 
 class BaseAlgorithm:
+
     def __init__(self, *args):
         pass
 
@@ -48,6 +52,7 @@ def match(self, current_values, target_values, threshold=0.8):
 
 
 class EmbeddingAlgorithm(BaseAlgorithm):
+
     def __init__(self, model_path="bert-base-multilingual-cased"):
         embeddings = TransformerWordEmbeddings(model_path)
         method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model")
@@ -99,3 +104,40 @@ def match(self, current_values, target_values, threshold=0.8):
                 )
 
         return matches
+
+
+class AutoFuzzyJoinAlgorithm(BaseAlgorithm):
+    """Value matcher that delegates candidate pairing to ``autofj.AutoFJ``.
+
+    Pairs proposed by the fuzzy join are re-scored with ``Levenshtein.ratio``
+    and only pairs scoring at least ``threshold`` are reported.
+    """
+
+    def __init__(self):
+        pass
+
+    def match(self, current_values, target_values, threshold=0.8):
+        """Match values of a source column against values of a target column.
+
+        Args:
+            current_values: Iterable of source values. Values are
+                de-duplicated and sorted, so they must be hashable and
+                mutually comparable.
+            target_values: Iterable of target values, same requirements.
+            threshold: Passed to AutoFJ as ``precision_target`` and also used
+                as the minimum Levenshtein ratio a pair needs to be kept.
+
+        Returns:
+            A list of ``(current_value, target_value, similarity)`` tuples.
+            The list is empty when either input is empty. If the join or the
+            scoring raises, the failure is logged and the matches collected
+            up to that point (possibly none) are returned.
+        """
+        # Function-scope import keeps this block self-contained.
+        import logging
+
+        current_values = sorted(set(current_values))
+        target_values = sorted(set(target_values))
+
+        matches = []
+        # Nothing can be joined against an empty side; skip the join setup.
+        if not current_values or not target_values:
+            return matches
+
+        df_curr_values = pd.DataFrame(
+            {"id": range(1, len(current_values) + 1), "title": current_values}
+        )
+        df_target_values = pd.DataFrame(
+            {"id": range(1, len(target_values) + 1), "title": target_values}
+        )
+
+        try:
+            autofj = AutoFJ(
+                precision_target=threshold,
+                join_function_space="autofj_md",
+                verbose=True,
+            )
+            LR_joins = autofj.join(df_curr_values, df_target_values, id_column="id")
+            if len(LR_joins) > 0:
+                # The join result is read through its "title_l"/"title_r"
+                # columns (presumably the left/right "title" inputs; verify
+                # against the autofj documentation).
+                for title_l, title_r in zip(LR_joins["title_l"], LR_joins["title_r"]):
+                    similarity = ratio(title_l, title_r)
+                    if similarity >= threshold:
+                        matches.append((title_l, title_r, similarity))
+        except Exception:
+            # Deliberately best-effort: callers still get a list back, but
+            # the failure is recorded instead of being silently discarded.
+            logging.getLogger(__name__).warning(
+                "AutoFuzzyJoin matching failed; returning %d match(es) found so far",
+                len(matches),
+                exc_info=True,
+            )
+        return matches
diff --git a/requirements.txt b/requirements.txt
@@ -13,3 +13,4 @@ altair
 bokeh
 panel
 Levenshtein
+autofj