Skip to content

Commit

Permalink
Merge pull request #47 from VIDA-NYU/value-mapping-algorithms
Browse files Browse the repository at this point in the history
Adding AutoFuzzyJoin Algorithm for value mapping
  • Loading branch information
EduardoPena authored Jun 4, 2024
2 parents e87c17e + 9fa32ba commit 04c62b0
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
42 changes: 42 additions & 0 deletions bdikit/mapping_algorithms/value_mapping/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
from polyfuzz import PolyFuzz
from polyfuzz.models import EditDistance, TFIDF, Embeddings
from flair.embeddings import TransformerWordEmbeddings
from autofj import AutoFJ
from Levenshtein import ratio
import pandas as pd


class BaseAlgorithm:

# No-op constructor: accepts any positional arguments and ignores them.
def __init__(self, *args):
pass

Expand Down Expand Up @@ -48,6 +52,7 @@ def match(self, current_values, target_values, threshold=0.8):


class EmbeddingAlgorithm(BaseAlgorithm):

def __init__(self, model_path="bert-base-multilingual-cased"):
embeddings = TransformerWordEmbeddings(model_path)
method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model")
Expand Down Expand Up @@ -99,3 +104,40 @@ def match(self, current_values, target_values, threshold=0.8):
)

return matches


class AutoFuzzyJoinAlgorithm(BaseAlgorithm):
    """Value matcher backed by the AutoFuzzyJoin (``autofj``) library.

    Candidate pairs produced by ``AutoFJ.join`` are re-scored with the
    Levenshtein ``ratio`` and kept only when that score reaches the
    requested threshold.
    """

    def __init__(self):
        """Create the matcher; it takes no configuration."""
        super().__init__()

    def match(self, current_values, target_values, threshold=0.8):
        """Match source values against target values with AutoFuzzyJoin.

        Args:
            current_values: Iterable of source values. Duplicates are
                removed and the remaining values are sorted before joining.
            target_values: Iterable of target values, de-duplicated and
                sorted the same way.
            threshold: Passed to AutoFJ as ``precision_target`` and also
                used as the minimum Levenshtein ratio a joined pair must
                reach to be returned.

        Returns:
            A list of ``(current_value, target_value, similarity)`` tuples.
            The list is empty when either input is empty, and contains
            whatever was collected so far if the join fails midway.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        current_values = sorted(set(current_values))
        target_values = sorted(set(target_values))

        matches = []
        # Nothing can be joined against an empty side; skip the AutoFJ run.
        if not current_values or not target_values:
            return matches

        # AutoFJ is called with an explicit id column next to the text column.
        df_curr_values = pd.DataFrame(
            {"id": range(1, len(current_values) + 1), "title": current_values}
        )
        df_target_values = pd.DataFrame(
            {"id": range(1, len(target_values) + 1), "title": target_values}
        )

        try:
            autofj = AutoFJ(
                precision_target=threshold,
                join_function_space="autofj_md",
                verbose=True,
            )
            LR_joins = autofj.join(df_curr_values, df_target_values, id_column="id")
            if len(LR_joins) > 0:
                # The joined frame carries the left/right text columns with
                # "_l" / "_r" suffixes.
                for title_l, title_r in zip(LR_joins["title_l"], LR_joins["title_r"]):
                    similarity = ratio(title_l, title_r)
                    if similarity >= threshold:
                        matches.append((title_l, title_r, similarity))
        except Exception as exc:
            # Matching stays best-effort (callers still get a list), but the
            # failure is reported instead of being silently discarded.
            logging.getLogger(__name__).warning(
                "AutoFuzzyJoin value matching failed: %s", exc
            )
        return matches
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ altair
bokeh
panel
Levenshtein
autofj

0 comments on commit 04c62b0

Please sign in to comment.