From fe844197eef069e5e0c6129ca67753eb4d57b2d5 Mon Sep 17 00:00:00 2001
From: Roque Lopez <rlopezc27@gmail.com>
Date: Mon, 11 Nov 2024 17:13:59 -0500
Subject: [PATCH] feat: Add human gene matcher

---
 bdikit/value_matching/gene.py            | 76 ++++++++++++++++++++++++
 bdikit/value_matching/matcher_factory.py |  1 +
 extra_requirements.txt                   |  1 +
 setup.py                                 | 76 ++++++++++++++++--------
 4 files changed, 129 insertions(+), 25 deletions(-)
 create mode 100644 bdikit/value_matching/gene.py
 create mode 100644 extra_requirements.txt

diff --git a/bdikit/value_matching/gene.py b/bdikit/value_matching/gene.py
new file mode 100644
index 0000000..d2435ac
--- /dev/null
+++ b/bdikit/value_matching/gene.py
@@ -0,0 +1,91 @@
+from typing import List
+import pymart as pm
+from bdikit.value_matching.base import BaseValueMatcher, ValueMatch
+from bdikit.config import VALUE_MATCHING_THRESHOLD
+
+
+class Gene(BaseValueMatcher):
+    """Value matcher that maps gene IDs to their homologs in another species.
+
+    The homology table is downloaded with ``pymart.fetch_data`` when the matcher
+    is constructed and kept in ``self.matches``; ``match`` only does lookups.
+
+    NOTE(review): the columns read from the fetched table are hard-coded to the
+    human-target labels, so a ``target_species`` other than the default
+    presumably needs different column names -- confirm against PyMart output.
+    """
+
+    def __init__(
+        self,
+        source_species: str = "mmusculus",
+        target_species: str = "human",
+        dataset_name: str = "mmusculus_gene_ensembl",
+        top_k: int = 1,
+        threshold: float = VALUE_MATCHING_THRESHOLD,
+    ):
+        """Fetch the homology table and index it by source gene ID.
+
+        Args:
+            source_species: Species of the source genes, passed to PyMart.
+            target_species: Species of the target genes, passed to PyMart.
+            dataset_name: Name of the dataset queried through PyMart.
+            top_k: Maximum number of homologs considered per source value.
+            threshold: Minimum similarity, on a 0-1 scale, for a match to be kept.
+        """
+        source_column = "Gene stable ID"
+        target_column = "Human gene stable ID"
+        similarity_column = "%id. target Human gene identical to query gene"
+
+        data = pm.fetch_data(
+            dataset_name=dataset_name,
+            hom_species=[source_species, target_species],
+            hom_query=["ensembl_gene", "orthology_confidence", "perc_id"],
+        )
+        # Stable sort keeps the fetched order among equally similar homologs.
+        data = (
+            data[[source_column, target_column, similarity_column]]
+            .drop_duplicates()
+            .dropna()
+            .sort_values(similarity_column, ascending=False, kind="stable")
+        )
+
+        # Map every source gene ID to its homologs, best similarity first.
+        self.matches = {}
+        for source, target, similarity in data.itertuples(index=False, name=None):
+            self.matches.setdefault(source, []).append(
+                {"match": target, "similarity": similarity}
+            )
+        self.threshold = threshold
+        self.top_k = top_k
+
+    def match(
+        self,
+        source_values: List[str],
+        target_values: List[str],
+    ) -> List[ValueMatch]:
+        """Match source gene IDs to their homologs present in ``target_values``.
+
+        Args:
+            source_values: Gene IDs to look up in the homology table.
+            target_values: Candidate gene IDs; only these may be returned.
+
+        Returns:
+            One ``ValueMatch`` (source, target, similarity) per retained homolog,
+            with the similarity rescaled from a percentage to the 0-1 range.
+        """
+        matches = []
+        target_values_set = set(target_values)
+
+        for source_value in source_values:
+            # NOTE(review): top_k is applied before filtering by target_values, so
+            # a homolog ranked below top_k is never considered even if it is one.
+            for match_result in self.matches.get(source_value, [])[: self.top_k]:
+                score = match_result["similarity"] / 100
+                if score < self.threshold:
+                    # Homologs are sorted by similarity, so the rest fail too.
+                    break
+                target = match_result["match"]
+                if target in target_values_set:
+                    matches.append(ValueMatch(source_value, target, score))
+
+        return matches
diff --git a/bdikit/value_matching/matcher_factory.py b/bdikit/value_matching/matcher_factory.py
index b121830..803a247 100644
--- a/bdikit/value_matching/matcher_factory.py
+++ b/bdikit/value_matching/matcher_factory.py
@@ -19,6 +19,7 @@ class ValueMatchers(Enum):
         "bdikit.value_matching.polyfuzz.FastTextValueMatcher",
     )
     GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")
+    GENE = ("gene", "bdikit.value_matching.gene.Gene")
 
     def __init__(self, matcher_name: str, matcher_path: str):
         self.matcher_name = matcher_name
diff --git a/extra_requirements.txt b/extra_requirements.txt
new file mode 100644
index 0000000..c8b6cd1
--- /dev/null
+++ b/extra_requirements.txt
@@ -0,0 +1 @@
+PyMart:                     genes
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6e80184..cba2128 100644
--- a/setup.py
+++ b/setup.py
@@ -1,38 +1,63 @@
 import os
+import re
 import setuptools
+from collections import defaultdict
 
 
-package_name = 'bdi-kit'
-package_dir = 'bdikit'
+package_name = "bdi-kit"
+package_dir = "bdikit"
 
 
 def read_readme():
-    with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file:
+    with open(
+        os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf8"
+    ) as file:
         return file.read()
 
 
 def read_version():
-    module_path = os.path.join(package_dir, '__init__.py')
+    module_path = os.path.join(package_dir, "__init__.py")
     with open(module_path) as file:
         for line in file:
-            parts = line.strip().split(' ')
-            if parts and parts[0] == '__version__':
-                return parts[-1].strip("'").strip("\"")
+            parts = line.strip().split(" ")
+            if parts and parts[0] == "__version__":
+                return parts[-1].strip("'").strip('"')
 
-    raise KeyError('Version not found in {0}'.format(module_path))
+    raise KeyError(f"Version not found in {module_path}")
 
 
 def get_requires():
-    with open('requirements.txt') as fp:
-        dependencies = [line for line in fp if line and not line.startswith('#')]
+    with open("requirements.txt") as fp:
+        dependencies = [line for line in fp if line and not line.startswith("#")]
 
         return dependencies
 
 
+def get_extra_requires():
+    """Map every extra tag (and `full`) to its set of requirement strings."""
+    extra_dependencies = defaultdict(set)
+    with open("extra_requirements.txt") as fp:
+        for line in fp:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            # Format: `requirement[: tag1, tag2]`; split at the first colon only
+            requirement, _, tag_list = line.partition(":")
+            requirement = requirement.strip()
+            tags = {tag.strip() for tag in tag_list.split(",") if tag.strip()}
+            tags.add(re.split(r"[<=>~!;\[\s]", requirement)[0])
+            for tag in tags:
+                extra_dependencies[tag].add(requirement)
+
+    # Add tag `full` at the end, once every individual tag has been collected
+    extra_dependencies["full"] = set().union(*extra_dependencies.values())
+    return extra_dependencies
+
+
 long_description = read_readme()
 version = read_version()
 requires = get_requires()
-extra_requires = {}
+extra_requires = get_extra_requires()
 
 setuptools.setup(
     name=package_name,
@@ -40,21 +65,22 @@ def get_requires():
     packages=setuptools.find_packages(),
     install_requires=requires,
     extras_require=extra_requires,
-    python_requires='>=3.9',
+    python_requires=">=3.9",
     description="bdi-kit library",
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    url='https://github.com/VIDA-NYU/bdi-kit',
+    long_description_content_type="text/markdown",
+    url="https://github.com/VIDA-NYU/bdi-kit",
     include_package_data=True,
-    author='',
-    author_email='',
-    maintainer='',
-    maintainer_email='',
-    keywords=['askem', 'data integration', 'nyu'],
-    license='Apache-2.0',
+    author="",
+    author_email="",
+    maintainer="",
+    maintainer_email="",
+    keywords=["bdf", "data integration", "nyu"],
+    license="Apache-2.0",
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'Intended Audience :: Science/Research',
-        'License :: OSI Approved :: Apache Software License',
-        'Topic :: Scientific/Engineering',
-    ])
\ No newline at end of file
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering",
+    ],
+)