Skip to content

Commit

Permalink
feat: Add human gene matcher
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 11, 2024
1 parent 8217711 commit fe84419
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 25 deletions.
76 changes: 76 additions & 0 deletions bdikit/value_matching/gene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import List
import pymart as pm
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch
from bdikit.config import VALUE_MATCHING_THRESHOLD


class Gene(BaseValueMatcher):
    """Value matcher that maps gene identifiers to human gene identifiers.

    A homology table is fetched once, at construction time, through
    ``pymart.fetch_data`` and converted into an in-memory lookup
    (``self.matches``) from each source gene ID to its candidate human gene
    IDs, ranked by decreasing similarity.
    """

    # Column names read from the frame returned by ``pm.fetch_data``.
    # NOTE(review): these are hard-coded to the "Human" columns even though
    # ``target_species`` is a parameter -- other targets presumably need
    # different column names; confirm against pymart.
    _SOURCE_COLUMN = "Gene stable ID"
    _TARGET_COLUMN = "Human gene stable ID"
    _SIMILARITY_COLUMN = "%id. target Human gene identical to query gene"

    def __init__(
        self,
        source_species: str = "mmusculus",
        target_species: str = "human",
        dataset_name: str = "mmusculus_gene_ensembl",
        top_k: int = 1,
        threshold: float = VALUE_MATCHING_THRESHOLD,
    ):
        """Fetch the homology table and build the source-to-target lookup.

        Args:
            source_species: Species name passed to pymart in ``hom_species``.
            target_species: Species name passed to pymart in ``hom_species``.
            dataset_name: Dataset passed to ``pm.fetch_data``.
            top_k: Maximum number of ranked candidates considered per source
                value in :meth:`match`.
            threshold: Minimum score (similarity / 100) a candidate needs in
                order to be reported by :meth:`match`.
        """
        hom_species = [source_species, target_species]
        hom_query = ["ensembl_gene", "orthology_confidence", "perc_id"]

        data = pm.fetch_data(
            dataset_name=dataset_name, hom_species=hom_species, hom_query=hom_query
        )

        columns = [self._SOURCE_COLUMN, self._TARGET_COLUMN, self._SIMILARITY_COLUMN]
        data = data[columns].drop_duplicates().dropna()

        # Group the candidates per source gene in a single pass over the rows
        # (row order is preserved inside each group).
        candidates = {}
        for source_id, target_id, similarity in zip(
            data[self._SOURCE_COLUMN],
            data[self._TARGET_COLUMN],
            data[self._SIMILARITY_COLUMN],
        ):
            candidates.setdefault(source_id, []).append(
                {"match": target_id, "similarity": similarity}
            )

        # Best candidate first; the sort is stable, so ties keep row order.
        for ranked in candidates.values():
            ranked.sort(key=lambda candidate: candidate["similarity"], reverse=True)

        self.matches = candidates
        self.threshold = threshold
        self.top_k = top_k

    def match(
        self,
        source_values: List[str],
        target_values: List[str],
    ) -> List[ValueMatch]:
        """Match source gene IDs against the allowed target gene IDs.

        For each source value, up to ``top_k`` of its ranked candidates are
        inspected. A candidate is reported when its score (similarity / 100)
        reaches ``threshold`` and its ID is one of ``target_values``.

        Args:
            source_values: Gene IDs to look up in the homology table.
            target_values: Gene IDs that are acceptable as match targets.

        Returns:
            A list of ``ValueMatch(source, target, score)`` objects; source
            values without a qualifying candidate contribute nothing.
        """
        matches = []
        target_values_set = set(target_values)

        for source_value in source_values:
            match_results = self.matches.get(source_value, [])
            for match_result in match_results[: self.top_k]:
                score = match_result["similarity"] / 100
                target = match_result["match"]
                if score >= self.threshold:
                    if target in target_values_set:
                        matches.append(ValueMatch(source_value, target, score))
                else:
                    # Candidates are sorted by decreasing similarity, so no
                    # later candidate can reach the threshold either.
                    break

        return matches
1 change: 1 addition & 0 deletions bdikit/value_matching/matcher_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class ValueMatchers(Enum):
"bdikit.value_matching.polyfuzz.FastTextValueMatcher",
)
GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")
GENE = ("gene", "bdikit.value_matching.gene.Gene")

def __init__(self, matcher_name: str, matcher_path: str):
self.matcher_name = matcher_name
Expand Down
1 change: 1 addition & 0 deletions extra_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PyMart: genes
76 changes: 51 additions & 25 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,86 @@
import os
import re
import setuptools
from collections import defaultdict


# Distribution name published to the package index, and the directory of the
# importable package that holds ``__init__.py`` (read by read_version).
package_name = "bdi-kit"
package_dir = "bdikit"


def read_readme():
    """Return the text of the README.md that sits next to this script."""
    with open(
        os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf8"
    ) as file:
        return file.read()


def read_version():
    """Return the ``__version__`` string declared in the package's __init__.py.

    The file is scanned line by line for an assignment to ``__version__``;
    surrounding whitespace and quotes are removed from the value.

    Raises:
        KeyError: If no ``__version__`` assignment is found.
    """
    module_path = os.path.join(package_dir, "__init__.py")
    with open(module_path, encoding="utf8") as file:
        for line in file:
            # Partition on "=" so that both `__version__ = "x"` and
            # `__version__="x"` are recognised.
            name, separator, value = line.partition("=")
            if separator and name.strip() == "__version__":
                return value.strip().strip("'").strip('"')

    raise KeyError(f"Version not found in {module_path}")


def get_requires():
    """Return the requirement strings listed in requirements.txt.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are skipped.
    """
    with open("requirements.txt") as fp:
        stripped_lines = (line.strip() for line in fp)
        # A raw line read from a file is never empty (it still has its
        # newline), so the emptiness check has to run on the stripped text.
        dependencies = [
            line for line in stripped_lines if line and not line.startswith("#")
        ]

    return dependencies


def get_extra_requires():
    """Build the ``extras_require`` mapping from extra_requirements.txt.

    Each non-blank, non-comment line has the form ``requirement`` or
    ``requirement: tag1, tag2``. Every requirement is registered under each
    of its tags and under its own bare package name. A final ``full`` entry
    collects every requirement in the file.

    Returns:
        A ``defaultdict(set)`` mapping extra name -> set of requirement
        strings.
    """
    extra_dependencies = defaultdict(set)
    with open("extra_requirements.txt") as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            requirement = line
            tags = set()
            if ":" in line:
                # Split on the last colon only, so that a requirement which
                # itself contains a colon does not raise on unpacking.
                requirement, tag_spec = line.rsplit(":", 1)
                requirement = requirement.strip()
                tags.update(
                    tag.strip() for tag in tag_spec.split(",") if tag.strip()
                )
            # The bare package name (text before any version specifier,
            # extras bracket, marker or whitespace) is also usable as a tag.
            tags.add(re.split(r"[<=>~!;\[\s]", requirement)[0])
            for tag in tags:
                extra_dependencies[tag].add(requirement)

    # Add tag `full` at the end
    extra_dependencies["full"] = {
        requirement
        for requirements in extra_dependencies.values()
        for requirement in requirements
    }

    return extra_dependencies


# Gather the metadata that is read from files in the repository.
long_description = read_readme()
version = read_version()
requires = get_requires()
# Optional dependency groups, e.g. `pip install bdi-kit[genes]`.
extra_requires = get_extra_requires()

setuptools.setup(
    name=package_name,
    version=version,
    packages=setuptools.find_packages(),
    install_requires=requires,
    extras_require=extra_requires,
    python_requires=">=3.9",
    description="bdi-kit library",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/VIDA-NYU/bdi-kit",
    include_package_data=True,
    author="",
    author_email="",
    maintainer="",
    maintainer_email="",
    keywords=["bdf", "data integration", "nyu"],
    license="Apache-2.0",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering",
    ],
)

0 comments on commit fe84419

Please sign in to comment.