diff --git a/bdikit/value_matching/gene.py b/bdikit/value_matching/gene.py
new file mode 100644
index 0000000..d2435ac
--- /dev/null
+++ b/bdikit/value_matching/gene.py
@@ -0,0 +1,99 @@
+from collections import defaultdict
+from operator import itemgetter
+from typing import Dict, List
+
+import pymart as pm
+
+from bdikit.value_matching.base import BaseValueMatcher, ValueMatch
+from bdikit.config import VALUE_MATCHING_THRESHOLD
+
+# Column labels of the table returned by `pm.fetch_data`.
+# NOTE(review): the two target columns are spelled out for a human target;
+# confirm how PyMart labels them before passing another `target_species`.
+SOURCE_ID_COLUMN = "Gene stable ID"
+TARGET_ID_COLUMN = "Human gene stable ID"
+SIMILARITY_COLUMN = "%id. target Human gene identical to query gene"
+
+
+class Gene(BaseValueMatcher):
+    """Match gene IDs of one species to their homologs in another species.
+
+    The homology table is fetched through PyMart when the matcher is created
+    and indexed in memory, so `match` only performs dictionary lookups.
+    """
+
+    def __init__(
+        self,
+        source_species: str = "mmusculus",
+        target_species: str = "human",
+        dataset_name: str = "mmusculus_gene_ensembl",
+        top_k: int = 1,
+        threshold: float = VALUE_MATCHING_THRESHOLD,
+    ):
+        """Fetch the homology table and index it by source gene ID.
+
+        Args:
+            source_species: First entry of the `hom_species` list sent to PyMart.
+            target_species: Second entry of the `hom_species` list sent to PyMart.
+            dataset_name: Name of the PyMart dataset to query.
+            top_k: Maximum number of best-ranked candidates that `match`
+                examines for each source value.
+            threshold: Minimum similarity, on a 0-1 scale, that a candidate
+                needs to be returned by `match`.
+        """
+        hom_species = [source_species, target_species]
+        hom_query = ["ensembl_gene", "orthology_confidence", "perc_id"]
+
+        data = pm.fetch_data(
+            dataset_name=dataset_name, hom_species=hom_species, hom_query=hom_query
+        )
+        data = (
+            data[[SOURCE_ID_COLUMN, TARGET_ID_COLUMN, SIMILARITY_COLUMN]]
+            .drop_duplicates()
+            .dropna()
+        )
+
+        # Group the candidates by source gene in one pass over the columns
+        # instead of a per-group `apply` + `iterrows`, which is slow on
+        # large tables.
+        by_source: Dict[str, List[dict]] = defaultdict(list)
+        for source_id, target_id, similarity in zip(
+            data[SOURCE_ID_COLUMN], data[TARGET_ID_COLUMN], data[SIMILARITY_COLUMN]
+        ):
+            by_source[source_id].append({"match": target_id, "similarity": similarity})
+
+        # Best candidate first, so `match` can stop at the first candidate
+        # that falls below the threshold.
+        for ranked in by_source.values():
+            ranked.sort(key=itemgetter("similarity"), reverse=True)
+
+        self.matches = dict(by_source)
+        self.threshold = threshold
+        self.top_k = top_k
+
+    def match(
+        self,
+        source_values: List[str],
+        target_values: List[str],
+    ) -> List[ValueMatch]:
+        """Return the homolog matches between source and target gene IDs.
+
+        For every source value the `top_k` best-ranked candidates are
+        examined, and a candidate is returned when its similarity reaches
+        `threshold` and its ID is one of `target_values`.
+        """
+        matches = []
+        target_values_set = set(target_values)
+
+        for source_value in source_values:
+            for candidate in self.matches.get(source_value, [])[: self.top_k]:
+                # Similarities are stored as percentages; compare on a 0-1 scale.
+                score = candidate["similarity"] / 100
+                if score < self.threshold:
+                    # Candidates are sorted best-first, so the rest fail too.
+                    break
+                target = candidate["match"]
+                if target in target_values_set:
+                    matches.append(ValueMatch(source_value, target, score))
+
+        return matches
diff --git a/bdikit/value_matching/matcher_factory.py b/bdikit/value_matching/matcher_factory.py
index b121830..803a247 100644
--- a/bdikit/value_matching/matcher_factory.py
+++ b/bdikit/value_matching/matcher_factory.py
@@ -19,6 +19,7 @@ class ValueMatchers(Enum):
         "bdikit.value_matching.polyfuzz.FastTextValueMatcher",
     )
     GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")
+    GENE = ("gene", "bdikit.value_matching.gene.Gene")
 
     def __init__(self, matcher_name: str, matcher_path: str):
         self.matcher_name = matcher_name
diff --git a/extra_requirements.txt b/extra_requirements.txt
new file mode 100644
index 0000000..c8b6cd1
--- /dev/null
+++ b/extra_requirements.txt
@@ -0,0 +1 @@
+PyMart: genes
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6e80184..cba2128 100644
--- a/setup.py
+++ b/setup.py
@@ -1,38 +1,71 @@
 import os
+import re
 import setuptools
+from collections import defaultdict
 
 
-package_name = 'bdi-kit'
-package_dir = 'bdikit'
+package_name = "bdi-kit"
+package_dir = "bdikit"
 
 
 def read_readme():
-    with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file:
+    with open(
+        os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf8"
+    ) as file:
         return file.read()
 
 
 def read_version():
-    module_path = os.path.join(package_dir, '__init__.py')
+    module_path = os.path.join(package_dir, "__init__.py")
     with open(module_path) as file:
         for line in file:
-            parts = line.strip().split(' ')
-            if parts and parts[0] == '__version__':
-                return parts[-1].strip("'").strip("\"")
+            parts = line.strip().split(" ")
+            if parts and parts[0] == "__version__":
+                return parts[-1].strip("'").strip('"')
 
-    raise KeyError('Version not found in {0}'.format(module_path))
+    raise KeyError(f"Version not found in {module_path}")
 
 
 def get_requires():
-    with open('requirements.txt') as fp:
-        dependencies = [line for line in fp if line and not line.startswith('#')]
+    with open("requirements.txt") as fp:
+        dependencies = [line for line in fp if line and not line.startswith("#")]
 
     return dependencies
 
 
+def get_extra_requires():
+    """Build the `extras_require` mapping from `extra_requirements.txt`.
+
+    Each non-comment line reads `<requirement>[: <tag>, <tag>, ...]`. A
+    requirement is registered under every tag listed after the colon and
+    under its own distribution name; the `full` extra collects all of them.
+    """
+    extra_dependencies = defaultdict(set)
+    with open("extra_requirements.txt") as fp:
+        for line in fp:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            # Strip both halves so neither the requirement nor a tag keeps
+            # stray whitespace or the trailing newline of the line.
+            requirement, _, tag_spec = line.partition(":")
+            requirement = requirement.strip()
+            tags = {tag.strip() for tag in tag_spec.split(",") if tag.strip()}
+            tags.add(re.split("[<=>~!]", requirement)[0].strip())
+            for tag in tags:
+                extra_dependencies[tag].add(requirement)
+
+    # `full` installs every optional dependency.
+    extra_dependencies["full"] = set().union(*extra_dependencies.values())
+
+    # Sorted lists keep the generated metadata deterministic.
+    return {tag: sorted(reqs) for tag, reqs in extra_dependencies.items()}
+
+
 long_description = read_readme()
 version = read_version()
 requires = get_requires()
-extra_requires = {}
+extra_requires = get_extra_requires()
 
 setuptools.setup(
     name=package_name,
@@ -40,21 +73,22 @@ def get_requires():
     packages=setuptools.find_packages(),
     install_requires=requires,
     extras_require=extra_requires,
-    python_requires='>=3.9',
+    python_requires=">=3.9",
     description="bdi-kit library",
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    url='https://github.com/VIDA-NYU/bdi-kit',
+    long_description_content_type="text/markdown",
+    url="https://github.com/VIDA-NYU/bdi-kit",
     include_package_data=True,
-    author='',
-    author_email='',
-    maintainer='',
-    maintainer_email='',
-    keywords=['askem', 'data integration', 'nyu'],
-    license='Apache-2.0',
+    author="",
+    author_email="",
+    maintainer="",
+    maintainer_email="",
+    keywords=["bdf", "data integration", "nyu"],
+    license="Apache-2.0",
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'Intended Audience :: Science/Research',
-        'License :: OSI Approved :: Apache Software License',
-        'Topic :: Scientific/Engineering',
-    ])
\ No newline at end of file
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering",
+    ],
+)