-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8217711
commit fe84419
Showing
4 changed files
with
129 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import List | ||
import pymart as pm | ||
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch | ||
from bdikit.config import VALUE_MATCHING_THRESHOLD | ||
|
||
|
||
class Gene(BaseValueMatcher):
    """Value matcher that maps gene stable IDs to human ortholog gene IDs.

    The ortholog table is fetched once, at construction time, with
    ``pymart.fetch_data`` and cached in ``self.matches``: a dict that maps a
    source gene stable ID to a list of ``{"match", "similarity"}`` dicts
    ordered by decreasing similarity.

    Args:
        source_species: First entry of the ``hom_species`` list passed to
            ``pymart.fetch_data``.
        target_species: Second entry of the ``hom_species`` list passed to
            ``pymart.fetch_data``.
        dataset_name: Dataset name passed to ``pymart.fetch_data``.
        top_k: Maximum number of best-ranked candidates examined per source
            value in :meth:`match`.
        threshold: Minimum score (similarity / 100) a candidate needs in
            order to be reported by :meth:`match`.
    """

    # Column labels read from the fetched table.
    # NOTE(review): the two "Human" labels are fixed, so ``target_species``
    # only affects the query, not which columns are read -- confirm how
    # pymart names its columns before using a non-human target.
    _SOURCE_COLUMN = "Gene stable ID"
    _TARGET_COLUMN = "Human gene stable ID"
    _SIMILARITY_COLUMN = "%id. target Human gene identical to query gene"

    def __init__(
        self,
        source_species: str = "mmusculus",
        target_species: str = "human",
        dataset_name: str = "mmusculus_gene_ensembl",
        top_k: int = 1,
        threshold: float = VALUE_MATCHING_THRESHOLD,
    ):
        hom_species = [source_species, target_species]
        hom_query = ["ensembl_gene", "orthology_confidence", "perc_id"]

        # Presumably returns a pandas DataFrame; it is used as one below.
        data = pm.fetch_data(
            dataset_name=dataset_name, hom_species=hom_species, hom_query=hom_query
        )

        columns = [self._SOURCE_COLUMN, self._TARGET_COLUMN, self._SIMILARITY_COLUMN]
        # Non-inplace chain: never mutates a frame derived from another one.
        data = data[columns].drop_duplicates().dropna()

        # Group candidate targets by source gene ID, preserving row order.
        candidates = {}
        for source_id, target_id, similarity in zip(
            data[self._SOURCE_COLUMN],
            data[self._TARGET_COLUMN],
            data[self._SIMILARITY_COLUMN],
        ):
            candidates.setdefault(source_id, []).append(
                {"match": target_id, "similarity": similarity}
            )

        # Best candidate first; the sort is stable, so ties keep row order.
        for ranked in candidates.values():
            ranked.sort(key=lambda candidate: candidate["similarity"], reverse=True)

        self.matches = candidates
        self.threshold = threshold
        self.top_k = top_k

    def match(
        self,
        source_values: List[str],
        target_values: List[str],
    ) -> List[ValueMatch]:
        """Return the cached ortholog matches that land in ``target_values``.

        For every source value, only its ``top_k`` best-ranked candidates are
        examined. A candidate is reported when its score (similarity / 100)
        is at least ``threshold`` and its ID is one of ``target_values``.
        Source values missing from the cached table yield no match.

        NOTE(review): ``top_k`` is applied before the membership filter, so a
        lower-ranked candidate that is in ``target_values`` is not considered
        when a better-ranked one is absent from it -- confirm this is intended.
        """
        matches = []
        target_values_set = set(target_values)

        for source_value in source_values:
            ranked = self.matches.get(source_value, [])
            for candidate in ranked[: self.top_k]:
                score = candidate["similarity"] / 100
                if score < self.threshold:
                    # Candidates are sorted, so the remaining ones score lower.
                    break
                target = candidate["match"]
                if target in target_values_set:
                    matches.append(ValueMatch(source_value, target, score))

        return matches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
PyMart: genes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,60 +1,86 @@ | ||
import os | ||
import re | ||
import setuptools | ||
from collections import defaultdict | ||
|
||
|
||
# Distribution name handed to setuptools.setup(name=...).
package_name = "bdi-kit"
# Directory of the importable package; read_version() looks for
# __init__.py inside it.
package_dir = "bdikit"
|
||
|
||
def read_readme():
    """Return the full text of the ``README.md`` located next to this file.

    The file is decoded as UTF-8. Any error raised by ``open`` (for example
    when the file is missing) propagates to the caller.
    """
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, encoding="utf8") as file:
        return file.read()
|
||
|
||
def read_version():
    """Return the version declared in ``<package_dir>/__init__.py``.

    The first line that starts with a ``__version__ = ...`` assignment wins.
    Surrounding quotes are dropped, whitespace around ``=`` is optional and a
    trailing ``# comment`` is ignored.

    Raises:
        KeyError: If no ``__version__`` assignment is found in the module.
    """
    module_path = os.path.join(package_dir, "__init__.py")
    with open(module_path, encoding="utf8") as file:
        for line in file:
            # Capture the value up to the closing quote, a comment or a space.
            found = re.match(
                r"""__version__\s*=\s*['"]?([^'"#\s]+)""", line.strip()
            )
            if found:
                return found.group(1)

    raise KeyError(f"Version not found in {module_path}")
|
||
|
||
def get_requires():
    """Return the requirement strings listed in ``requirements.txt``.

    The file is read from the current working directory. Each line is
    stripped of surrounding whitespace; blank lines and lines whose first
    non-blank character is ``#`` are skipped.
    """
    with open("requirements.txt", encoding="utf8") as fp:
        stripped_lines = (line.strip() for line in fp)
        # A raw line always ends with "\n" and is therefore truthy, so the
        # emptiness test is only meaningful after stripping.
        dependencies = [
            line for line in stripped_lines if line and not line.startswith("#")
        ]

    return dependencies
|
||
|
||
def get_extra_requires():
    """Build the ``extras_require`` mapping from ``extra_requirements.txt``.

    The file is read from the current working directory. Each meaningful line
    has the form ``requirement`` or ``requirement: tag1, tag2``. Blank lines
    and lines whose first non-blank character is ``#`` are skipped.

    Every requirement is registered under each of its tags and under its own
    bare package name (the text before any version specifier). A final
    ``full`` tag collects every requirement.

    Returns:
        A ``defaultdict(set)`` that maps a tag to its set of requirements.
    """
    extra_dependencies = defaultdict(set)
    with open("extra_requirements.txt", encoding="utf8") as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            # Only the first colon separates the requirement from its tags.
            requirement, _, tag_spec = line.partition(":")
            requirement = requirement.strip()
            if not requirement:
                continue

            tags = {tag.strip() for tag in tag_spec.split(",") if tag.strip()}
            # The bare package name is always usable as a tag of its own.
            tags.add(re.split(r"[<=>!~;\[\s]", requirement)[0])
            for tag in tags:
                extra_dependencies[tag].add(requirement)

    # Add tag `full` at the end
    extra_dependencies["full"] = {
        requirement
        for requirements in extra_dependencies.values()
        for requirement in requirements
    }

    return extra_dependencies
|
||
|
||
# Gather the package metadata from the files sitting next to this script.
long_description = read_readme()
version = read_version()
requires = get_requires()
# Optional dependency groups, keyed by tag (including the catch-all "full").
extra_requires = get_extra_requires()

setuptools.setup(
    name=package_name,
    version=version,
    packages=setuptools.find_packages(),
    install_requires=requires,
    extras_require=extra_requires,
    python_requires=">=3.9",
    description="bdi-kit library",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/VIDA-NYU/bdi-kit",
    include_package_data=True,
    author="",
    author_email="",
    maintainer="",
    maintainer_email="",
    keywords=["bdf", "data integration", "nyu"],
    license="Apache-2.0",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering",
    ],
)