Commit 76e0fb3

added missing file to previous commit and checked in cache for OMIM2MONDO

leokim-l committed Sep 10, 2024
1 parent 1e70ff6 commit 76e0fb3
Showing 5 changed files with 260 additions and 1 deletion.
166 changes: 166 additions & 0 deletions caches/cache_log.txt
@@ -0,0 +1,166 @@
Timestamp: 20240821-140112

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=15970, misses=8315, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=90813, misses=14568, maxsize=16384, currsize=14568

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=29929, misses=17045, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=189862, misses=18113, maxsize=16384, currsize=16384

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=36854, misses=19422, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=220689, misses=18252, maxsize=16384, currsize=16384

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=56160, misses=25978, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=326321, misses=19439, maxsize=16384, currsize=16384

Timestamp: 20240822-193603

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=23718, misses=10045, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=118177, misses=5610, maxsize=16384, currsize=16384

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=53220, misses=27955, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=316509, misses=9986, maxsize=16384, currsize=16384

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=81216, misses=36794, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=425550, misses=10637, maxsize=16384, currsize=16384

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=100499, misses=43378, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=532067, misses=11525, maxsize=16384, currsize=16384

Timestamp: 20240828-114052

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=23726, misses=10037, maxsize=4096, currsize=4096
omim_mappings cache info:
CacheInfo: hits=116389, misses=7706, maxsize=16384, currsize=16384

Timestamp: 20240903-201528

Timestamp: 20240904-111909

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=12774
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=74660, misses=6556, maxsize=524288, currsize=19330
omim_mappings cache info:
CacheInfo: hits=64985, misses=8, maxsize=524288, currsize=20618

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=109269, misses=8782, maxsize=524288, currsize=21556
omim_mappings cache info:
CacheInfo: hits=90157, misses=8, maxsize=524288, currsize=20618

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=134134, misses=9936, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=115413, misses=10, maxsize=524288, currsize=20620

Timestamp: 20240904-115833

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

Timestamp: 20240904-121924

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710
omim_mappings cache info:
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0

Timestamp: 20240905-132835

gpt-4o/results.tsv
score_grounded_result cache info:
CacheInfo: hits=39813, misses=2626, maxsize=524288, currsize=25336
omim_mappings cache info:
CacheInfo: hits=21842, misses=279, maxsize=524288, currsize=20899

gpt-4/results.tsv
score_grounded_result cache info:
CacheInfo: hits=97434, misses=5307, maxsize=524288, currsize=28017
omim_mappings cache info:
CacheInfo: hits=53565, misses=936, maxsize=524288, currsize=21556

gpt-4-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=143644, misses=6097, maxsize=524288, currsize=28807
omim_mappings cache info:
CacheInfo: hits=62521, misses=1074, maxsize=524288, currsize=21694

gpt-3.5-turbo/results.tsv
score_grounded_result cache info:
CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
omim_mappings cache info:
CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613
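
Aside: the CacheInfo lines above are in the format reported by Python's functools.lru_cache. A minimal sketch of how one of these log entries could be produced; the decorated function below is a hypothetical stand-in, since the actual malco caching code is not part of this diff (and the later runs with maxsize=524288 are backed by the on-disk *.db caches added below):

from functools import lru_cache

@lru_cache(maxsize=4096)
def score_grounded_result(label: str) -> float:
    ...  # hypothetical stand-in for the real scoring call

# cache_info() returns a named tuple with hits, misses, maxsize, currsize
ci = score_grounded_result.cache_info()
print("score_grounded_result cache info:")
print(f"CacheInfo: hits={ci.hits}, misses={ci.misses}, maxsize={ci.maxsize}, currsize={ci.currsize}")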

Binary file added caches/omim_mappings_cache.db
Binary file not shown.
Binary file added caches/score_grounded_result_cache.db
Binary file not shown.
92 changes: 92 additions & 0 deletions src/malco/analysis/disease_avail_knowledge.py
@@ -0,0 +1,92 @@
# Try to parametrize how much is known about each disease. Beyond
# eval_diagnose_category (which looks at the MONDO categories), there are two ideas:
# Idea (0): is (number of HPOs present, number of HPOs excluded) correlated with
# whether a disease is found?
# Data sources: (1) HPOA and (2) the Monarch KG.
# (1) Parse out disease genes discovered after 2008/9 (first item in HPOA) and look
#     for a correlation between annotation date and whether the disease is correctly
#     diagnosed. Hypothesis: the older the annotation, the easier the diagnosis.
# (2) To start, for the two broad categories found/not-found, count the average
#     number of all links; after that, the average number of links of some kind;
#     then something more graph-based, such as centrality? We may need to project
#     something out first to find signal in the noise.
import sys
import pandas as pd
import numpy as np
import datetime as dt

hpoa_file_path = "/Users/leonardo/IdeaProjects/maxodiff/data/phenotype.hpoa"
# The first lines of phenotype.hpoa are metadata; row 4 holds the column names.
hpoa_df = pd.read_csv(hpoa_file_path, sep="\t", header=4)

hpoa_cleaned = pd.DataFrame()
hpoa_cleaned["database_id"] = hpoa_df["database_id"]
# A biocuration entry such as "HPO:probinson[2009-02-17]" carries its date in
# brackets; extract it as a string.
hpoa_cleaned['date'] = hpoa_df["biocuration"].str.extract(r'\[(.*?)\]')
#string_dates = str(hpoa_df["biocuration"].str.extract(r'\[(.*?)\]'))
# I am getting a bit tangled up here; it would be simpler to just do a color coding.
#hpoa_cleaned['date'] = [dt.datetime.strptime(day, '%Y-%m-%d').date() for day in string_dates]
hpoa_cleaned = hpoa_cleaned[hpoa_cleaned['database_id'].str.startswith("OMIM")]

model = str(sys.argv[1])  # model name passed as the only CLI argument, e.g. "gpt-4o"
ranking_results_filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
rank_results_df = pd.read_csv(ranking_results_filename, sep="\t")

found_diseases = []
not_found_diseases = []
# Group per phenopacket; iterating the groupby yields ("label", DataFrame) tuples.
ppkts = rank_results_df.groupby("label")[["term", "correct_term", "is_correct"]]
for ppkt in ppkts:
    # ppkt[1] is the group's DataFrame; was the correct term ranked anywhere?
    disease = ppkt[1].iloc[0]['correct_term']
    if any(ppkt[1]["is_correct"]):
        found_diseases.append(disease)
    else:
        not_found_diseases.append(disease)

found_set = set(found_diseases)
notfound_set = set(not_found_diseases)
# Diseases found for some phenopackets but missed for others.
overlap = found_set & notfound_set

print(f"Number of found diseases by {model} is {len(found_set)}.")
print(f"Number of not found diseases by {model} is {len(notfound_set)}.")
print(f"Found diseases also present in the not-found set for {model}: {len(overlap)}.\n")
# Needs some more statistics.

# header = ["disease_id", "found", "date"]

# Problematic: this drops from ~27k unique rows to ~8.2k, keeping only the first
# annotation date per disease.
hpoa_cleaned = hpoa_cleaned.drop_duplicates(subset='database_id')
# The idea here is to look at the 263-129 (gpt-4o) found diseases not present in the
# not-found set, and conversely at the never-found diseases, and check for a
# correlation with annotation date.
always_found = found_set - notfound_set # 134
never_found = notfound_set - found_set # 213

results_dict = {} # turns out being 281 entries long
found_dict = {}
notfound_dict = {}

# TODO: fill this frame from results_dict (see the sketch after this file).
results_df = pd.DataFrame(columns=["disease", "found", "date"])

for af in always_found:
    try:
        results_dict[af] = [True, hpoa_cleaned.loc[hpoa_cleaned['database_id'] == af, 'date'].item()]
        found_dict[af] = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == af, 'date'].item()
    except ValueError:
        # .item() raises if the disease has no unique HPOA entry.
        print(f"No HPOA for {af}.")
for nf in never_found:
    try:
        results_dict[nf] = [False, hpoa_cleaned.loc[hpoa_cleaned['database_id'] == nf, 'date'].item()]
        notfound_dict[nf] = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == nf, 'date'].item()
    except ValueError:
        print(f"No HPOA for {nf}.")

# Average the annotation dates per group by converting to epoch nanoseconds,
# taking the mean, and converting back to datetimes.
res_to_clean = pd.DataFrame.from_dict(results_dict).transpose()
res_to_clean.columns = ["found", "date"]
res_to_clean.date = pd.to_datetime(res_to_clean.date).values.astype(np.int64)
final_avg = pd.DataFrame(pd.to_datetime(res_to_clean.groupby('found').mean().date))
print(final_avg)
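
Aside: the TODO above leaves results_df unused. A minimal sketch (hypothetical, not part of this commit) of how results_dict could be materialized into the planned frame:

# One row per disease, with the planned columns ["disease", "found", "date"].
results_df = (
    pd.DataFrame.from_dict(results_dict, orient="index", columns=["found", "date"])
    .rename_axis("disease")
    .reset_index()
)

For reference, the script takes the model name as its only command-line argument, e.g. python src/malco/analysis/disease_avail_knowledge.py gpt-4o.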
3 changes: 2 additions & 1 deletion src/malco/post_process/ranking_utils.py
@@ -43,7 +43,8 @@ def compute_mrr_and_ranks(
 ) -> Path:
 
     # Read in results TSVs from self.output_dir that match glob results*tsv
-    out_caches = output_dir / "caches"
+    out_caches = Path("caches")
+    #out_caches = output_dir / "caches"
     out_caches.mkdir(exist_ok=True)
     output_dir = output_dir / out_subdir
     results_data = []
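
Note: this change presumably pins the cache directory to a fixed top-level caches/ path (the directory checked in with this commit) rather than a per-run output directory, so cached OMIM-to-MONDO lookups and scores can be reused across runs; the previous assignment is kept commented out.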
