-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added missing file to previous commit and checked in cache for OMIM2M…
…ONDO
- Loading branch information
Showing
5 changed files
with
260 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
Timestamp: 20240821-140112 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=15970, misses=8315, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=90813, misses=14568, maxsize=16384, currsize=14568 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=29929, misses=17045, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=189862, misses=18113, maxsize=16384, currsize=16384 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=36854, misses=19422, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=220689, misses=18252, maxsize=16384, currsize=16384 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=56160, misses=25978, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=326321, misses=19439, maxsize=16384, currsize=16384 | ||
|
||
Timestamp: 20240822-193603 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=23718, misses=10045, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=118177, misses=5610, maxsize=16384, currsize=16384 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=53220, misses=27955, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=316509, misses=9986, maxsize=16384, currsize=16384 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=81216, misses=36794, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=425550, misses=10637, maxsize=16384, currsize=16384 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=100499, misses=43378, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=532067, misses=11525, maxsize=16384, currsize=16384 | ||
|
||
Timestamp: 20240828-114052 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=23726, misses=10037, maxsize=4096, currsize=4096 | ||
omim_mappings cache info: | ||
CacheInfo: hits=116389, misses=7706, maxsize=16384, currsize=16384 | ||
|
||
Timestamp: 20240903-201528 | ||
|
||
Timestamp: 20240904-111909 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=12774 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=74660, misses=6556, maxsize=524288, currsize=19330 | ||
omim_mappings cache info: | ||
CacheInfo: hits=64985, misses=8, maxsize=524288, currsize=20618 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=109269, misses=8782, maxsize=524288, currsize=21556 | ||
omim_mappings cache info: | ||
CacheInfo: hits=90157, misses=8, maxsize=524288, currsize=20618 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=134134, misses=9936, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=115413, misses=10, maxsize=524288, currsize=20620 | ||
|
||
Timestamp: 20240904-115833 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
Timestamp: 20240904-121924 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710 | ||
omim_mappings cache info: | ||
CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0 | ||
|
||
Timestamp: 20240905-132835 | ||
|
||
gpt-4o/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=39813, misses=2626, maxsize=524288, currsize=25336 | ||
omim_mappings cache info: | ||
CacheInfo: hits=21842, misses=279, maxsize=524288, currsize=20899 | ||
|
||
gpt-4/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=97434, misses=5307, maxsize=524288, currsize=28017 | ||
omim_mappings cache info: | ||
CacheInfo: hits=53565, misses=936, maxsize=524288, currsize=21556 | ||
|
||
gpt-4-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=143644, misses=6097, maxsize=524288, currsize=28807 | ||
omim_mappings cache info: | ||
CacheInfo: hits=62521, misses=1074, maxsize=524288, currsize=21694 | ||
|
||
gpt-3.5-turbo/results.tsv | ||
score_grounded_result cache info: | ||
CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413 | ||
omim_mappings cache info: | ||
CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613 | ||
|
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Parametrize how much is known about each disease; two ideas beyond
# eval_diagnose_category (looking at the MONDO categories):
# Idea (0): are (number of HPOs present, number of HPOs excluded) correlated
#           with diseases being found?
# Data sources: (1) HPOA and (2) Monarch KG.
# (1) Parse out disease genes discovered after 2008/9 (first thing in HPOA).
#     Look for a correlation between annotation date and the disease being
#     correctly diagnosed. Hypothesis: the older, the easier to diagnose.
# (2) To start, for the two broad categories found/not-found, count the
#     average number of all links. After that, count the average number of
#     links of some kind. Then something more graphy, such as centrality?
#     Maybe need to project something out first to find signal in the noise...
import sys

import numpy as np
import pandas as pd

# --- Load and clean the HPOA annotation file --------------------------------
hpoa_file_path = "/Users/leonardo/IdeaProjects/maxodiff/data/phenotype.hpoa"
hpoa_df = pd.read_csv(hpoa_file_path, sep="\t", header=4)

hpoa_cleaned = pd.DataFrame()
hpoa_cleaned["database_id"] = hpoa_df["database_id"]
# "biocuration" entries carry a bracketed date, e.g. "HPO:curator[2017-07-13]";
# extract just the date string. Keep only OMIM diseases.
hpoa_cleaned['date'] = hpoa_df["biocuration"].str.extract(r'\[(.*?)\]')
hpoa_cleaned = hpoa_cleaned[hpoa_cleaned['database_id'].str.startswith("OMIM")]

# --- Load the model's ranking results (model name from CLI) ------------------
model = str(sys.argv[1])
ranking_results_filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
rank_results_df = pd.read_csv(ranking_results_filename, sep="\t")

# --- Partition diseases into found / not-found per phenopacket ---------------
found_diseases = []
not_found_diseases = []
ppkts = rank_results_df.groupby("label")[["term", "correct_term", "is_correct"]]
for _, ppkt_df in ppkts:
    # Each group is one phenopacket; the disease is "found" if any ranked
    # candidate for that phenopacket is marked correct.
    disease = ppkt_df.iloc[0]['correct_term']
    if any(ppkt_df["is_correct"]):
        found_diseases.append(disease)
    else:
        not_found_diseases.append(disease)

found_set = set(found_diseases)
notfound_set = set(not_found_diseases)
# Diseases that were found for some phenopackets but missed for others.
overlap = found_set & notfound_set

print(f"Number of found diseases by {model} is {len(found_set)}.")
print(f"Number of not found diseases by {model} is {len(notfound_set)}.")
print(f"Found diseases also present in not-found set, by {model} is {len(overlap)}.\n")
# Need some more statistics.

# Problematic: goes from ~27k unique values down to 8.2k.
hpoa_cleaned = hpoa_cleaned.drop_duplicates(subset='database_id')
# Idea: look at the diseases that are *always* found vs *never* found
# (e.g. 263-129 for gpt-4o) and correlate with annotation date.
always_found = found_set - notfound_set  # 134
never_found = notfound_set - found_set  # 213

results_dict = {}  # turns out being 281 long
found_dict = {}
notfound_dict = {}

# TODO: build a tidy results_df (disease, found, date) instead of the dicts.

for af in always_found:
    try:
        # .item() raises ValueError when there is no (or no unique) match.
        date = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == af, 'date'].item()
        results_dict[af] = [True, date]
        found_dict[af] = date
    except ValueError:
        print(f"No HPOA for {af}.")
for nf in never_found:
    try:
        # BUG FIX: the original looked up the stale loop variable `af` here,
        # so notfound_dict was filled with the wrong disease's date.
        date = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == nf, 'date'].item()
        results_dict[nf] = [False, date]
        notfound_dict[nf] = date
    except ValueError:
        print(f"No HPOA for {nf}.")

# --- Average annotation date per group (found vs never found) ----------------
res_to_clean = pd.DataFrame.from_dict(results_dict).transpose()
res_to_clean.columns = ["found", "date"]
# Convert to int64 nanosecond timestamps so the group mean is well defined.
res_to_clean.date = pd.to_datetime(res_to_clean.date).values.astype(np.int64)
final_avg = pd.DataFrame(pd.to_datetime(res_to_clean.groupby('found').mean().date))
print(final_avg)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters