Skip to content

Commit

Permalink
Merge pull request #22 from monarch-initiative/post_process_add_mondo…
Browse files Browse the repository at this point in the history
…_utils

Post process add mondo utils
  • Loading branch information
justaddcoffee authored Jun 6, 2024
2 parents 6c3dca5 + 22a249f commit 9fca933
Show file tree
Hide file tree
Showing 19 changed files with 1,389 additions and 80 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ venv/
__pycache__
data/*
prompts/*
inputdir
outputdir
.*db
inputdir/all_phenopackets.zip
inputdir/phenopacket-store/
.openai_cache.db
Binary file modified inputdir/phenopacket2prompt.jar
Binary file not shown.
26 changes: 13 additions & 13 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions src/malco/post_process/compute_mrr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import csv
from pathlib import Path
import pandas as pd
import pickle as pkl
from malco.post_process.mondo_score_utils import score_grounded_result


def compute_mrr(output_dir, prompt_dir, correct_answer_file) -> tuple:
    """
    Compute the Mean Reciprocal Rank (MRR) of every results file under output_dir.

    Every file below ``output_dir`` whose name starts with "result" and ends
    with ".tsv" is read as a tab-separated table; its "label", "term" and
    "score" columns are used. For each file the terms are ranked per label by
    descending score, each term is checked against the correct answer with
    ``score_grounded_result`` and the best reciprocal rank per label is
    averaged. The file names and their MRR values are written as two rows to
    ``<output_dir>/plots/plotting_data.tsv``.

    Args:
        output_dir: Directory (str or Path) that is searched recursively for
            results files and that receives the "plots" sub-directory.
        prompt_dir: Directory containing the correct-answer file; it is joined
            onto the current working directory.
        correct_answer_file: Name of a headerless TSV whose three columns are
            read as description, term and label.

    Returns:
        tuple: ``(plot_data_file, plot_dir, num_ppkt)`` where ``num_ppkt`` is
        the number of distinct labels in the last results file that was read
        (0 if no results file was found).
    """
    # Accept plain strings as well as Path objects: the "/" joins below need a Path.
    output_dir = Path(output_dir)

    # Read in results TSVs from output_dir that match glob results*tsv
    # TODO Leo: make more robust, had other results*tsv files from previous testing
    # Proposal, go for exact file name match defined somewhere as global/static/immutable
    results_data = []
    results_files = []
    num_ppkt = 0
    for subdir, _dirs, files in os.walk(output_dir):
        for filename in files:
            if filename.startswith("result") and filename.endswith(".tsv"):
                file_path = os.path.join(subdir, filename)
                df = pd.read_csv(file_path, sep="\t")
                num_ppkt = df["label"].nunique()
                results_data.append(df)
                # Keep the path relative to output_dir so the sub-directory
                # stays visible next to the file name
                results_files.append(os.path.relpath(file_path, output_dir))

    # Read in correct answers from prompt_dir
    answers_path = os.path.join(os.getcwd(), prompt_dir, correct_answer_file)
    answers = pd.read_csv(
        answers_path, sep="\t", header=None, names=["description", "term", "label"]
    )

    # Mapping each label to its correct term
    label_to_correct_term = answers.set_index("label")["term"].to_dict()

    # Calculate the Mean Reciprocal Rank (MRR) for each file
    mrr_scores = []
    for df in results_data:
        # For each label in the results file, find if the correct term is ranked
        df["rank"] = df.groupby("label")["score"].rank(ascending=False, method="first")
        # The answers are looked up under the "_en-prompt" label, so the
        # two-letter language tag of every label is rewritten to "en" first
        label_4_non_eng = df["label"].str.replace(
            r"_[a-z][a-z]-prompt", "_en-prompt", regex=True
        )
        df["correct_term"] = label_4_non_eng.map(label_to_correct_term)

        # df['term'] is Mondo or OMIM ID, or even disease label
        # df['correct_term'] is an OMIM
        # call OAK and get OMIM IDs for df['term'] and see if df['correct_term'] is one of them
        # in the case of phenotypic series, if Mondo corresponds to grouping term, accept it
        # Plain comprehensions (rather than DataFrame.apply) also work on an empty frame.
        df["is_correct"] = [
            score_grounded_result(term, correct_term) > 0
            for term, correct_term in zip(df["term"], df["correct_term"])
        ]

        # Calculate reciprocal rank
        df["reciprocal_rank"] = [
            1 / rank if is_correct else 0
            for rank, is_correct in zip(df["rank"], df["is_correct"])
        ]
        # Calculate MRR for this file
        mrr = df.groupby("label")["reciprocal_rank"].max().mean()
        mrr_scores.append(mrr)

    print("MRR scores are:\n")
    print(mrr_scores)
    plot_dir = output_dir / "plots"
    plot_dir.mkdir(parents=True, exist_ok=True)
    plot_data_file = plot_dir / "plotting_data.tsv"

    # write out results for plotting
    with plot_data_file.open('w', newline='') as dat:
        writer = csv.writer(
            dat, quoting=csv.QUOTE_NONNUMERIC, delimiter='\t', lineterminator='\n'
        )
        writer.writerow(results_files)
        writer.writerow(mrr_scores)
    return plot_data_file, plot_dir, num_ppkt
25 changes: 25 additions & 0 deletions src/malco/post_process/generate_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import seaborn as sns
import matplotlib.pyplot as plt
import os
import csv

# Make a nice plot, use it as function or as script

def make_plots(plot_data_file, plot_dir, languages, num_ppkt):
    """
    Draw a bar plot of the MRR per results file and save it as a PNG.

    Args:
        plot_data_file: Path object of a tab-separated file whose first row
            holds the results file names and whose second row holds the MRR
            scores.
        plot_dir: Path object of the directory the figure is saved into.
        languages: Collection of languages; only its length is used, in the
            figure file name.
        num_ppkt: Value embedded in the figure file name.
    """
    with plot_data_file.open('r', newline = '') as handle:
        rows = csv.reader(handle, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
        # Row one: names of the results files; row two: the matching scores
        file_names = next(rows)
        scores = next(rows)

    print(file_names)
    print(scores)

    # Plotting the results
    sns.barplot(x = file_names, y = scores)
    plt.xlabel("Results File")
    plt.ylabel("Mean Reciprocal Rank (MRR)")
    plt.title("MRR of Correct Answers Across Different Results Files")

    # File name pattern: <number of languages>_langs_<num_ppkt>ppkt.png
    figure_name = f"{len(languages)}_langs_{num_ppkt}ppkt.png"
    plt.savefig(plot_dir / figure_name)
    plt.show()
8 changes: 5 additions & 3 deletions src/malco/post_process/mondo_score_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
PARTIAL_SCORE = 0.5


@lru_cache
@lru_cache(maxsize=4096)
def mondo_adapter() -> OboGraphInterface:
"""
Get the adapter for the MONDO ontology.
Expand All @@ -20,7 +20,7 @@ def mondo_adapter() -> OboGraphInterface:
return get_adapter("sqlite:obo:mondo")


@lru_cache()
@lru_cache(maxsize=1024)
def omim_mappings(term: str) -> List[str]:
"""
Get the OMIM mappings for a term.
Expand Down Expand Up @@ -81,7 +81,9 @@ def score_grounded_result(prediction: str, ground_truth: str) -> float:
# prediction is a MONDO that directly maps to a correct OMIM
return FULL_SCORE
mondo = mondo_adapter()
for mondo_descendant in mondo.descendants([prediction], predicates=[IS_A], reflexive=True):

descendants_list = mondo.descendants([prediction], predicates=[IS_A], reflexive=True)
for mondo_descendant in descendants_list:
if ground_truth in omim_mappings(mondo_descendant):
# prediction is a MONDO that maps to a correct OMIM via a descendant
return PARTIAL_SCORE
Expand Down
14 changes: 11 additions & 3 deletions src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
from pathlib import Path

from malco.post_process.post_process_results_format import create_standardised_results
import os


def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple) -> None:
    """
    Post-process the raw results of every language to standardised TSV format.

    For each entry of ``langs`` the sub-directory of that name is created (if
    missing) below both ``raw_results_dir`` and ``output_dir``, and the raw
    results found there are converted into ``results.tsv``.

    Args:
        raw_results_dir (Path): Path to the raw results directory.
        output_dir (Path): Path to the output directory.
        langs (tuple): Names of the per-language sub-directories to process.
    """
    for lang in langs:
        raw_results_lang = raw_results_dir / lang
        output_lang = output_dir / lang
        # parents=True so a not-yet-existing base directory does not abort the run
        raw_results_lang.mkdir(parents=True, exist_ok=True)
        output_lang.mkdir(parents=True, exist_ok=True)

        create_standardised_results(
            raw_results_dir=raw_results_lang,
            output_dir=output_lang,
            output_file_name="results.tsv",
        )
51 changes: 50 additions & 1 deletion src/malco/post_process/post_process_results_format.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,60 @@
import json
import os
from pathlib import Path
from typing import List

import pandas as pd
import yaml
from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
from pheval.utils.file_utils import all_files
from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
    """
    Read the raw result file.

    Args:
        raw_result_path(Path): Path to the raw result file.
    Returns:
        List[dict]: The documents of the raw result file, one list entry per
        YAML document as loaded by ``yaml.safe_load_all``.
    """
    # Pin the encoding: the results may contain non-English text, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(raw_result_path, 'r', encoding="utf-8") as raw_result:
        # safe_load_all is lazy, so materialise it before the file is closed
        return list(yaml.safe_load_all(raw_result))


def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                                output_file_name: str) -> pd.DataFrame:
    """
    Flatten the raw YAML results of one directory into a ranked TSV table.

    Every regular file directly inside ``raw_results_dir`` is read with
    ``read_raw_result_yaml``. For each document that has an
    "extracted_object" with a non-empty "terms" list, one row per term is
    produced: the object's label, the term, its 1-based position in the list
    as rank and the reciprocal of that rank as score.

    Args:
        raw_results_dir (Path): Directory holding the raw result files.
        output_dir (Path): Directory the TSV file is written to.
        output_file_name (str): File name of the TSV inside ``output_dir``.

    Returns:
        pd.DataFrame: The table that was written, with the columns
        "label", "term", "score" and "rank" (also when it has no rows).
    """
    columns = ['label', 'term', 'score', 'rank']
    data = []
    # Sorted so the row order does not depend on the directory listing order
    for raw_result_path in sorted(raw_results_dir.iterdir()):
        if not raw_result_path.is_file():
            continue

        for this_result in read_raw_result_yaml(raw_result_path):
            # An empty YAML document loads as None; skip anything that is not a mapping
            if not isinstance(this_result, dict):
                continue
            extracted_object = this_result.get("extracted_object")
            if not extracted_object:
                continue
            label = extracted_object.get('label')
            terms = extracted_object.get('terms')
            if not terms:
                continue
            for rank, term in enumerate(terms, start=1):
                # score is reciprocal rank
                data.append({'label': label, 'term': term, 'score': 1 / rank, 'rank': rank})

    # Explicit columns keep the header in the TSV even when there are no rows
    df = pd.DataFrame(data, columns=columns)

    # Save DataFrame to TSV
    output_path = output_dir / output_file_name
    df.to_csv(output_path, sep='\t', index=False)

    return df


# these are from the template and not currently used outside of tests

def read_raw_result(raw_result_path: Path) -> List[dict]:
"""
Read the raw result file.
Expand Down Expand Up @@ -94,7 +142,7 @@ def extract_pheval_gene_requirements(self) -> List[PhEvalGeneResult]:
)
return pheval_result


'''
def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None:
"""
Create PhEval gene tsv output from raw results.
Expand All @@ -117,3 +165,4 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None
output_dir=output_dir,
tool_result_path=raw_result_path,
)
'''
34 changes: 34 additions & 0 deletions src/malco/prepare/setup_phenopackets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import zipfile
import os
import requests

# URL of the phenopacket-store release archive (version 0.1.11) that is
# downloaded when no local copy of the phenopackets exists yet.
phenopacket_zip_url="https://github.com/monarch-initiative/phenopacket-store/releases/download/0.1.11/all_phenopackets.zip"
# TODO just point to a folder w/ ppkts
# Name of the sub-directory of the input directory into which the archive is unpacked.
phenopacket_dir="phenopacket-store"

def setup_phenopackets(self) -> str:
    """
    Make sure the phenopacket store exists locally and return its location.

    The store is expected at ``<self.input_dir>/<phenopacket_dir>``. When that
    path is missing the phenopackets are downloaded first; otherwise nothing
    is fetched.

    Args:
        self: Object providing the ``input_dir`` attribute.

    Returns:
        str: Path of the phenopacket store directory.
    """
    store_path = os.path.join(self.input_dir, phenopacket_dir)

    if not os.path.exists(store_path):
        print(f"{store_path} doesn't exist, downloading phenopackets...")
        download_phenopackets(self, phenopacket_zip_url, phenopacket_dir)
        return store_path

    print(f"{store_path} exists, skipping download.")
    return store_path


def download_phenopackets(self, phenopacket_zip_url, phenopacket_dir):
    """
    Download the phenopacket release archive and unpack it.

    The archive is saved as ``<self.input_dir>/all_phenopackets.zip`` and
    extracted into ``<self.input_dir>/<phenopacket_dir>``.

    Args:
        self: Object providing the ``input_dir`` attribute.
        phenopacket_zip_url: URL of the zip archive to download.
        phenopacket_dir: Name of the sub-directory of ``self.input_dir`` that
            receives the extracted files.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    phenopacket_store_path = os.path.join(self.input_dir, phenopacket_dir)
    zip_path = os.path.join(self.input_dir, "all_phenopackets.zip")
    os.makedirs(self.input_dir, exist_ok=True)

    # Download the phenopacket release zip file, streamed to disk in chunks so
    # the whole archive is never held in memory
    with requests.get(phenopacket_zip_url, stream=True, timeout=60) as response:
        # Fail here on an HTTP error instead of saving the error page as a "zip"
        response.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("Download completed.")

    # Create the store directory only after a successful download: an empty
    # directory left behind by a failed download would otherwise look like an
    # existing store on the next run.
    os.makedirs(phenopacket_store_path, exist_ok=True)

    # Unzip the phenopacket release zip file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(phenopacket_store_path)
    print("Unzip completed.")
29 changes: 9 additions & 20 deletions src/malco/run/run.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from pathlib import Path

from malco.run.run_tool import run_tool
from ontogpt.cli import run_multilingual_analysis
import os
# from ontogpt.cli import run_multilingual_analysis


def run(testdata_dir: Path, raw_results_dir: Path, output_dir: Path,
        langs: tuple) -> None:
    """
    Run the tool to obtain the raw results.

    For every entry of ``langs`` the ``ontogpt run-multilingual-analysis``
    command is executed on ``<cwd>/prompts/<lang>/``; its YAML output goes to
    ``<output_dir>/raw_results/<lang>/results.yaml`` and the second positional
    argument is ``<output_dir>/raw_results/<lang>/differentials_by_file/``.
    A failing or missing ``ontogpt`` is reported on stdout and the remaining
    languages are still processed.

    Args:
        testdata_dir: Path to the test data directory (not read by this function).
        raw_results_dir: Path to the raw results directory (not read by this
            function; the results location is derived from ``output_dir``).
        output_dir: Path to the output directory.
        langs: Names of the per-language prompt sub-directories to run.
    """
    # Local import keeps this block self-contained without touching the
    # module's import header.
    import subprocess

    mydir = os.getcwd()

    for lang in langs:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; the argument strings themselves are the same
        # ones the shell command line used.
        command = [
            "ontogpt",
            "-v",
            "run-multilingual-analysis",
            f"--output={output_dir}/raw_results/{lang}/results.yaml",
            f"{mydir}/prompts/{lang}/",
            f"{output_dir}/raw_results/{lang}/differentials_by_file/",
        ]
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            # e.g. the ontogpt executable is not installed / not on PATH
            print(f"Could not start ontogpt for language '{lang}': {err}")
            continue
        if completed.returncode != 0:
            print(
                f"ontogpt exited with status {completed.returncode} "
                f"for language '{lang}'"
            )
Loading

0 comments on commit 9fca933

Please sign in to comment.