Merge pull request #22 from monarch-initiative/post_process_add_mondo_utils

Post process add mondo utils
monarch-initiative · Jun 6, 2024 · 9fca933 · 9fca933
2 parents 6c3dca5 + 22a249f
commit 9fca933
Show file tree

Hide file tree

Showing 19 changed files with 1,389 additions and 80 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@ venv/
 __pycache__
 data/*
 prompts/*
-inputdir
 outputdir
+.*db
+inputdir/all_phenopackets.zip
+inputdir/phenopacket-store/
 .openai_cache.db
diff --git a/inputdir/phenopacket2prompt.jar b/inputdir/phenopacket2prompt.jar
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/malco/post_process/compute_mrr.py b/src/malco/post_process/compute_mrr.py
@@ -0,0 +1,68 @@
+import os 
+import csv
+from pathlib import Path
+import pandas as pd
+import pickle as pkl
+from malco.post_process.mondo_score_utils import score_grounded_result
+
+
+def compute_mrr(output_dir, prompt_dir, correct_answer_file) -> tuple:
+    """
+    Compute the Mean Reciprocal Rank (MRR) of every results TSV under output_dir.
+
+    Args:
+        output_dir: Directory that is walked recursively for files whose name
+            starts with "result" and ends with ".tsv". A "plots" subdirectory
+            is created inside it. May be a str or a Path.
+        prompt_dir: Directory, relative to the current working directory,
+            that contains the correct-answer file.
+        correct_answer_file: Name of a header-less TSV whose three columns are
+            read as description, term and label.
+
+    Returns:
+        tuple: (plot_data_file, plot_dir, num_ppkt), where plot_data_file is the
+        Path of the TSV written for plotting, plot_dir is the Path of the plots
+        directory and num_ppkt is the number of unique labels in the last
+        results file that was read (0 if no results file was found).
+    """
+    # Accept both str and Path; the "/" operator below needs a Path.
+    output_dir = Path(output_dir)
+
+    # Read in results TSVs from output_dir that match result*.tsv
+    #TODO Leo: make more robust, had other results*tsv files from previous testing
+    # Proposal, go for exact file name match defined somewhere as global/static/immutable
+    results_data = []
+    results_files = []
+    num_ppkt = 0
+    for subdir, _dirs, files in os.walk(output_dir):
+        for filename in files:
+            if filename.startswith("result") and filename.endswith(".tsv"):
+                file_path = os.path.join(subdir, filename)
+                df = pd.read_csv(file_path, sep="\t")
+                # NOTE: overwritten for every file, so the value returned is
+                # the one of the last results file visited by os.walk.
+                num_ppkt = df["label"].nunique()
+                results_data.append(df)
+                # Append both the subdirectory relative to output_dir and the filename
+                results_files.append(os.path.relpath(file_path, output_dir))
+    # Read in correct answers from prompt_dir
+    answers_path = os.path.join(os.getcwd(), prompt_dir, correct_answer_file)
+    answers = pd.read_csv(
+        answers_path, sep="\t", header=None, names=["description", "term", "label"]
+    )
+
+    # Mapping each label to its correct term
+    label_to_correct_term = answers.set_index("label")["term"].to_dict()
+    # Calculate the Mean Reciprocal Rank (MRR) for each file
+    mrr_scores = []
+    for df in results_data:
+        # For each label in the results file, find if the correct term is ranked
+        df["rank"] = df.groupby("label")["score"].rank(ascending=False, method="first")
+        # Correct answers are keyed by the English prompt label, so rewrite
+        # the two-letter language code of the label to "en" before the lookup.
+        label_4_non_eng = df["label"].str.replace(
+            r"_[a-z][a-z]-prompt", "_en-prompt", regex=True
+        )
+        df["correct_term"] = label_4_non_eng.map(label_to_correct_term)
+
+        # df['term'] is Mondo or OMIM ID, or even disease label
+        # df['correct_term'] is an OMIM
+        # call OAK and get OMIM IDs for df['term'] and see if df['correct_term'] is one of them
+        # in the case of phenotypic series, if Mondo corresponds to grouping term, accept it
+        df['is_correct'] = df.apply(
+            lambda row: score_grounded_result(row['term'], row['correct_term']) > 0,
+            axis=1)
+
+        # Calculate reciprocal rank
+        df["reciprocal_rank"] = df.apply(
+            lambda row: 1 / row["rank"] if row["is_correct"] else 0, axis=1
+        )
+        # Calculate MRR for this file: best reciprocal rank per label, averaged
+        mrr = df.groupby("label")["reciprocal_rank"].max().mean()
+        mrr_scores.append(mrr)
+
+    print("MRR scores are:\n")
+    print(mrr_scores)
+    plot_dir = output_dir / "plots"
+    # parents=True so that a not-yet-existing output_dir does not raise
+    plot_dir.mkdir(parents=True, exist_ok=True)
+    plot_data_file = plot_dir / "plotting_data.tsv"
+
+    # write out results for plotting: first row file names, second row MRR scores
+    with plot_data_file.open('w', newline = '') as dat:
+        writer = csv.writer(dat, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
+        writer.writerow(results_files)
+        writer.writerow(mrr_scores)
+    return plot_data_file, plot_dir, num_ppkt
diff --git a/src/malco/post_process/generate_plots.py b/src/malco/post_process/generate_plots.py
@@ -0,0 +1,25 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+import os
+import csv
+
+# Make a nice plot, use it as function or as script
+
+def make_plots(plot_data_file, plot_dir, languages, num_ppkt):
+    """
+    Draw a bar plot of the MRR score of each results file and save it as PNG.
+
+    Args:
+        plot_data_file: Path-like object with an ``open`` method pointing to a
+            two-row TSV: first row the results file names, second row the
+            corresponding MRR scores.
+        plot_dir: Path of the directory the PNG is written to.
+        languages: Sized collection of languages; only its length is used,
+            as part of the output file name.
+        num_ppkt: Number of phenopackets; used as part of the output file name.
+    """
+    with plot_data_file.open('r', newline = '') as f:
+        # QUOTE_NONNUMERIC makes the reader convert unquoted fields to float
+        lines = csv.reader(f, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
+        results_files = next(lines)
+        mrr_scores = next(lines)
+
+    print(results_files)
+    print(mrr_scores)
+
+    # Plotting the results
+    sns.barplot(x = results_files, y = mrr_scores)
+    plt.xlabel("Results File")
+    plt.ylabel("Mean Reciprocal Rank (MRR)")
+    plt.title("MRR of Correct Answers Across Different Results Files")
+    plot_path = plot_dir / f"{len(languages)}_langs_{num_ppkt}ppkt.png"
+    plt.savefig(plot_path)
+    plt.show()
+    # Release the figure so that a later call does not draw onto the same axes
+    plt.close()
diff --git a/src/malco/post_process/mondo_score_utils.py b/src/malco/post_process/mondo_score_utils.py
@@ -9,7 +9,7 @@
 PARTIAL_SCORE = 0.5
 
 
-@lru_cache
+@lru_cache(maxsize=4096)
 def mondo_adapter() -> OboGraphInterface:
     """
     Get the adapter for the MONDO ontology.
@@ -20,7 +20,7 @@ def mondo_adapter() -> OboGraphInterface:
     return get_adapter("sqlite:obo:mondo")
 
 
-@lru_cache()
+@lru_cache(maxsize=1024)
 def omim_mappings(term: str) -> List[str]:
     """
     Get the OMIM mappings for a term.
@@ -81,7 +81,9 @@ def score_grounded_result(prediction: str, ground_truth: str) -> float:
         # prediction is a MONDO that directly maps to a correct OMIM
         return FULL_SCORE
     mondo = mondo_adapter()
-    for mondo_descendant in mondo.descendants([prediction], predicates=[IS_A], reflexive=True):
+
+    descendants_list = mondo.descendants([prediction], predicates=[IS_A], reflexive=True)
+    for mondo_descendant in descendants_list:
         if ground_truth in omim_mappings(mondo_descendant):
             # prediction is a MONDO that maps to a correct OMIM via a descendant
             return PARTIAL_SCORE

diff --git a/src/malco/post_process/post_process.py b/src/malco/post_process/post_process.py
@@ -1,14 +1,22 @@
 from pathlib import Path
 
 from malco.post_process.post_process_results_format import create_standardised_results
+import os
 
-
-def post_process(raw_results_dir: Path, output_dir: Path) -> None:
+def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple) -> None:
     """
     Post-process the raw results output to standardised PhEval TSV format.
 
     Args:
         raw_results_dir (Path): Path to the raw results directory.
         output_dir (Path): Path to the output directory.
+        langs (tuple): Language codes; each one names a subdirectory of
+            raw_results_dir that is read and a subdirectory of output_dir
+            into which "results.tsv" is written.
     """
-    create_standardised_results(raw_results_dir=raw_results_dir, output_dir=output_dir)
+
+    for lang in langs:
+        raw_results_lang = raw_results_dir / lang
+        output_lang = output_dir / lang
+        # parents=True so that missing top-level directories do not raise
+        raw_results_lang.mkdir(parents=True, exist_ok=True)
+        output_lang.mkdir(parents=True, exist_ok=True)
+
+        create_standardised_results(raw_results_dir=raw_results_lang, 
+                                    output_dir=output_lang, output_file_name = "results.tsv")
diff --git a/src/malco/post_process/post_process_results_format.py b/src/malco/post_process/post_process_results_format.py
@@ -1,12 +1,60 @@
 import json
+import os
 from pathlib import Path
 from typing import List
 
+import pandas as pd
+import yaml
 from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
 from pheval.utils.file_utils import all_files
 from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict
 
 
+def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
+    """
+    Read a (possibly multi-document) raw result YAML file.
+
+    Args:
+        raw_result_path(Path): Path to the raw result file.
+
+    Returns:
+        List[dict]: One entry per YAML document found in the file, in file order.
+    """
+    # Explicit encoding so that reading does not depend on the platform locale
+    with open(raw_result_path, 'r', encoding='utf-8') as raw_result:
+        return list(yaml.safe_load_all(raw_result))  # Load and convert to list
+
+
+def create_standardised_results(raw_results_dir: Path, output_dir: Path,
+                                output_file_name: str) -> pd.DataFrame:
+    """
+    Flatten the raw YAML results of a directory into one ranked TSV table.
+
+    Every file directly inside raw_results_dir is read as YAML. For each
+    document that has an "extracted_object" with "terms", one row per term is
+    produced; the position of the term in the list is its rank and the score
+    is the reciprocal of that rank.
+
+    Args:
+        raw_results_dir (Path): Directory holding the raw result files.
+        output_dir (Path): Directory the TSV is written to.
+        output_file_name (str): File name of the TSV inside output_dir.
+
+    Returns:
+        pd.DataFrame: Table with the columns label, term, score and rank.
+    """
+    columns = ['label', 'term', 'score', 'rank']
+    data = []
+    for raw_result_path in raw_results_dir.iterdir():
+        if not raw_result_path.is_file():
+            continue
+
+        for this_result in read_raw_result_yaml(raw_result_path):
+            # An empty YAML document loads as None; skip anything that is not a mapping
+            if not isinstance(this_result, dict):
+                continue
+            extracted_object = this_result.get("extracted_object")
+            if not extracted_object:
+                continue
+            label = extracted_object.get('label')
+            terms = extracted_object.get('terms') or []
+            for rank, term in enumerate(terms, start=1):
+                # score is reciprocal rank
+                data.append({'label': label, 'term': term, 'score': 1 / rank, 'rank': rank})
+
+    # Create DataFrame; explicit columns keep the header even when there are no rows
+    df = pd.DataFrame(data, columns=columns)
+
+    # Save DataFrame to TSV
+    output_path = output_dir / output_file_name
+    df.to_csv(output_path, sep='\t', index=False)
+
+    return df
+
+
+# these are from the template and not currently used outside of tests
+
 def read_raw_result(raw_result_path: Path) -> List[dict]:
     """
     Read the raw result file.
@@ -94,7 +142,7 @@ def extract_pheval_gene_requirements(self) -> List[PhEvalGeneResult]:
             )
         return pheval_result
 
-
+'''
 def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None:
     """
     Create PhEval gene tsv output from raw results.
@@ -117,3 +165,4 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None
             output_dir=output_dir,
             tool_result_path=raw_result_path,
         )
+'''
diff --git a/src/malco/prepare/setup_phenopackets.py b/src/malco/prepare/setup_phenopackets.py
@@ -0,0 +1,34 @@
+import zipfile
+import os 
+import requests
+
+# URL of the phenopacket-store release archive (release 0.1.11) that is downloaded
+phenopacket_zip_url="https://github.com/monarch-initiative/phenopacket-store/releases/download/0.1.11/all_phenopackets.zip"
+# TODO just point to a folder w/ ppkts
+# Name of the directory, joined onto the input directory, that holds the phenopackets
+phenopacket_dir="phenopacket-store"
+
+def setup_phenopackets(self) -> str:
+    """
+    Make sure the phenopacket store is available below ``self.input_dir``.
+
+    The download is triggered only when the store path does not exist yet.
+
+    Args:
+        self: Object providing an ``input_dir`` attribute.
+
+    Returns:
+        str: Path of the phenopacket store directory.
+    """
+    store_path = os.path.join(self.input_dir, phenopacket_dir)
+    if not os.path.exists(store_path):
+        print(f"{store_path} doesn't exist, downloading phenopackets...")
+        download_phenopackets(self, phenopacket_zip_url, phenopacket_dir)
+        return store_path
+    print(f"{store_path} exists, skipping download.")
+    return store_path
+
+
+def download_phenopackets(self, phenopacket_zip_url, phenopacket_dir):
+    """
+    Download the phenopacket release zip and extract it below ``self.input_dir``.
+
+    Args:
+        self: Object providing an ``input_dir`` attribute.
+        phenopacket_zip_url: URL of the zip archive to download.
+        phenopacket_dir: Name of the directory, inside ``self.input_dir``,
+            the archive is extracted into.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If the server stops responding.
+    """
+    phenopacket_store_path = os.path.join(self.input_dir, phenopacket_dir)
+
+    # Download the phenopacket release zip file. Fail loudly on an HTTP error
+    # instead of writing an error page to disk and unzipping it.
+    response = requests.get(phenopacket_zip_url, timeout=60)
+    response.raise_for_status()
+
+    os.makedirs(self.input_dir, exist_ok=True)
+    zip_path = os.path.join(self.input_dir, "all_phenopackets.zip")
+    with open(zip_path, "wb") as f:
+        f.write(response.content)
+    print("Download completed.")
+
+    # Create the store directory only after a successful download, so that a
+    # failed download does not leave an empty directory behind that would be
+    # mistaken for an already downloaded store.
+    os.makedirs(phenopacket_store_path, exist_ok=True)
+
+    # Unzip the phenopacket release zip file
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(phenopacket_store_path)
+    print("Unzip completed.")
diff --git a/src/malco/run/run.py b/src/malco/run/run.py
@@ -1,11 +1,10 @@
 from pathlib import Path
-
 from malco.run.run_tool import run_tool
-from ontogpt.cli import run_multilingual_analysis
 import os
+# from ontogpt.cli import run_multilingual_analysis
 
-
-def run(testdata_dir: Path, raw_results_dir: Path) -> None:
+def run(testdata_dir: Path, raw_results_dir: Path, output_dir: Path, 
+        langs: tuple) -> None:
     """
     Run the tool to obtain the raw results.
 
@@ -14,19 +13,9 @@ def run(testdata_dir: Path, raw_results_dir: Path) -> None:
         raw_results_dir: Path to the raw results directory.
     """
     mydir = os.getcwd()
-    # TODO figure out how to run one language at a time, not like the next line
-    # lang_list = os.listdir(mydir + "prompts")
-    """
-    run_multilingual_analysis(
-        input_data_dir=mydir + "prompts/en/PMID_23993194_Family_2_Case_2-prompt",
-        output_directory=mydir + "outputdir/",
-        output=mydir + "outputdir/" + "grounded_en",  # TODO generalize lang
-        output_format="yaml",
-        model="gpt-4-turbo",
-        ext=".txt",
-    )
-    """
-    os.system(
-        f"ontogpt run-multilingual-analysis --output={mydir}/outputdir/grounded_en/results.yaml --output-format=yaml {mydir}/prompts/et/ {mydir}outputdir/"
-    )
-    # run_tool(phenopacket_dir=testdata_dir.joinpath("phenopackets"), output_dir=raw_results_dir)
+
+    for lang in langs:
+        os.system(
+            f"ontogpt -v run-multilingual-analysis --output={output_dir}/raw_results/{lang}/results.yaml {mydir}/prompts/{lang}/ {output_dir}/raw_results/{lang}/differentials_by_file/"
+        )
+        # run_tool(phenopacket_dir=testdata_dir.joinpath("phenopackets"), output_dir=raw_results_dir)