From 193642c13c3a2491a21c4b8c26626057a9ef7c69 Mon Sep 17 00:00:00 2001
From: WagnerJon <84773392+WagnerJon@users.noreply.github.com>
Date: Mon, 3 Jun 2024 16:18:27 +0200
Subject: [PATCH 1/3] Language + relevance score

Added option to choose the query language for benchmarks.
Added a relevance score to the benchmark results.
---
 benchmark_utils.py                 | 146 +++++++
 conftest.py                        | 442 ++++++++++++++++++++
 test_biocypher_query_generation.py | 630 +++++++++++++++++++++++++++++
 3 files changed, 1218 insertions(+)
 create mode 100644 benchmark_utils.py
 create mode 100644 conftest.py
 create mode 100644 test_biocypher_query_generation.py

diff --git a/benchmark_utils.py b/benchmark_utils.py
new file mode 100644
index 00000000..b868e9c0
--- /dev/null
+++ b/benchmark_utils.py
@@ -0,0 +1,146 @@
+from datetime import datetime
+
+import pytest
+import importlib_metadata
+
+import pandas as pd
+
+
+def benchmark_already_executed(
+    model_name: str,
+    task: str,
+    md5_hash: str,
+) -> bool:
+    """
+    Checks if the benchmark task and subtask test case for the model_name have
+    already been executed.
+
+    Args:
+        model_name (str): The model name, e.g. "gpt-3.5-turbo"
+
+        task (str): The benchmark task, e.g. "biocypher_query_generation"
+
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
+
+    Returns:
+        bool: True if the benchmark case for the model_name has already been
+            run, False otherwise
+    """
+    task_results = return_or_create_result_file(task)
+
+    if task_results.empty:
+        return False
+
+    run = (
+        task_results[
+            (task_results["model_name"] == model_name)
+            & (task_results["md5_hash"] == md5_hash)
+        ].shape[0]
+        > 0
+    )
+
+    return run
+
+
+def skip_if_already_run(
+    model_name: str,
+    task: str,
+    md5_hash: str,
+) -> None:
+    """Helper function to check if the test case has already been executed.
+
+    Args:
+        model_name (str): The model name, e.g. "gpt-3.5-turbo"
+
+        task (str): The benchmark task, e.g. "biocypher_query_generation"
+
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
+    """
+    if benchmark_already_executed(model_name, task, md5_hash):
+        pytest.skip(
+            f"Benchmark for {task} with hash {md5_hash} with {model_name} already executed"
+        )
+
+
+def return_or_create_result_file(
+    task: str,
+):
+    """
+    Returns the result file for the task or creates it if it does not exist.
+
+    Args:
+        task (str): The benchmark task, e.g. "biocypher_query_generation"
+
+    Returns:
+        pd.DataFrame: The result file for the task
+    """
+    file_path = get_result_file_path(task)
+    try:
+        results = pd.read_csv(file_path, header=0)
+    except (pd.errors.EmptyDataError, FileNotFoundError):
+        results = pd.DataFrame(
+            columns=[
+                "model_name",
+                "subtask",
+                "score",
+                "relevance_score",
+                "iterations",
+                "md5_hash",
+                "datetime",
+                "biochatter_version",
+            ]
+        )
+        results.to_csv(file_path, index=False)
+    return results
+
+
+def write_results_to_file(
+    model_name: str,
+    subtask: str,
+    score: str,
+    relevance_score: str,
+    iterations: str,
+    md5_hash: str,
+    file_path: str,
+):
+    """Writes the benchmark results for the subtask to the result file.
+
+    Args:
+        model_name (str): The model name, e.g. "gpt-3.5-turbo"
+        subtask (str): The benchmark subtask test case, e.g. "entities"
+        score (str): The benchmark score, e.g. "5"
+        relevance_score (str): Relevance score determined by LLM, e.g. "8"
+        iterations (str): The number of iterations, e.g.
"7" + md5_hash (str): The md5 hash of the test case + file_path (str): The path to the result file + """ + results = pd.read_csv(file_path, header=0) + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + bc_version = importlib_metadata.version("biochatter") + new_row = pd.DataFrame( + [[model_name, subtask, score, relevance_score, iterations, md5_hash, now, bc_version]], + columns=results.columns, + ) + results = pd.concat([results, new_row], ignore_index=True).sort_values( + by=["model_name", "subtask"] + ) + results.to_csv(file_path, index=False) + + +# TODO should we use SQLite? An online database (REDIS)? +def get_result_file_path(file_name: str) -> str: + """Returns the path to the result file. + + Args: + file_name (str): The name of the result file + + Returns: + str: The path to the result file + """ + return f"benchmark/results/{file_name}.csv" diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..e88bcc79 --- /dev/null +++ b/conftest.py @@ -0,0 +1,442 @@ +import os + +from xinference.client import Client +import pytest +import requests + +import numpy as np +import pandas as pd + +from biochatter.prompts import BioCypherPromptEngine +from benchmark.load_dataset import get_benchmark_dataset +from biochatter.llm_connect import GptConversation, XinferenceConversation +from .benchmark_utils import benchmark_already_executed + +# how often should each benchmark be run? +N_ITERATIONS = 1 + +# which dataset should be used for benchmarking? +BENCHMARK_DATASET = get_benchmark_dataset() + +# which models should be benchmarked? +OPENAI_MODEL_NAMES = [ + "gpt-3.5-turbo-0613", + # "gpt-3.5-turbo-0125", + # "gpt-4-0613", + # "gpt-4-0125-preview", + # "gpt-4o-2024-05-13", +] + +XINFERENCE_MODELS = { + "llama-2-chat": { + "model_size_in_billions": [ + #7, + #13, + #70, + ], + "model_format": "ggufv2", + "quantization": [ + #"Q2_K", + # "Q3_K_S", + #"Q3_K_M", + # "Q3_K_L", + # "Q4_0", + # "Q4_K_S", + #"Q4_K_M", + # "Q5_0", + # "Q5_K_S", + #"Q5_K_M", + # "Q6_K", + # "Q8_0", + ], + }, + "code-llama-instruct": { + "model_size_in_billions": [ + #7, + # 13, + # 34, + ], + "model_format": "ggufv2", + "quantization": [ + # "Q2_K", + # "Q3_K_L", + # "Q3_K_M", + # "Q3_K_S", + # "Q4_0", + #"Q4_K_M", + # "Q4_K_S", + # "Q5_0", + # "Q5_K_M", + # "Q5_K_S", + # "Q6_K", + # "Q8_0", + ], + }, + "mixtral-instruct-v0.1": { + "model_size_in_billions": [ + #"46_7", + ], + "model_format": "ggufv2", + "quantization": [ + #"Q2_K", + #"Q3_K_M", + # "Q4_0", + #"Q4_K_M", + # "Q5_0", + #"Q5_K_M", + #"Q6_K", + #"Q8_0", + ], + }, + "openhermes-2.5": { + "model_size_in_billions": [ + #7, + ], + "model_format": "ggufv2", + "quantization": [ + #"Q2_K", + # "Q3_K_S", + #"Q3_K_M", + # "Q3_K_L", + # "Q4_0", + # "Q4_K_S", + #"Q4_K_M", + # "Q5_0", + # "Q5_K_S", + #"Q5_K_M", + #"Q6_K", + #"Q8_0", + ], + }, + "chatglm3": { + "model_size_in_billions": [ + #6, + ], + "model_format": "ggmlv3", + "quantization": [ + #"q4_0", + ], + }, + "mistral-instruct-v0.2": { + "model_size_in_billions": [ + #7, + ], + "model_format": "ggufv2", + "quantization": [ + #"Q2_K", + # "Q3_K_S", + #"Q3_K_M", + # "Q3_K_L", + # "Q4_0", + # "Q4_K_S", + #"Q4_K_M", + # "Q5_0", + # "Q5_K_S", + #"Q5_K_M", + #"Q6_K", + #"Q8_0", + ], + }, + # "gemma-it": { + # "model_size_in_billions": [ + # 2, + # 7, + # ], + # "model_format": "pytorch", + # "quantization": [ + # "none", + # "4-bit", + # "8-bit", + # ], + # }, + "llama-3-instruct": { + "model_size_in_billions": [ + #8, + # 70, + ], + "model_format": "ggufv2", + "quantization": [ + # 8B model 
quantisations + # "IQ3_M", + #"Q4_K_M", + #"Q5_K_M", + #"Q6_K", + #"Q8_0", + # 70B model quantisations + # "IQ1_M", + # "IQ2_XS", + # "Q4_K_M", + ], + }, +} + +# create concrete benchmark list by concatenating all combinations of model +# names, model sizes and quantizations +XINFERENCE_MODEL_NAMES = [ + f"{model_name}:{model_size}:{model_format}:{quantization}" + for model_name in XINFERENCE_MODELS.keys() + for model_size in XINFERENCE_MODELS[model_name]["model_size_in_billions"] + for model_format in [XINFERENCE_MODELS[model_name]["model_format"]] + for quantization in XINFERENCE_MODELS[model_name]["quantization"] +] + +BENCHMARKED_MODELS = OPENAI_MODEL_NAMES + XINFERENCE_MODEL_NAMES +BENCHMARKED_MODELS.sort() + +# Xinference IP and port +BENCHMARK_URL = "http://localhost:9997" + + +def pytest_collection_modifyitems(items): + """ + Pytest hook function to modify the collected test items. + Called once after collection has been performed. + + Used here to order items by their `callspec.id` (which starts with the + model name and configuration) to ensure running all tests for one model + before moving to the next model. + """ + + items.sort( + key=lambda item: (item.callspec.id if hasattr(item, "callspec") else "") + ) + + # can we skip here the tests (model x hash) that have already been executed? + + +# parameterise tests to run for each model +@pytest.fixture(params=BENCHMARKED_MODELS) +def model_name(request): + return request.param + + +@pytest.fixture +def multiple_testing(request): + def run_multiple_times(test_func, *args, **kwargs): + scores = [] + relevance_scores = [] + for _ in range(N_ITERATIONS): + result_tuple, relevance_score = test_func(*args, **kwargs) #ignores relevance score + score, max = result_tuple + scores.append(score) + relevance_scores.append(int(relevance_score)) + mean_score = sum(scores) / N_ITERATIONS + mean_rel_score = sum(relevance_scores) / N_ITERATIONS + sd_score = np.std(scores) + sd_relevance_score = np.std(relevance_scores) # TODO return standard deviation with score + return (mean_score, max, mean_rel_score, N_ITERATIONS) + + return run_multiple_times + + +def calculate_bool_vector_score(vector: list[bool]) -> tuple[int, int]: + score = sum(vector) + max = len(vector) + return (score, max) + + +@pytest.fixture +def prompt_engine(request, model_name): + """ + Generates a constructor for the prompt engine for the current model name. + """ + + def setup_prompt_engine(kg_schema_dict): + return BioCypherPromptEngine( + schema_config_or_info_dict=kg_schema_dict, + model_name=model_name, + ) + + return setup_prompt_engine + + +@pytest.fixture +def conversation(request, model_name): + """ + Decides whether to run the test or skip due to the test having been run + before. If not skipped, will create a conversation object for interfacing + with the model. + """ + test_name = request.node.originalname.replace("test_", "") + subtask = "?" # TODO can we get the subtask here? 
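# One possible way to resolve the TODO above (an editorial sketch, not part of
# the original patch): for parametrized tests, pytest exposes the resolved
# parameters on the test item's callspec, so the subtask name could be read
# from the test-case dictionary when it is present.
#
#     if hasattr(request.node, "callspec"):
#         test_case = request.node.callspec.params.get(
#             "test_data_biocypher_query_generation"
#         )
#         if isinstance(test_case, dict):
#             subtask = test_case.get("case", "?")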
+ if benchmark_already_executed(model_name, test_name, subtask): + pass + # pytest.skip( + # f"benchmark {test_name}: {subtask} with {model_name} already executed" + # ) + + if model_name in OPENAI_MODEL_NAMES: + conversation = GptConversation( + model_name=model_name, + prompts={}, + correct=False, + ) + conversation.set_api_key( + os.getenv("OPENAI_API_KEY"), user="benchmark_user" + ) + elif model_name in XINFERENCE_MODEL_NAMES: + ( + _model_name, + _model_size, + _model_format, + _model_quantization, + ) = model_name.split(":") + if not "_" in _model_size: + _model_size = int(_model_size) + + # get running models + try: + client = Client(base_url=BENCHMARK_URL) + except requests.exceptions.ConnectionError: + raise ConnectionError( + f"Could not connect to Xinference server at {BENCHMARK_URL}. " + "Please make sure that the server is running." + ) + + # if exact model already running, return conversation + running_models = client.list_models() + if running_models: + for running_model in running_models: + if ( + running_models[running_model]["model_name"] == _model_name + and running_models[running_model]["model_size_in_billions"] + == _model_size + and running_models[running_model]["quantization"] + == _model_quantization + ): + conversation = XinferenceConversation( + base_url=BENCHMARK_URL, + model_name=_model_name, + prompts={}, + correct=False, + ) + return conversation + + # else, terminate all running models + for running_model in running_models: + client.terminate_model(running_model) + + # and launch model to be tested + client.launch_model( + model_name=_model_name, + model_size_in_billions=_model_size, + model_format=_model_format, + quantization=_model_quantization, + ) + + # return conversation + conversation = XinferenceConversation( + base_url=BENCHMARK_URL, + model_name=_model_name, + prompts={}, + correct=False, + ) + + return conversation + + +@pytest.fixture +def evaluation_conversation(): + conversation = GptConversation( + model_name="gpt-3.5-turbo", + prompts={}, + correct=False, + ) + conversation.set_api_key(os.getenv("OPENAI_API_KEY"), user="benchmark_user") + return conversation + + +def pytest_addoption(parser): + parser.addoption( + "--run-all", + action="store_true", + default=False, + help="Run all benchmark tests from scratch", + ) + + +@pytest.fixture(autouse=True, scope="session") +def delete_results_csv_file_content(request): + """ + If --run-all is set, the former benchmark data are deleted and all + benchmarks are executed again. 
+ """ + if request.config.getoption("--run-all"): + RESULT_FILES = [ + f"benchmark/results/{file}" + for file in os.listdir("benchmark/results") + if file.endswith(".csv") + ] + for f in RESULT_FILES: + if os.path.exists(f): + old_df = pd.read_csv(f, header=0) + empty_df = pd.DataFrame(columns=old_df.columns) + empty_df.to_csv(f, index=False) + + +@pytest.fixture(scope="session") +def result_files(): + RESULT_FILES = [ + f"benchmark/results/{file}" + for file in os.listdir("benchmark/results") + if file.endswith(".csv") + ] + result_files = {} + result_columns = [ + "model_name", + "subtask", + "score", + "iterations", + "md5_hash", + ] + for file in RESULT_FILES: + try: + result_file = pd.read_csv(file, header=0) + except (pd.errors.EmptyDataError, FileNotFoundError): + result_file = pd.DataFrame( + columns=result_columns, + ) + result_file.to_csv(file, index=False) + + if not np.array_equal( + result_file.columns, + result_columns, + ): + result_file.columns = result_columns + + result_files[file] = result_file + + return result_files + + +def pytest_generate_tests(metafunc): + """ + Pytest hook function to generate test cases. + Called once for each test case in the benchmark test collection. + If fixture is part of test declaration, the test is parametrized. + """ + # Load the data file + data_file = BENCHMARK_DATASET["benchmark_data.yaml"] + + # Parametrize the fixtures with the collected rows + if "test_data_biocypher_query_generation" in metafunc.fixturenames: + metafunc.parametrize( + "test_data_biocypher_query_generation", + data_file["biocypher_query_generation"], + ) + if "test_data_rag_interpretation" in metafunc.fixturenames: + metafunc.parametrize( + "test_data_rag_interpretation", + data_file["rag_interpretation"], + ) + if "test_data_text_extraction" in metafunc.fixturenames: + metafunc.parametrize( + "test_data_text_extraction", + data_file["text_extraction"], + ) + + +@pytest.fixture +def kg_schemas(): + data_file = BENCHMARK_DATASET["benchmark_data.yaml"] + return data_file["kg_schemas"] diff --git a/test_biocypher_query_generation.py b/test_biocypher_query_generation.py new file mode 100644 index 00000000..51c45593 --- /dev/null +++ b/test_biocypher_query_generation.py @@ -0,0 +1,630 @@ +import re +import json +import inspect + +import pytest + +from biochatter.prompts import BioCypherPromptEngine +from .conftest import calculate_bool_vector_score +from .benchmark_utils import ( + skip_if_already_run, + get_result_file_path, + write_results_to_file, +) + + +def test_naive_query_generation_using_schema( + model_name, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + schema = kg_schemas[yaml_data["input"]["kg_schema"]] + + def run_test(): + conversation.reset() # needs to be reset for each test + + conversation.append_system_message( + "You are a database expert. Please write a "+yaml_data["input"]["language"]+" query to " + "retrieve information for the user. The schema of the graph is " + "defined as follows: " + ) + conversation.append_system_message(json.dumps(schema, indent=2)) + conversation.append_system_message( + "Only return the query, nothing else." 
+ ) + + query, _, _ = conversation.query(yaml_data["input"]["prompt"]) + + score = [] + for expected_part_of_query in yaml_data["expected"]["parts_of_query"]: + if isinstance(expected_part_of_query, tuple): + score.append( + expected_part_of_query[0] in query + or expected_part_of_query[1] in query + ) + else: + score.append( + (re.search(expected_part_of_query, query) is not None) + ) + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +def get_prompt_engine( + kg_schema_dict: dict, + create_prompt_engine, +) -> BioCypherPromptEngine: + """Helper function to create the prompt engine for the test. + + Args: + kg_schema_dict (dict): The KG schema + create_prompt_engine: The function to create the BioCypherPromptEngine + + Returns: + BioCypherPromptEngine: The prompt engine for the test + """ + return create_prompt_engine(kg_schema_dict=kg_schema_dict) + + +def test_entity_selection( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + def run_test(): + conversation.reset() # needs to be reset for each test + success = prompt_engine._select_entities( + question=yaml_data["input"]["prompt"], + conversation=conversation, + ) + assert success + + score = [] + for expected_entity in yaml_data["expected"]["entities"]: + score.append(expected_entity in prompt_engine.selected_entities) + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +def test_relationship_selection( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + if not yaml_data["expected"]["relationships"]: + pytest.skip("No relationships to test") + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + prompt_engine.question = yaml_data["input"]["prompt"] + prompt_engine.selected_entities = yaml_data["expected"]["entities"] + + # TODO: more generic, for nested structures + + def run_test(): + conversation.reset() # needs to be reset for each test + success = prompt_engine._select_relationships(conversation=conversation) + assert success + + score = [] + for expected_relationship_label_key in yaml_data["expected"][ + "relationship_labels" + ].keys(): + score.append( + expected_relationship_label_key + in prompt_engine.selected_relationship_labels.keys() + ) + + for expected_relationship_label_value in yaml_data["expected"][ + "relationship_labels" + ][expected_relationship_label_key]: + try: + score.append( + expected_relationship_label_value + in 
prompt_engine.selected_relationship_labels[ + expected_relationship_label_key + ] + ) + except KeyError: + score.append(False) + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +def test_property_selection( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + prompt_engine.question = yaml_data["input"]["prompt"] + prompt_engine.selected_entities = yaml_data["expected"]["entities"] + prompt_engine.selected_relationships = yaml_data["expected"][ + "relationships" + ] + + def run_test(): + conversation.reset() # needs to be reset for each test + success = prompt_engine._select_properties(conversation=conversation) + + if success: + score = [] + for expected_property_key in yaml_data["expected"][ + "properties" + ].keys(): + try: + score.append( + expected_property_key + in prompt_engine.selected_properties.keys() + ) + except KeyError: + score.append(False) + + for expected_property_value in yaml_data["expected"][ + "properties" + ][expected_property_key]: + try: + score.append( + expected_property_value + in prompt_engine.selected_properties[ + expected_property_key + ] + ) + except KeyError: + score.append(False) + else: + total_properties = len( + yaml_data["expected"]["properties"].keys() + ) + sum( + len(v) for v in yaml_data["expected"]["properties"].values() + ) + score = [False] * total_properties + + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +def test_query_generation( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + def run_test(): + conversation.reset() # needs to be reset for each test + query = prompt_engine._generate_query( + question=yaml_data["input"]["prompt"], + entities=yaml_data["expected"]["entities"], + relationships=yaml_data["expected"]["relationship_labels"], + properties=yaml_data["expected"]["properties"], + query_language=yaml_data["input"]["language"], + conversation=conversation, + ) + + score = [] + for expected_part_of_query in yaml_data["expected"]["parts_of_query"]: + if isinstance(expected_part_of_query, tuple): + score.append( + expected_part_of_query[0] in query + or expected_part_of_query[1] in query + ) + else: + score.append( + (re.search(expected_part_of_query, query) is not None) + ) + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + 
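# NOTE (editorial sketch, not part of the original patch): multiple_testing now
# returns four values (mean score, maximum, mean relevance score, iterations)
# and expects each run to yield a (score_tuple, relevance_score) pair, while
# run_test here returns only the score tuple and the line above unpacks three
# values. write_results_to_file also gained a relevance_score parameter between
# score and iterations, so the call below passes one argument too few. If no
# relevance score is computed for this subtask, a placeholder could be passed,
# for example:
#
#     write_results_to_file(
#         prompt_engine.model_name,
#         yaml_data["case"],
#         f"{mean_score}/{max}",
#         "n/a",  # placeholder relevance score (assumption)
#         f"{n_iterations}",
#         yaml_data["hash"],
#         get_result_file_path(task),
#     )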
write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +def test_end_to_end_query_generation( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + def run_test(): + conversation.reset() # needs to be reset for each test + global relevance_score + relevance_score = "n" # Default value in case of ValueError + try: + query = prompt_engine.generate_query( + question=yaml_data["input"]["prompt"], + query_language=yaml_data["input"]["language"], + ) + returned_query = query #saves returned query for score calculation + conversation.reset() + conversation.append_system_message("You are an expert in rating "+yaml_data["input"]["language"]+" queries and should " + "rate how well a certain query can retain the requested information. The query is called " + "query: and the requested information is called prompt:. The layout of the database is as follows: "+yaml_data["input"]["kg_schema"]) + conversation.append_system_message("Please only reply with a number from 0 to 10 for the Rating.") + query, _, _ = conversation.query("Please rate how well the query can retain the requested information." + "prompt: "+yaml_data["input"]["prompt"]+"" + "query: "+returned_query+"") + relevance_score = query # Set the relevance score if no error occurs + score = [] + for expected_part_of_query in yaml_data["expected"][ + "parts_of_query" + ]: + if isinstance(expected_part_of_query, tuple): + score.append( + expected_part_of_query[0] in returned_query + or expected_part_of_query[1] in returned_query + ) + else: + score.append( + (re.search(expected_part_of_query, returned_query) is not None) + ) + except ValueError as e: + score = [False for _ in yaml_data["expected"]["parts_of_query"]] + + + return calculate_bool_vector_score(score),relevance_score + + + mean_score, max, medium_relevance_score, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{relevance_score}/10", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + +def test_end_to_end_query_generation_with_reassurance( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + def run_test(): + conversation.reset() # needs to be reset for each test + global relevance_score + relevance_score = "n" # Default value in case of ValueError + try: + query = prompt_engine.generate_query( + question=yaml_data["input"]["prompt"], + query_language=yaml_data["input"]["language"], + ) + returned_query = query #saves returned query for score calculation + conversation.append_system_message("Generated Query: "+returned_query) + query, _, _ = conversation.query("Are you sure 
the query is correct? Please try again and only return the query.") + conversation.reset() + conversation.append_system_message("You are an expert in rating "+yaml_data["input"]["language"]+" queries and should " + "rate how well a certain query can retain the requested information. The query is called " + "query: and the requested information is called prompt:. The layout of the database is as follows: "+yaml_data["input"]["kg_schema"]) + conversation.append_system_message("Please only reply with a number from 0 to 10 for the Rating.") + query, _, _ = conversation.query("Please rate how well the query can retain the requested information." + "prompt: "+yaml_data["input"]["prompt"]+"" + "query: "+returned_query+"") + relevance_score = query # Set the relevance score if no error occurs + score = [] + for expected_part_of_query in yaml_data["expected"][ + "parts_of_query" + ]: + if isinstance(expected_part_of_query, tuple): + score.append( + expected_part_of_query[0] in returned_query + or expected_part_of_query[1] in returned_query + ) + else: + score.append( + (re.search(expected_part_of_query, returned_query) is not None) + ) + except ValueError as e: + score = [False for _ in yaml_data["expected"]["parts_of_query"]] + + + return calculate_bool_vector_score(score) + + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{relevance_score}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + +###### +# test hallucination: are all properties available in the KG schema? +# in selected properties, also in the actual property used in the query +###### + + +def map_entities_to_labels(entity_list): + entity_mapping = {} + for entity in entity_list: + match = re.match(r"(\w+):(\w+)", entity) + if match: + label, entity_type = match.groups() + entity_mapping[label] = entity_type + + return entity_mapping + + +def map_dot_properties_to_labels(property_list): + property_mapping = {} + for property in property_list: + match = re.match(r"(\w+)\.(\w+)", property) + if match: + label, property_type = match.groups() + property_mapping[label] = property_type + + return property_mapping + + +def map_bracket_properties_to_labels(property_list): + property_mapping = {} + for property in property_list: + match = re.search(r"\((\w+):\w+ \{(\w+):", property) + if match: + label, property_type = match.groups() + property_mapping[label] = property_type + + return property_mapping + + +def join_dictionaries(dict1: dict, dict2: dict) -> dict: + result_dict = {} + for key in dict1: + if key in dict2: + result_dict[dict1[key]] = dict2[key] + + return result_dict + + +def get_used_property_from_query(query): + property_mapping = dict() + + # first get all properties used in 'dot' format + + property_regex_dot = r"[a-zA-Z]+\.\S+ |[a-zA-Z]+\..+$" + used_properties = re.findall(property_regex_dot, query) + used_properties = [i.strip() for i in used_properties] + # map variable name to properties used + property_mapping_add = map_dot_properties_to_labels(used_properties) + property_mapping.update(property_mapping_add) + + # get properties used in curly brackets + if "{" in query: + property_regex_bracket = r"\(\w+:\w+ \{\w+: " + used_properties = re.findall(property_regex_bracket, query) + used_properties = [i.strip() for i in used_properties] + # map variable name to properties used + property_mapping_add = map_bracket_properties_to_labels(used_properties) + 
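        # Worked example (editorial sketch, assumed query): for
        #     MATCH (g:Gene)-[:GENE_EXPRESSED_IN_CELL_TYPE]->(c:CellType {cell_type_name: "fibroblast"}) RETURN g.name
        # the dot-notation step above yields {"g": "name"}, the curly-bracket
        # step just computed yields {"c": "cell_type_name"}, and the entity
        # step below maps {"g": "Gene", "c": "CellType"}, so the joined result
        # becomes {"Gene": "name", "CellType": "cell_type_name"}.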
property_mapping.update(property_mapping_add) + + # get all entities or relationships involved in the query + entity_regex = r"[a-zA-Z]+:\w+" + used_entities = re.findall(entity_regex, query) + used_entities = [i.strip() for i in used_entities] + + # map variable name to entity or relationship labels + entity_mapping = map_entities_to_labels(used_entities) + + # get all the entity and respective properties used in the cypher query + used_entity_property = join_dictionaries(entity_mapping, property_mapping) + + return entity_mapping, property_mapping, used_entity_property + + +def test_property_exists( + model_name, + prompt_engine, + test_data_biocypher_query_generation, + kg_schemas, + conversation, + multiple_testing, +): + yaml_data = test_data_biocypher_query_generation + task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}" + skip_if_already_run( + model_name=model_name, task=task, md5_hash=yaml_data["hash"] + ) + prompt_engine = get_prompt_engine( + kg_schemas[yaml_data["input"]["kg_schema"]], prompt_engine + ) + + def run_test(): + conversation.reset() # needs to be reset for each test + query = prompt_engine._generate_query( + question=yaml_data["input"]["prompt"], + entities=yaml_data["expected"]["entities"], + relationships=yaml_data["expected"]["relationship_labels"], + properties=yaml_data["expected"]["properties"], + query_language="Cypher", + conversation=conversation, + ) + + score = [] + + ( + entity_mapping, + property_mapping, + used_entity_property, + ) = get_used_property_from_query(query) + + for entity, property in used_entity_property.items(): + if ( + entity in prompt_engine.entities.keys() + and "properties" in prompt_engine.entities[entity] + ): + # check property used is in available properties for entities + avail_property_entity = list( + prompt_engine.entities[entity]["properties"].keys() + ) + score.append(property in avail_property_entity) + elif ( + entity in prompt_engine.relationships.keys() + and "properties" in prompt_engine.relationships[entity] + ): + # check property used is in available properties for relationships + avail_property_entity = list( + prompt_engine.relationships[entity]["properties"].keys() + ) + score.append(property in avail_property_entity) + else: + # no properties of the entity or relationship exist, simply made up + score.append(False) + + # if score is shorter than the least expected number of properties, add + # False values until the length is reached + score += [False] * (len(yaml_data["expected"]["entities"]) - len(score)) + return calculate_bool_vector_score(score) + + mean_score, max, n_iterations = multiple_testing(run_test) + + write_results_to_file( + prompt_engine.model_name, + yaml_data["case"], + f"{mean_score}/{max}", + f"{n_iterations}", + yaml_data["hash"], + get_result_file_path(task), + ) + + +@pytest.mark.skip(reason="Helper function for testing regex patterns") +def test_regex(test_data_biocypher_query_generation): + yaml_data = test_data_biocypher_query_generation + query = 'MATCH (g:Gene)-[:GENE_EXPRESSED_IN_CELL_TYPE]->(c:CellType) WHERE c.cell_type_name = "fibroblast" RETURN g.id, g.name, c.cell_type_name, c.expression_level ORDER BY c.expression_level DESC' + score = [] + for expected_part_of_query in yaml_data["expected"]["parts_of_query"]: + if isinstance(expected_part_of_query, tuple): + score.append( + expected_part_of_query[0] in query + or expected_part_of_query[1] in query + ) + else: + score.append((re.search(expected_part_of_query, query) is not None)) + + assert True From 
d9c092d6930874f939edf6f3254652d6d554c572 Mon Sep 17 00:00:00 2001 From: WagnerJon <84773392+WagnerJon@users.noreply.github.com> Date: Mon, 3 Jun 2024 16:24:59 +0200 Subject: [PATCH 2/3] added language selection - added additional language option --- benchmark_data.yaml | 689 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 689 insertions(+) create mode 100644 benchmark_data.yaml diff --git a/benchmark_data.yaml b/benchmark_data.yaml new file mode 100644 index 00000000..2825c1f2 --- /dev/null +++ b/benchmark_data.yaml @@ -0,0 +1,689 @@ +# Top-level keys: benchmark modules +# Values: list of dictionaries, each containing a test case +# +# Test case keys: +# - input (for creating the test) +# - expected (for asserting ourcomes and generating a score) +# - case (for categorizing the test case) +# +# If any input is a dictionary itself, it will be expanded into separate test +# cases, using the top-level key to create a concatenated test case purpose. + +biocypher_query_generation: + # test ability to create a simple cypher query + - case: simple + input: + kg_schema: gene_kg + prompt: What is the name of the disease with ICD10 code 'E10'? + language: Cypher + expected: + entities: ["Disease"] + relationships: [] + relationship_labels: {} + properties: + Disease: ["name", "ICD10"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Disease)", + "WHERE [a-zA-Z]*\\.ICD10|{ICD10:", + ] + # test additional simple cypher query + - case: simple2 + input: + kg_schema: gene_kg + prompt: What is the name of the gene with ID '6091'? + language: Cypher + expected: + entities: ["Disease"] + relationships: [] + relationship_labels: {} + properties: + Gene: ["name", "INSR"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Gene)", + "WHERE [a-zA-Z]*\\.id|{id:", + ] + # test additional simple SQL query + - case: simple2_SQL + input: + kg_schema: gene_kg + prompt: What is the name of the gene with ID '6091'? + language: SQL + expected: + entities: ["Disease"] + relationships: [] + relationship_labels: {} + properties: + Gene: ["name", "INSR"] + parts_of_query: + [ + "^SELECT", + "FROM", + "([a-zA-Z]*:Gene)", + "WHERE [a-zA-Z]*\\.id|{id:", + ] + # test ability to create a simple cypher query + - case: simple_gene_expression + input: + kg_schema: gene_kg + prompt: "What is the expression level of the gene 'TP53' in cell type 'Hepatocyte'?" + language: Cypher + expected: + entities: ["Gene", "CellType"] + relationships: ["GeneExpressedInCellType"] + relationship_labels: + GENE_EXPRESSED_IN_CELL_TYPE: + source: Gene + target: CellType + properties: + Gene: ["name"] + Cell Type: ["cell_type_name"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*:CellType)", + "([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]", + "WHERE [a-zA-Z]*\\.TP53|{TP53:", + "\\)<-\\[([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]-|-\\[([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]->\\(([a-zA-Z]*:Cell Type)\\)", + ] + # test cypher query with single-word entities + - case: single_word + input: + kg_schema: gene_kg + prompt: Which genes are associated with mucoviscidosis? 
+ language: Cypher + expected: + entities: ["Gene", "Disease"] + relationships: ["GeneToPhenotypeAssociation"] + relationship_labels: + PERTURBED: + source: Disease + target: ["Protein", "Gene"] + properties: + Disease: ["name", "ICD10", "DSM5"] + Gene: ["id", "name"] + GeneToPhenotypeAssociation: ["score", "source", "evidence"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*:Disease)", + "([a-zA-Z]*)?:PERTURBED]", + "[Mm]ucoviscidosis", + "\\(.*:Gene\\)<-\\[:PERTURBED\\]-|-\\[:PERTURBED\\]->\\(.*:Gene\\)", + "WHERE [a-zA-Z]*\\.name|{name:", + ] + # test cypher query with multi-word entities + - case: multi_word + input: + kg_schema: gene_kg + prompt: Which genes are expressed in fibroblasts? + language: Cypher + expected: + entities: ["Gene", "CellType"] + relationships: ["GeneExpressedInCellType"] + relationship_labels: + GENE_EXPRESSED_IN_CELL_TYPE: + source: Gene + target: CellType + properties: + CellType: cell_type_name + Gene: ["id", "name"] + GeneExpressedInCellType: expression_level + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*:CellType.*)", + "([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]", + "[Ff]ibroblast", + "\\(.*:Gene\\)-\\[:GENE_EXPRESSED_IN_CELL_TYPE\\]->\\(.*:CellType.*\\)|\\(.*:CellType.*\\)<-\\[:GENE_EXPRESSED_IN_CELL_TYPE\\]-\\(.*:Gene\\)", + "WHERE [a-zA-Z]*\\.cell_type_name|{cell_type_name:", + ] + + # test more complex cypher query + - case: complex + input: + kg_schema: gene_kg + prompt: Which proteins are associated with the disease having ICD10 code 'E10', what are their scores, and what is the gene related to these proteins? + language: Cypher + expected: + entities: ["Protein", "Disease", "Gene"] + relationships: ["GeneToPhenotypeAssociation", "GeneToProteinAssociation"] + relationship_labels: + PERTURBED_IN: + source: Protein + target: Disease + GeneToProteinAssociation: + source: Gene + target: Protein + properties: + Disease: ["name", "ICD10"] + Protein: ["name", "score"] + Gene: ["name", "score"] + GeneToPhenotypeAssociation: ["score", "source", "evidence"] + GeneToProteinAssociation: ["score"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Protein)", + "([a-zA-Z]*:Disease)", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*)?:PERTURBED_IN]", + "([a-zA-Z]*)?:GeneToProteinAssociation]", + "WHERE [a-zA-Z]*\\.ICD10|{ICD10:", + "\\)<-\\[([a-zA-Z]*)?:PERTURBED_IN]-|-\\[([a-zA-Z]*)?:PERTURBED_IN]->\\(([a-zA-Z]*:Disease)", + "\\(([a-zA-Z]*(:Protein)?)\\)<-\\[([a-zA-Z]*)?:GeneToProteinAssociation]-|-\\[([a-zA-Z]*)?:GeneToProteinAssociation]->\\(([a-zA-Z]*(:Protein)?)\\)", + ] + # test another complex query + - case: complex2 + input: + kg_schema: gene_kg + prompt: What are the cell types where the gene 'INSR' is expressed, and what are their medium and organism? 
+ language: Cypher + expected: + entities: ["Gene", "CellType"] + relationships: ["GeneExpressedInCellType"] + relationship_labels: + GENE_EXPRESSED_IN_CELL_TYPE: + source: Gene + target: CellType + properties: + Gene: ["name"] + cell type: ["cell_type_name", "medium", "organism"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*:CellType)", + "([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]", + "WHERE [a-zA-Z]*\\.INSR|{INSR:", + "\\)<-\\[([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]-|-\\[([a-zA-Z]*)?:GENE_EXPRESSED_IN_CELL_TYPE]->\\(([a-zA-Z]*:Cell Type)\\)", + ] + # test another complex cypher query (not final) + - case: complex3 + input: + kg_schema: gene_kg + prompt: Which proteins are interacting with proteins that are associated with the disease having ICD10 code 'E10', what are their scores, and what is the gene related to these proteins? + language: Cypher + expected: + entities: ["Protein", "Disease", "Gene"] + relationships: ["GeneToPhenotypeAssociation", "GeneToProteinAssociation", "PostTranslationalInteraction"] + relationship_labels: + PERTURBED_IN: + source: Protein + target: Disease + GeneToProteinAssociation: + source: Gene + target: Protein + PostTranslationalInteraction: + source: Protein + target: Protein + properties: + Disease: ["name", "ICD10"] + Protein: ["name", "score"] + Gene: ["name", "score"] + GeneToPhenotypeAssociation: ["score", "source", "evidence"] + GeneToProteinAssociation: ["score"] + parts_of_query: + [ + "^MATCH", + "RETURN", + "([a-zA-Z]*:Protein)", + "([a-zA-Z]*:Disease)", + "([a-zA-Z]*:Gene)", + "([a-zA-Z]*)?:PERTURBED_IN]", + "([a-zA-Z]*)?:GeneToProteinAssociation]", + "WHERE [a-zA-Z]*\\.ICD10|{ICD10:", + "\\)<-\\[([a-zA-Z]*)?:PERTURBED_IN]-|-\\[([a-zA-Z]*)?:PERTURBED_IN]->\\(([a-zA-Z]*:Disease)", + "\\(([a-zA-Z]*(:Protein)?)\\)<-\\[([a-zA-Z]*)?:GeneToProteinAssociation]-|-\\[([a-zA-Z]*)?:GeneToProteinAssociation]->\\(([a-zA-Z]*(:Protein)?)\\)", + ] + +rag_interpretation: + # test simple irrelevance judgement + - case: explicit_relevance_no + input: + prompt: Which molecular pathways are associated with cancer? + system_messages: + simple: + [ + "You will receive a text fragment to help answer the user's question. Your task is to judge this text fragment for relevance to the user's question, and return either 'yes' or 'no'! Here is the fragment: ", + "The earth is a globe.", + ] + more_explicit: + [ + "You will receive a text fragment to help answer the user's question. Your task is to judge this text fragment for relevance to the user's question, and return either 'yes' or 'no'; only respond with one word, do not offer explanation or justification! Here is the fragment: ", + "The earth is a globe.", + ] + repeat_instruction: + [ + "You will receive a text fragment to help answer the user's question. You should only respond with 'yes' or 'no' without additional words. Your task is to judge this text fragment for relevance to the user's question, and return either 'yes' or 'no'; only respond with one word, do not offer explanation or justification! Here is the fragment: ", + "The earth is a globe.", + ] + expected: + answer: "no" + + # test simple relevance judgement + - case: explicit_relevance_yes + input: + prompt: Which molecular pathways are associated with cancer? + system_messages: + [ + "You will receive a text fragment to help answer the user's question. 
Your task is to judge this text fragment for relevance to the user's question, and return either 'yes' or 'no'; only respond with one word, do not offer explanation or justification! Here is the fragment: ", + "The EGFR pathway is deregulated in a number of cancers.", + ] + expected: + answer: "yes" + + # test capability to evaluate response to relevance question negatively + - case: explicit_evaluation_no + input: + prompt: "I'm sorry, but the given text fragments do not provide any relevant information about molecular pathways associated with cancer." + system_messages: + [ + "You will receive a statement as an answer to this question: ", + "Which molecular pathways are associated with cancer? ", + "If the statement is an answer to the question, please type 'answer'. ", + "If the statement declines to answer to the question or apologises, giving the reason of lack of relevance of the given text fragments, please type 'decline'. ", + "Do not type anything except these two options. Here is the statement: ", + ] + expected: + answer: "decline" + + # test capability to evaluate response to relevance question positively + - case: explicit_evaluation_yes + input: + prompt: "There are several molecular pathways that are associated with cancer, for instance TP53, BRCA1, the STAT inflammatory pathway, and the EGFR pathway." + system_messages: + [ + "You will receive a statement as an answer to this question: ", + "Which molecular pathways are associated with cancer? ", + "If the statement is an answer to the question, please type 'answer'. ", + "If the statement declines to answer to the question or apologises, giving the reason of lack of relevance of the given text fragments, please type 'decline'. ", + "Do not type anything except these two options. Here is the statement: ", + ] + expected: + answer: "answer" + + # test complex (implicit) irrelevance judgement + - case: implicit_relevance_no + input: + prompt: Which molecular pathways are associated with cancer? + system_messages: + [ + "You will receive a number of text fragments to help answer the user's question. Your task is to use the information in these text fragments for answering the user's question, if they are relevant. Here are the fragments: ", + "The earth is a globe.", + "The Human is the most endurant mammal.", + "America was discovered by Columbus.", + "The wavelength of red light is longer than that of blue light.", + ] + expected: + behaviour: "decline" + + # test complex (implicit) relevance judgement + - case: implicit_relevance_yes + input: + prompt: Which molecular pathways are associated with cancer? + system_messages: + [ + "You will receive a number of text fragments to help answer the user's question. Your task is to use the information in these text fragments for answering the user's question, if they are relevant. Here are the fragments: ", + "TP53, when mutated, often is a driving mutation in cancer.", + "BRCA1 is an important genetic regulator of breast cancer.", + "The STAT inflammatory pathway is often perturbed in cancers.", + "The EGFR pathway can lead to excessive growth of cancer cells.", + ] + expected: + behaviour: "answer" + +text_extraction: + - case: source_data_extraction + input: + query: + entity: "What is the assayed/measured entity?" + intervention: "What was the intervention/controlled variable in this experiment?" + context: "In what kind of cell/tissue/organism/subcellular component was the experiment performed?" + assay: "What kind of experimental assay was used for this experiment?" 
+ experiment_yes_or_no: "Does the legend describe an experiment or not?" + ncbi_link: "Can you link the identified genes to their NCBI gene identifiers?" + hypothesis: "Can you formulate the hypothesis that this experiment has tested." + disease: "Is there any disease term mentioned, or can be infered, in the figure legend?" + stats: "What was the statistical method to validate the significance of the experimental result?" + significance: "What is the significance level of the experiment?" + chemical: "Are there any chemical compounds or small molecules mentioned?" + caption: + 3d_sim: "(A) 3D-SIM images of HeLa control, PINK1-/- and FBXO7-/- cell lines after AO-induced mitophagy. Cells were stained for nuclear DNA (DAPI), mitochondria (HSP60) and pUb. Zoom-ins of regions of interested are enlarged in the middle panel. 3D-surface renderings of insets are shown on the right. Scale bar = 5 µm or 1 µm. (B,C) Evaluation of 3D-SIM images from HeLa datasets. The changes in pUb volume and minimal distances between mitochondria and pUb after mitophagy-induction are plotted. Error bars depict S.D. from 8-14 measured cells per condition. Two-way ANOVA with multiple comparisons; p(****)<0.0001." + flow_cyto: "(B) Mean Acidic:Neutral mtKeima per-cell ratios measured by flow cytometry for HeLa cells expressing Parkin ndicating the number of hours treated with AO (Antimycin A (5 µM) and Oligomycin (10 µM)) or three hours with 25 nM BafilomycinA (BafA). Error bars depict S.D. from biological triplicate measurements from three independent clones. Two-way ANOVA with multiple comparisons; p(****)<0.0001." + western: "(E) Western blot of cell lysate (Lys) from 2x105 cells, 10k and 200k secreted by 20x106 of 10 tumor cell lines (EO771, TS/A, LLC1, KP, B16F10, MCA101, MB49, Raw264.7, 4T1 and MutuDC) and 2 non-tumoral fibroblast cell lines (Pfa1 and Mus Dunni), showing hybridization with antibodies against env (top) and gag (middle) viral proteins and total proteins (bottom). Gag is observed with different sizes, especially the full-length Pr65 (blue circle), and the mature cleaved p30 (blue cross) forms." + format: + entity: "List of comma separated entities. 'None' if no entity is found." + intervention: "List of comma separated entities. 'None' if no intervention / controlled entities are found." + context: "Line separated asignations following the example: cell_type: Cell_type_name1, cell_type_2 \n organism: Organism_1, organism_2 \n tissue: tissue_name \n cell_line: cell_line_name \n subcellular: subcell_component. Return None for the fields with no information in the caption." + assay: "List of comma separated experimental assays. 'None' if no assay is found." + experiment_yes_or_no: "Return 'Yes' if the legend describes an experiment, 'No' if not." + ncbi_link: "For each identified gene, generate a line with: 'gene_name': 'NCBI gene identifier'. Return 'None' if no genes are identified." + hypothesis: "One hypothesis per line as in the example: Entity 1 --> Entity 2 \n Entity 3 --> Entity 4" + disease: "Comma separated list with the mentioned diseases, 'None' list if no disease is mentioned." + stats: "Simple string. 'None' if no information is found." + significance: "The statistical significance level. 'None' if no information is found" + chemical: "List of comma separated chemicals. 'None' if no chemical compound or molecule is found." + system_messages: + simple: |- + You are a skillful scientist who wants to curate figure legends. 
You will receive a user query consisting of a figure legend and a query, that you must answer based on the figure legend. The input format will be: FIGURE CAPTION: {{figure legend}} ##\n\n## QUERY: {{query}} ##\n\n## ANSWER FORMAT: {{format}}. + Submit your answer EXTRICTLY in the format specified by {{format}}. + + detailed_description: |- + You are a skillful scientist who wants to curate figure legends. You will receive a user query consisting of a figure legend and a query, that you must answer based on the figure legend. The input format will be: FIGURE CAPTION: {{figure legend}} ##\n\n## QUERY: {{query}} ##\n\n## ANSWER FORMAT: {{format}}. + Submit your answer EXTRICTLY in the format specified by {{format}}. + + To correctly answer the query, follow the guidelines described next: + Experiments in cell and molecular biology involve the empirical manipulation, observation and description of biological entities. Biological and chemical entities can be entire organisms, a subset of their constituents or part of the experimental milieu. + + Examples of entities: + - calcium, oligomycin, p53, mitochondria, liver, mus musculus, synapse, HeLa cells are entities. + + Examples of no entities: + - The cell cycle, apoptosis, wound healing or type II diabetes are not entities. + + Entities are assigned to one of seven types spanning successive levels of biological organisations. Each type is mutually exclusive. The seven types are: Small molecules, genes, proteins, cellular components, cell types and cell lines, tissues and organs, organisms and species. + + Example of entity classification: + - ATP is a small molecule + - creb1 is a gene + - CREB1 is a protein + - the Golgi apparatus is a cellular component + - HEK293 cells is a cell line + - the retina is a tissue + - Saccharomyces cerevisiae and PhiX174 are organisms. + + If an entity does not fit any of the predefined types, the undefined type is assigned. + In general, generic terms referring to broad classes of biological components (eg 'proteins', 'cells', 'animals') SHOULD NOT be tagged unless they refer to the object of an assay. + + You should also identify experimental assays. Some examples of these are: + - immunoblot + - western blot + - staining + - gene expression + - flow cytometry (FACS) + + Entity roles: + + ​​Biological entities listed in the caption of a figure each play a different role in the experimental design: some components are altered in a controlled manner, others remain untouched by the experimenter, and some are directly or indirectly assayed to perform measurements or observations. Accordingly, the following roles are defined: + + - Biological component + - Assayed component + - Controlled variable + + Assayed component: + An assayed component is the component that is measured or observed. + + Example for assayed component: + - The proteins detected on a Western blot are the assayed components with the exception of the loading control, if any, which is a normalizing component. + + Controlled variables: + A controlled variable (also called perturbation, intervention, manipulation, alteration or independent variable) is a component that is experimentally altered. A controlled variable MUST be targeted and MUST be controlled. This implies that the experiment MUST involve the same experimental system across experimental groups and MUST involve a comparison between several experimental groups to test whether the controlled variable causes an effect on the assayed component. 
+ + Example for controlled variable: + - The function of the gene creb1 can be investigated by comparing creb1 wt (control group) to creb1-/- knockout (test group) mice; in this experiment, creb1 is the controlled variable. If, and only if, it is appropriately controlled, the purpose of such an experiment is to infer a cause-and-effect relationship. + - If cells are treated with different doses of the PKA inhibitor H89, H89 is tagged as the controlled variable. + - In a siRNA-mediated knock down experiment, the gene targeted by the siRNA is tagged as a controlled variable. + + Biological component: + A biological component is a generic category for any experimentally relevant component which does not fit any of the other defined roles. Often it will contain the organism, the cell, or a generic treatment that is present across all conditions. + few_shot: |- + You are a skillful scientist who wants to curate figure legends. You will receive a user query consisting of a figure legend and a query, that you must answer based on the figure legend. The input format will be: FIGURE CAPTION: {{figure legend}} ##\n\n## QUERY: {{query}} ##\n\n## ANSWER FORMAT: {{format}}. + Submit your answer EXTRICTLY in the format specified by {{format}}. You will receive a series of examples below, showing your expected behavior. At the end of the prompt, the user will input her example. Please provide the required answers. + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: What is the assayed/measured entity? ## + + ## ANSWER FORMAT: List of comma separated entities. 'None' if no entity is found. + + ANSWER: CD63, CD9 + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: What was the intervention/controlled variable in this experiment? ## + + ## ANSWER FORMAT: List of comma separated entities. 'None' if no intervention / controlled entities are found. + + ANSWER: None + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: In what kind of cell/tissue/organism/subcellular component was the experiment performed? ## + + ## ANSWER FORMAT: Line separated asignations following the example: cell_type: Cell_type_name1, cell_type_2 \n organism: Organism_1, organism_2 \n tissue: tissue_name \n cell_line: cell_line_name \n subcellular: subcell_component. Return None for the fields with no information in the caption. + + ANSWER: + cell_type: None + organism: None + tissue: None + cell_line: EO771-myr/palm-mCherry + subcellular: None + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: What kind of experimental assay was used for this experiment? ## + + ## ANSWER FORMAT: List of comma separated experimental assays. 'None' if no assay is found. + + ANSWER: Confocal microscopy + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: Does the legend describe an experiment or not? ## + + ## ANSWER FORMAT: Yes or No. 
+ + ANSWER: Yes + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: Can you formulate the hypothesis that this experiment has tested. ## + + ## ANSWER FORMAT: One hypothesis per line as in the example: Entity 1 --> Entity 2 \n Entity 3 --> Entity 4. + + ANSWER: + EO771-myr/palm-mCherry --> CD9 + EO771-myr/palm-mCherry --> CD63 + + FIGURE CAPTION: (B) Confocal microscopy of EO771-myr/palm-mCherry cells showing DAPI in blue and mCherry in red, CD9 in green and CD63 in magenta (overlay and close-ups). ## + + ## QUERY: Are there any chemical compounds or small molecules mentioned? ## + + ## ANSWER FORMAT: List of comma separated chemicals. 'None' if no chemical compound or molecule is found. + + ANSWER: DAPI + + FIGURE CAPTION: (e) Violin plots depicting lognormalized readcounts of oocyte-specific genes (Bmp15, Ddx4, Dppa3, Gdf9, Kit, Mos, Nlrp5, Zp3). (5 control animals, n=81 oocytes; 9 STZ animals, n=149 oocytes). ## + + ## QUERY: Can you link the identified genes to their NCBI gene identifiers? ## + + ## ANSWER FORMAT: For each identified gene, generate a line with: 'gene_name': 'NCBI gene identifier'. Return 'None' if no genes are identified. + + ANSWER: + Bmp15: 9210 + Ddx4: 13206 + Dppa3: 73708 + Gdf9: 14566 + Kit: 16590 + Mos: 17451 + Nlrp5: 23968 + Zp3: 7784 + + expected: + answer: + 3d_sim: + entity: "Ub (pUb), mitochondria, HSP60" + intervention: "PINK1, FBXO7" + context: "cell_type: None \n organism: None \n tissue: None \n cell_line: HeLa \n subcellular: nucleus, mitochondria" + assay: "3D structured illumination microscopy (3D-SIM)" + experiment_yes_or_no: "Yes" + ncbi_link: "PINK1: 65018 \n FBXO7: 25793" + hypothesis: "PINK1 --> pUb \n FBXO7 --> pUb" + disease: "None" + stats: "Two-way ANOVA with multiple comparisons" + significance: "0.0001" + chemical: "DAPI" + flow_cyto: + entity: "Mean Acidic:Neutral mtKeima per-cell ratios, HeLa, mitochondria" + intervention: "Antimycin A, Oligomycin, BafilomycinA (BafA)" + context: "cell_type: None \n organism: None \n tissue: None \n cell_line: HeLa \n subcellular: mitochondria" + assay: "Flow cytometry" + experiment_yes_or_no: "Yes" + ncbi_link: "Parkin: 5071" + hypothesis: "Antimycin A --> HeLa \n Oligomycin --> HeLa Mean Acidic:Neutral mtKeima per-cell ratios \n BafilomycinA (BafA) --> HeLa Mean Acidic:Neutral mtKeima per-cell ratios" + disease: "None" + stats: "Two-way ANOVA with multiple comparisons" + significance: "0.0001" + chemical: "Antimycin A, Oligomycin, BafilomycinA " + western: + entity: "env, gag" + intervention: "None" + context: "cell_type: fibroblast \n organism: None \n tissue: connective \n cell_line: EO771, TS/A, LLC1, KP, B16F10, MCA101, MB49, Raw264.7, 4T1, MutuDC, Pfa1, Mus Dunni \n subcellular: None" + assay: "Western blot" + experiment_yes_or_no: "Yes" + ncbi_link: "None" + hypothesis: "(EO771, TS/A, LLC1, KP, B16F10, MCA101, MB49, Raw264.7, 4T1, MutuDC, Pfa1, Mus Dunni) --> env \n (EO771, TS/A, LLC1, KP, B16F10, MCA101, MB49, Raw264.7, 4T1, MutuDC, Pfa1, Mus Dunni) --> gag " + disease: "None mentioned, Murine leukemia" + stats: "None" + significance: "None" + chemical: "None" + +kg_schemas: + gene_kg: + cell type: + input_label: cell_type + is_relationship: false + preferred_id: cl + present_in_knowledge_graph: true + properties: + cell_type_name: str + medium: str + organism: str + represented_as: node + disease: + input_label: Disease + is_relationship: false + 
preferred_id: doid + present_in_knowledge_graph: true + properties: + DSM5: str + ICD10: str + name: str + represented_as: node + gene: + exclude_properties: accession + input_label: + - hgnc + - ensg + is_relationship: false + preferred_id: hgnc + present_in_knowledge_graph: true + properties: + id: str + name: str + taxon: int + represented_as: node + gene expressed in cell type: + input_label: gene_expressed_in_cell_type + is_a: gene to expression site association + is_relationship: true + label_as_edge: GENE_EXPRESSED_IN_CELL_TYPE + present_in_knowledge_graph: true + properties: + expression_level: float + represented_as: edge + source: gene + target: cell type + gene to disease association: + input_label: gene_phenotype + is_relationship: true + label_as_edge: PERTURBED_IN + present_in_knowledge_graph: true + properties: + evidence: str + score: float + source: str + represented_as: edge + source: protein + target: disease + gene to phenotype association: + exclude_properties: accession + input_label: + - protein_disease + - gene_disease + is_relationship: true + label_as_edge: PERTURBED + present_in_knowledge_graph: true + properties: + evidence: str + score: float + source: str + represented_as: edge + source: disease + target: + - protein + - gene + gene to protein association: + input_label: gene_protein + is_relationship: true + present_in_knowledge_graph: true + properties: + evidence: str + score: float + source: str + represented_as: edge + source: gene + target: protein + pathway: + input_label: + - reactome + - wikipathways + is_relationship: false + preferred_id: + - reactome + - wikipathways + present_in_knowledge_graph: false + represented_as: node + phosphorylation: + input_label: phosphorylation + is_a: post translational interaction + is_relationship: true + present_in_knowledge_graph: true + represented_as: edge + source: protein + target: protein + post translational interaction: + input_label: post_translational + is_a: pairwise molecular interaction + is_relationship: true + label_as_edge: INTERACTS_POST_TRANSLATIONAL + present_in_knowledge_graph: false + represented_as: node + source: protein + target: protein + protein: + db_collection_name: proteins + input_label: protein + is_relationship: false + preferred_id: uniprot + present_in_knowledge_graph: true + represented_as: node + properties: + genes: str[] + name: str + score: float + taxon: int From 9184b97e0cd485385717131d88421166cdd2fbe2 Mon Sep 17 00:00:00 2001 From: WagnerJon <84773392+WagnerJon@users.noreply.github.com> Date: Mon, 3 Jun 2024 16:26:43 +0200 Subject: [PATCH 3/3] upload results upload of newer results --- end_to_end_query_generation.csv | 2 ++ end_to_end_query_generation_with_reassurance.csv | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 end_to_end_query_generation.csv create mode 100644 end_to_end_query_generation_with_reassurance.csv diff --git a/end_to_end_query_generation.csv b/end_to_end_query_generation.csv new file mode 100644 index 00000000..23f96f31 --- /dev/null +++ b/end_to_end_query_generation.csv @@ -0,0 +1,2 @@ +model_name,subtask,score,relevance_score,iterations,md5_hash,datetime,biochatter_version +gpt-3.5-turbo-0613,simple2,3.0/4,10/10,3,ec2f1b39b5ef150b5f9873f124223c67,2024-05-27 16:04:18,0.4.9 diff --git a/end_to_end_query_generation_with_reassurance.csv b/end_to_end_query_generation_with_reassurance.csv new file mode 100644 index 00000000..24c3ca05 --- /dev/null +++ b/end_to_end_query_generation_with_reassurance.csv @@ -0,0 +1,4 @@ 
+model_name,subtask,score,relevance_score,iterations,md5_hash,datetime,biochatter_version +gpt-3.5-turbo-0125,simple2,4.0/4,8,1,ec2f1b39b5ef150b5f9873f124223c67,2024-05-27 13:21:17,0.4.9 +gpt-3.5-turbo-0613,simple2,3.0/4,10,1,ec2f1b39b5ef150b5f9873f124223c67,2024-05-27 13:22:24,0.4.9 +gpt-4-0613,simple2,4.0/4,10,1,ec2f1b39b5ef150b5f9873f124223c67,2024-05-27 13:32:06,0.4.9
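Note on the result files above: all fields are written as strings, with "score" stored as a fraction (e.g. 3.0/4) while "relevance_score" appears either as a fraction (10/10) or as a bare number (8). The short Python sketch below is not part of the patch; it only illustrates one way the two CSVs added here could be loaded and normalised for a side-by-side look. The helper name "to_fraction" and the relative file paths are illustrative assumptions, not existing project code.

# Illustrative sketch only; assumes it is run from the directory containing the
# two result CSVs added in this patch.
import pandas as pd


def to_fraction(value) -> float:
    """Convert a score such as '3.0/4' or '10/10' to a float between 0 and 1.
    Bare numbers (e.g. a relevance_score of 8) are returned unchanged as floats."""
    text = str(value)
    if "/" in text:
        numerator, denominator = text.split("/", 1)
        return float(numerator) / float(denominator)
    return float(text)


for name in (
    "end_to_end_query_generation",
    "end_to_end_query_generation_with_reassurance",
):
    results = pd.read_csv(f"{name}.csv", header=0)
    # Normalise the fractional correctness score; leave relevance_score as reported.
    results["score_fraction"] = results["score"].map(to_fraction)
    print(name)
    print(results[["model_name", "subtask", "score_fraction", "relevance_score"]])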