fix: Major restructuring

Signed-off-by: Phoevos Kalemkeris <[email protected]>
ucl-contoso-chat · Sep 2, 2024 · 6f2add5 · 6f2add5
1 parent 8a0e627
commit 6f2add5
Show file tree

Hide file tree

Showing 18 changed files with 364 additions and 400 deletions.
diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml
@@ -133,7 +133,7 @@ jobs:
       - name: Run RAG Evaluation and Red-teaming Test
         run: |
           cd app/backend
-          python -m evaluation evaluate --num-questions ${{ inputs.eval-num-questions }}
+          python -m evaluation run --num-questions ${{ inputs.eval-num-questions }}
 
           # Store evaluation results path
           results_path="app/backend/$(ls -d evaluation/results/experiment-* | tail -n 1)"

diff --git a/app/backend/app.py b/app/backend/app.py
@@ -101,7 +101,7 @@
 PYRIT_COMPATIBLE = sys.version_info >= (3, 10) and sys.version_info < (3, 12)
 
 if PYRIT_COMPATIBLE:
-
+    from evaluation.config import get_evaluation_config, get_red_teaming_config
     from evaluation.evaluate import run_evaluation_from_config
     from evaluation.generate import generate_test_qa_data
     from evaluation.service_setup import (
@@ -373,16 +373,40 @@ async def evaluate(auth_claims: dict[str, Any]):
             return jsonify({"message": "No application setting config part in the request", "status": "failed"}), 400
 
         input_data_file = request_files.get("input_data")
-        num_questions = int(request_form.get("num_questions", 0))
+        num_questions_str = request_form.get("num_questions", "")
+        num_questions = None if num_questions_str == "" else int(num_questions_str)
         config = json.loads(request_form.get("config", "{}"))
 
         if input_data_file is not None:
             input_data = [json.loads(line) for line in input_data_file.readlines()]
         save_jsonl(input_data, Path("./evaluation/input/input_temp.jsonl"))
         config["testdata_path"] = "input/input_temp.jsonl"
         config["results_dir"] = "results"
+        config["scorer_dir"] = "scorer_definitions"
+        config["prompt_target"] = "application"
 
-        evaluation_task = asyncio.create_task(run_evaluation_from_config(EVALUATION_DIR, config, num_questions))
+        evaluation_config = get_evaluation_config(
+            enabled=config.get("run_evaluation", True),
+            num_questions=num_questions,
+            target_url=config.get("target_url"),
+        )
+        red_teaming_config = get_red_teaming_config(
+            enabled=config.get("run_red_teaming", True),
+            scorer_dir=Path(config.get("scorer_dir")),
+            prompt_target=config.get("prompt_target"),
+            max_turns=config.get("red_teaming_max_turns"),
+            config=config,
+            target_url=config.get("target_url"),
+        )
+
+        evaluation_task = asyncio.create_task(
+            run_evaluation_from_config(
+                working_dir=EVALUATION_DIR,
+                config=config,
+                evaluation_config=evaluation_config,
+                red_teaming_config=red_teaming_config,
+            )
+        )
 
         while not evaluation_task.done():
             try:
@@ -391,17 +415,14 @@ async def evaluate(auth_claims: dict[str, Any]):
                 evaluation_task.cancel()
                 return jsonify({"error": "Connection Lost, evaluation task was cancelled"}), 500
 
-        success, result = await evaluation_task
+        report_path = await evaluation_task
 
         # if evaluation failed
-        if not success:
+        if not report_path:
             evaluation_task.cancel()
-            return jsonify({"error": result}), 500
-
-        report_path = result / "evaluation_report.pdf"
+            return jsonify({"error": "Evaluation was terminated early due to an error"}), 500
 
         try:
-            # Save the file in memory and remove the original file
             return_data = io.BytesIO()
             with open(report_path, "rb") as fo:
                 return_data.write(fo.read())
@@ -421,7 +442,7 @@ async def generate_qa(auth_claims: dict[str, Any]):
 
     num_questions = request_json.get("num_questions")
     per_source = request_json.get("per_source")
-    output_file = EVALUATION_DIR / "input" / " input_temp.jsonl"
+    output_file = EVALUATION_DIR / "input" / "input_temp.jsonl"
 
     generate_test_qa_data(
         openai_config=get_openai_config_dict(),

diff --git a/app/backend/evaluation/README.md b/app/backend/evaluation/README.md
@@ -13,10 +13,11 @@ Before using the evaluation scripts, you'll need to:
 
 - Have a live deployment of the chat application on Azure
 - Be on an Azure-authenticated shell session.
-  You can run the following command to ensure you're logged in before proceeding:
+  You can run the following commands to ensure you're logged in before proceeding:
 
   ```shell
   az login
+  azd auth login
   ```
 
 - Create a `.env` file with environment variables required by the evaluation scripts.
@@ -66,12 +67,12 @@ On the other hand, to use instances deployed on openai.com, you need to set the
 
 ```shell
 # Shell
-export OPENAICOM_ORGANIZATION="<openai-organization-name>"
-export OPENAICOM_KEY="<access-key>"
+export OPENAI_ORGANIZATION="<openai-organization-name>"
+export OPENAI_API_KEY="<access-key>"
 
 # Powershell
-$env:OPENAICOM_ORGANIZATION = "<openai-organization-name>"
-$env:OPENAICOM_KEY = "<access-key>"
+$env:OPENAI_ORGANIZATION = "<openai-organization-name>"
+$env:OPENAI_API_KEY = "<access-key>"
 ```
 
 ## Generate synthetic data for evaluation
@@ -103,16 +104,17 @@ python -m evaluation generate-answers \
   --output=evaluation/output/qa_answers.jsonl
 ```
 
-## Run evaluation
+## Run evaluation and red teaming
 
-You can run the evaluation script with the following command, specifying the path to the configuration file
-(the provided [evaluation/config.json](./config.json) will be used by default; feel free to edit it or provide your
-own. You should specify the models you want to run evaluation on in the configuration file, with more than one models implying a comparison between them. You can view the available models names with the '--help' option), as well as the number of questions considered (by default, all questions found in the input file will be consumed).
+You can run the evaluation process with the following command. The provided configuration file
+[evaluation/config.json](./config.json) will be used by default; feel free to edit it or provide your own.
+You should specify the models you want to run evaluation on in the configuration file, with more than one
+models implying a comparison between them. You can view the available models' names, as well as all options
+provided by the CLI with the `--help` option. By default, the following command will run both the metrics-based
+GPT evaluation and the red teaming approach.
 
 ```shell
-python -m evaluation evaluate \
-  --config=evaluation/config.json \
-  --num-questions=2
+python -m evaluation run
 ```
 
 ### Specify desired evaluation metrics
@@ -134,10 +136,10 @@ These metrics are calculated by sending a call to the GPT model, asking it to pr
 - [`gpt_fluency`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-fluency) measures the grammatical proficiency of a generative AI's predicted answer.
 - [`f1_score`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#traditional-machine-learning-f1-score) Measures the ratio of the number of shared words between the model generation and the ground truth answers.
 
-### GPT evaluation results
+### Evaluation results
 
-The results of each evaluation are stored in the specified results directory, in a timestamped
-`gpt_evaluation/experiment-XXXXXXXXXX` subdirectory that contains:
+The results of each evaluation run are stored in the specified results directory, in a timestamped
+`experiment-XXXXXXXXXX` subdirectory that contains:
 
 - `config.json`: The original config used for the run. This is useful for reproducing the run.
 - `eval_results.jsonl`: Each question and answer, along with the GPT metrics for each QA pair.
@@ -146,57 +148,7 @@ The results of each evaluation are stored in the specified results directory, in
 - `evaluation_results.png`: Bar charts for the pass count, pass rate and average rating of the evaluation metrics.
 - `evaluation_stat_boxplot.png`: Box charts for the evaluation results corresponding to the answer length, latency,
    and F1 score.
-- `summary.json`: The overall results, e.g. average GPT metrics.
-
-## Run red teaming evaluation
-
-When running the red teaming script, you can opt to execute it against the entire chat application (recommended) or
-just the model used as part of it.
-
-### Run the red teaming script against the entire application
-
-The default and recommended target of the red teaming attack is the entire application (specified explicitly below):
-
-```shell
-python -m evaluation red-teaming \
-  --prompt-target="application" \
-  --scorer-dir=evaluation/scorer_definitions \
-  --config=evaluation/config.json
-```
-
-`scorer-dir` is a directory that contains the customised scorer YAML files (set to the `evaluation/scorer_definitions` directory by default). Each scorer is defined by a YAML file that needs to contain the following fields:
-
-- `category`
-- `true_description`
-- `false_description`
-
-When running red teaming against the entire application, you can specify the models to be compared
-via the `models` list in [config.json](./config.json).
-You can view the available model names with the '--help' option.
-
-### Run the red teaming script against the target OpenAI model on Azure
-
-You can set the `--prompt-target` to `"azureopenai"` to target an Azure-hosted OpenAI model:
-
-```shell
-python -m evaluation red-teaming \
-  --prompt-target="azureopenai" \
-  --scorer-dir=evaluation/scorer_definitions \
-  --config=evaluation/config.json
-```
-
-### Run the red teaming script against other ML models on Azure
-
-You can set the `--prompt-target` to `"azureml"` to target a different Azure-hosted model:
-
-```shell
-python -m evaluation red-teaming \
-  --prompt-target="azureml" \
-  --scorer-dir=evaluation/scorer_definitions \
-  --config=evaluation/config.json
-```
-
-### View red teaming evaluation results
-
-The results of each red teaming experiment are stored in the specified results directory, in a timestamped
-`red_teaming/experiment-XXXXXXXXXX` subdirectory that contains a `scores.json` file with the result and a `red_teaming_results.png` with a tabular visualisation.
+- `summary.json`: The overall GPT evaluation results, e.g. average GPT metrics.
+- `scores.json`: The results of the red teaming approach.
+- `red_teaming_results.png`: A tabular visualisation of the red teaming results.
+- `evaluation_report.pdf`: A PDF report containing the graphs and aggregated metrics for the completed evaluation run.
diff --git a/app/backend/evaluation/cli.py b/app/backend/evaluation/cli.py
@@ -9,11 +9,9 @@
 from rich.logging import RichHandler
 
 from evaluation import service_setup
+from evaluation.config import get_evaluation_config, get_red_teaming_config, load_config
 from evaluation.evaluate import run_evaluation_from_config
 from evaluation.generate import generate_test_qa_answer, generate_test_qa_data
-from evaluation.red_teaming import run_red_teaming
-from evaluation.service_setup import get_models_async
-from evaluation.utils import load_config
 
 EVALUATION_DIR = Path(__file__).parent
 DEFAULT_CONFIG_PATH = EVALUATION_DIR / "config.json"
@@ -36,7 +34,7 @@
 dotenv.load_dotenv(override=True)
 
 get_model_url = os.environ.get("BACKEND_URI") + "/getmodels"
-available_models = asyncio.run(get_models_async(get_model_url))
+available_models = asyncio.run(service_setup.get_models_async(get_model_url))
 
 
 def int_or_none(raw: str) -> Optional[int]:
@@ -48,7 +46,14 @@ def str_or_none(raw: str) -> Optional[str]:
 
 
 @app.command()
-def evaluate(
+def run(
+    evaluation: bool = typer.Option(True, help="Enable or disable running the evaluation."),
+    red_teaming: bool = typer.Option(True, help="Enable or disable running the red teaming evaluation."),
+    target_url: Optional[str] = typer.Option(
+        help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
+        default=None,
+        parser=str_or_none,
+    ),
     config: Path = typer.Option(
         exists=True,
         dir_okay=False,
@@ -60,30 +65,62 @@ def evaluate(
         ),
         default=DEFAULT_CONFIG_PATH,
     ),
-    num_questions: Optional[int] = typer.Option(
-        help="Number of questions to evaluate (defaults to all if not specified).",
-        default=None,
-        parser=int_or_none,
-    ),
-    target_url: Optional[str] = typer.Option(
-        help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
-        default=None,
-        parser=str_or_none,
-    ),
     report_output: Optional[Path] = typer.Option(
-        help="Path to the PDF report output file (defaults to not generating a report).",
+        help="Path for the PDF report output file to be generated.",
         default=None,
         dir_okay=False,
         file_okay=True,
     ),
+    evaluation_num_questions: Optional[int] = typer.Option(
+        help="Number of questions to use for GPT evaluation (defaults to all if not specified).",
+        default=None,
+        parser=int_or_none,
+    ),
+    red_teaming_prompt_target: Optional[str] = typer.Option(
+        default="application",
+        help="Specify the target of the red teaming approach. Must be one of: 'application', 'azureopenai', 'azureml'.",
+    ),
+    red_teaming_scorer_dir: Path = typer.Option(
+        exists=True,
+        dir_okay=True,
+        file_okay=False,
+        help="Path to the directory where the scorer YAML files are stored.",
+        default=DEFAULT_SCORER_DIR,
+    ),
+    red_teaming_max_turns: int = typer.Option(
+        default=3, help="The maximum number of turns to apply the attack strategy for."
+    ),
 ):
-    success, result = asyncio.run(
-        run_evaluation_from_config(EVALUATION_DIR, load_config(config), num_questions, target_url, report_output)
+    config = load_config(config)
+
+    evaluation_config = get_evaluation_config(
+        enabled=evaluation,
+        num_questions=evaluation_num_questions,
+        target_url=target_url,
     )
-    if success:
-        typer.echo(f"Evaluation completed successfully, results saved to: {result.absolute().as_posix()}")
+
+    red_teaming_config = get_red_teaming_config(
+        enabled=red_teaming,
+        scorer_dir=red_teaming_scorer_dir,
+        prompt_target=red_teaming_prompt_target,
+        max_turns=red_teaming_max_turns,
+        config=config,
+        target_url=target_url,
+    )
+
+    report_path = asyncio.run(
+        run_evaluation_from_config(
+            working_dir=EVALUATION_DIR,
+            config=config,
+            evaluation_config=evaluation_config,
+            red_teaming_config=red_teaming_config,
+            report_output=report_output,
+        )
+    )
+    if report_path:
+        typer.echo(f"Evaluation completed successfully, results saved to {report_path.absolute().as_posix()}")
     else:
-        typer.echo("Evaluation failed: " + result)
+        typer.echo("Evaluation failed")
 
 
 @app.command()
@@ -131,60 +168,5 @@ def generate_answers(
     )
 
 
-@app.command()
-def red_teaming(
-    config: Path = typer.Option(
-        exists=True,
-        dir_okay=False,
-        file_okay=True,
-        help=(
-            "Path to the configuration JSON file."
-            " Edit the JSON file to specify the list of models to be evaluated/compared."
-            f" The available models are: {', '.join(available_models)}"
-        ),
-        default=DEFAULT_CONFIG_PATH,
-    ),
-    scorer_dir: Path = typer.Option(
-        exists=True,
-        dir_okay=True,
-        file_okay=False,
-        help="Path to the directory where the scorer YAML files are stored.",
-        default=DEFAULT_SCORER_DIR,
-    ),
-    prompt_target: Optional[str] = typer.Option(
-        default="application",
-        help="Specify the target for the prompt. Must be one of: 'application', 'azureopenai', 'azureml'.",
-    ),
-    target_url: Optional[str] = typer.Option(
-        help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
-        default=None,
-        parser=str_or_none,
-    ),
-    max_turns: int = typer.Option(default=3, help="The maximum number of turns to apply the attack strategy for."),
-):
-    config = load_config(config)
-    red_team = service_setup.get_openai_target()
-    if prompt_target == "application":
-        target = service_setup.get_app_target(config, target_url)
-    elif prompt_target == "azureopenai":
-        target = service_setup.get_openai_target()
-    elif prompt_target == "azureml":
-        target = service_setup.get_azure_ml_chat_target()
-    else:
-        raise ValueError(
-            f"Invalid prompt_target value: {prompt_target}. Must be one of 'application', 'azureopenai', 'azureml'"
-        )
-    asyncio.run(
-        run_red_teaming(
-            working_dir=EVALUATION_DIR,
-            scorer_dir=scorer_dir,
-            config=config,
-            red_teaming_llm=red_team,
-            prompt_target=target,
-            max_turns=max_turns,
-        )
-    )
-
-
 def cli():
     app()