Skip to content

Commit

Permalink
fix: Major restructuring
Browse files Browse the repository at this point in the history
Signed-off-by: Phoevos Kalemkeris <[email protected]>
  • Loading branch information
phoevos committed Sep 2, 2024
1 parent 8a0e627 commit 6f2add5
Show file tree
Hide file tree
Showing 18 changed files with 364 additions and 400 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/model-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ jobs:
- name: Run RAG Evaluation and Red-teaming Test
run: |
cd app/backend
python -m evaluation evaluate --num-questions ${{ inputs.eval-num-questions }}
python -m evaluation run --num-questions ${{ inputs.eval-num-questions }}
# Store evaluation results path
results_path="app/backend/$(ls -d evaluation/results/experiment-* | tail -n 1)"
Expand Down
41 changes: 31 additions & 10 deletions app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
PYRIT_COMPATIBLE = sys.version_info >= (3, 10) and sys.version_info < (3, 12)

if PYRIT_COMPATIBLE:

from evaluation.config import get_evaluation_config, get_red_teaming_config
from evaluation.evaluate import run_evaluation_from_config
from evaluation.generate import generate_test_qa_data
from evaluation.service_setup import (
Expand Down Expand Up @@ -373,16 +373,40 @@ async def evaluate(auth_claims: dict[str, Any]):
return jsonify({"message": "No application setting config part in the request", "status": "failed"}), 400

input_data_file = request_files.get("input_data")
num_questions = int(request_form.get("num_questions", 0))
num_questions_str = request_form.get("num_questions", "")
num_questions = None if num_questions_str == "" else int(num_questions_str)
config = json.loads(request_form.get("config", "{}"))

if input_data_file is not None:
input_data = [json.loads(line) for line in input_data_file.readlines()]
save_jsonl(input_data, Path("./evaluation/input/input_temp.jsonl"))
config["testdata_path"] = "input/input_temp.jsonl"
config["results_dir"] = "results"
config["scorer_dir"] = "scorer_definitions"
config["prompt_target"] = "application"

evaluation_task = asyncio.create_task(run_evaluation_from_config(EVALUATION_DIR, config, num_questions))
evaluation_config = get_evaluation_config(
enabled=config.get("run_evaluation", True),
num_questions=num_questions,
target_url=config.get("target_url"),
)
red_teaming_config = get_red_teaming_config(
enabled=config.get("run_red_teaming", True),
scorer_dir=Path(config.get("scorer_dir")),
prompt_target=config.get("prompt_target"),
max_turns=config.get("red_teaming_max_turns"),
config=config,
target_url=config.get("target_url"),
)

evaluation_task = asyncio.create_task(
run_evaluation_from_config(
working_dir=EVALUATION_DIR,
config=config,
evaluation_config=evaluation_config,
red_teaming_config=red_teaming_config,
)
)

while not evaluation_task.done():
try:
Expand All @@ -391,17 +415,14 @@ async def evaluate(auth_claims: dict[str, Any]):
evaluation_task.cancel()
return jsonify({"error": "Connection Lost, evaluation task was cancelled"}), 500

success, result = await evaluation_task
report_path = await evaluation_task

# if evaluation failed
if not success:
if not report_path:
evaluation_task.cancel()
return jsonify({"error": result}), 500

report_path = result / "evaluation_report.pdf"
return jsonify({"error": "Evaluation was terminated early due to an error"}), 500

try:
# Save the file in memory and remove the original file
return_data = io.BytesIO()
with open(report_path, "rb") as fo:
return_data.write(fo.read())
Expand All @@ -421,7 +442,7 @@ async def generate_qa(auth_claims: dict[str, Any]):

num_questions = request_json.get("num_questions")
per_source = request_json.get("per_source")
output_file = EVALUATION_DIR / "input" / " input_temp.jsonl"
output_file = EVALUATION_DIR / "input" / "input_temp.jsonl"

generate_test_qa_data(
openai_config=get_openai_config_dict(),
Expand Down
90 changes: 21 additions & 69 deletions app/backend/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ Before using the evaluation scripts, you'll need to:

- Have a live deployment of the chat application on Azure
- Be on an Azure-authenticated shell session.
You can run the following command to ensure you're logged in before proceeding:
You can run the following commands to ensure you're logged in before proceeding:

```shell
az login
azd auth login
```

- Create a `.env` file with environment variables required by the evaluation scripts.
Expand Down Expand Up @@ -66,12 +67,12 @@ On the other hand, to use instances deployed on openai.com, you need to set the

```shell
# Shell
export OPENAICOM_ORGANIZATION="<openai-organization-name>"
export OPENAICOM_KEY="<access-key>"
export OPENAI_ORGANIZATION="<openai-organization-name>"
export OPENAI_API_KEY="<access-key>"

# Powershell
$env:OPENAICOM_ORGANIZATION = "<openai-organization-name>"
$env:OPENAICOM_KEY = "<access-key>"
$env:OPENAI_ORGANIZATION = "<openai-organization-name>"
$env:OPENAI_API_KEY = "<access-key>"
```

## Generate synthetic data for evaluation
Expand Down Expand Up @@ -103,16 +104,17 @@ python -m evaluation generate-answers \
--output=evaluation/output/qa_answers.jsonl
```

## Run evaluation
## Run evaluation and red teaming

You can run the evaluation script with the following command, specifying the path to the configuration file
(the provided [evaluation/config.json](./config.json) will be used by default; feel free to edit it or provide your
own. You should specify the models you want to run evaluation on in the configuration file, with more than one models implying a comparison between them. You can view the available models names with the '--help' option), as well as the number of questions considered (by default, all questions found in the input file will be consumed).
You can run the evaluation process with the following command. The provided configuration file
[evaluation/config.json](./config.json) will be used by default; feel free to edit it or provide your own.
You should specify the models you want to run evaluation on in the configuration file, with more than one
models implying a comparison between them. You can view the available models' names, as well as all options
provided by the CLI with the `--help` option. By default, the following command will run both the metrics-based
GPT evaluation and the red teaming approach.

```shell
python -m evaluation evaluate \
--config=evaluation/config.json \
--num-questions=2
python -m evaluation run
```

### Specify desired evaluation metrics
Expand All @@ -134,10 +136,10 @@ These metrics are calculated by sending a call to the GPT model, asking it to pr
- [`gpt_fluency`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#ai-assisted-fluency) measures the grammatical proficiency of a generative AI's predicted answer.
- [`f1_score`](https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in#traditional-machine-learning-f1-score) Measures the ratio of the number of shared words between the model generation and the ground truth answers.

### GPT evaluation results
### Evaluation results

The results of each evaluation are stored in the specified results directory, in a timestamped
`gpt_evaluation/experiment-XXXXXXXXXX` subdirectory that contains:
The results of each evaluation run are stored in the specified results directory, in a timestamped
`experiment-XXXXXXXXXX` subdirectory that contains:

- `config.json`: The original config used for the run. This is useful for reproducing the run.
- `eval_results.jsonl`: Each question and answer, along with the GPT metrics for each QA pair.
Expand All @@ -146,57 +148,7 @@ The results of each evaluation are stored in the specified results directory, in
- `evaluation_results.png`: Bar charts for the pass count, pass rate and average rating of the evaluation metrics.
- `evaluation_stat_boxplot.png`: Box charts for the evaluation results corresponding to the answer length, latency,
and F1 score.
- `summary.json`: The overall results, e.g. average GPT metrics.

## Run red teaming evaluation

When running the red teaming script, you can opt to execute it against the entire chat application (recommended) or
just the model used as part of it.

### Run the red teaming script against the entire application

The default and recommended target of the red teaming attack is the entire application (specified explicitly below):

```shell
python -m evaluation red-teaming \
--prompt-target="application" \
--scorer-dir=evaluation/scorer_definitions \
--config=evaluation/config.json
```

`scorer-dir` is a directory that contains the customised scorer YAML files (set to the `evaluation/scorer_definitions` directory by default). Each scorer is defined by a YAML file that needs to contain the following fields:

- `category`
- `true_description`
- `false_description`

When running red teaming against the entire application, you can specify the models to be compared
via the `models` list in [config.json](./config.json).
You can view the available model names with the '--help' option.

### Run the red teaming script against the target OpenAI model on Azure

You can set the `--prompt-target` to `"azureopenai"` to target an Azure-hosted OpenAI model:

```shell
python -m evaluation red-teaming \
--prompt-target="azureopenai" \
--scorer-dir=evaluation/scorer_definitions \
--config=evaluation/config.json
```

### Run the red teaming script against other ML models on Azure

You can set the `--prompt-target` to `"azureml"` to target a different Azure-hosted model:

```shell
python -m evaluation red-teaming \
--prompt-target="azureml" \
--scorer-dir=evaluation/scorer_definitions \
--config=evaluation/config.json
```

### View red teaming evaluation results

The results of each red teaming experiment are stored in the specified results directory, in a timestamped
`red_teaming/experiment-XXXXXXXXXX` subdirectory that contains a `scores.json` file with the result and a `red_teaming_results.png` with a tabular visualisation.
- `summary.json`: The overall GPT evaluation results, e.g. average GPT metrics.
- `scores.json`: The results of the red teaming approach.
- `red_teaming_results.png`: A tabular visualisation of the red teaming results.
- `evaluation_report.pdf`: A PDF report containing the graphs and aggregated metrics for the completed evaluation run.
134 changes: 58 additions & 76 deletions app/backend/evaluation/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@
from rich.logging import RichHandler

from evaluation import service_setup
from evaluation.config import get_evaluation_config, get_red_teaming_config, load_config
from evaluation.evaluate import run_evaluation_from_config
from evaluation.generate import generate_test_qa_answer, generate_test_qa_data
from evaluation.red_teaming import run_red_teaming
from evaluation.service_setup import get_models_async
from evaluation.utils import load_config

EVALUATION_DIR = Path(__file__).parent
DEFAULT_CONFIG_PATH = EVALUATION_DIR / "config.json"
Expand All @@ -36,7 +34,7 @@
dotenv.load_dotenv(override=True)

get_model_url = os.environ.get("BACKEND_URI") + "/getmodels"
available_models = asyncio.run(get_models_async(get_model_url))
available_models = asyncio.run(service_setup.get_models_async(get_model_url))


def int_or_none(raw: str) -> Optional[int]:
Expand All @@ -48,7 +46,14 @@ def str_or_none(raw: str) -> Optional[str]:


@app.command()
def evaluate(
def run(
evaluation: bool = typer.Option(True, help="Enable or disable running the evaluation."),
red_teaming: bool = typer.Option(True, help="Enable or disable running the red teaming evaluation."),
target_url: Optional[str] = typer.Option(
help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
default=None,
parser=str_or_none,
),
config: Path = typer.Option(
exists=True,
dir_okay=False,
Expand All @@ -60,30 +65,62 @@ def evaluate(
),
default=DEFAULT_CONFIG_PATH,
),
num_questions: Optional[int] = typer.Option(
help="Number of questions to evaluate (defaults to all if not specified).",
default=None,
parser=int_or_none,
),
target_url: Optional[str] = typer.Option(
help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
default=None,
parser=str_or_none,
),
report_output: Optional[Path] = typer.Option(
help="Path to the PDF report output file (defaults to not generating a report).",
help="Path for the PDF report output file to be generated.",
default=None,
dir_okay=False,
file_okay=True,
),
evaluation_num_questions: Optional[int] = typer.Option(
help="Number of questions to use for GPT evaluation (defaults to all if not specified).",
default=None,
parser=int_or_none,
),
red_teaming_prompt_target: Optional[str] = typer.Option(
default="application",
help="Specify the target of the red teaming approach. Must be one of: 'application', 'azureopenai', 'azureml'.",
),
red_teaming_scorer_dir: Path = typer.Option(
exists=True,
dir_okay=True,
file_okay=False,
help="Path to the directory where the scorer YAML files are stored.",
default=DEFAULT_SCORER_DIR,
),
red_teaming_max_turns: int = typer.Option(
default=3, help="The maximum number of turns to apply the attack strategy for."
),
):
success, result = asyncio.run(
run_evaluation_from_config(EVALUATION_DIR, load_config(config), num_questions, target_url, report_output)
config = load_config(config)

evaluation_config = get_evaluation_config(
enabled=evaluation,
num_questions=evaluation_num_questions,
target_url=target_url,
)
if success:
typer.echo(f"Evaluation completed successfully, results saved to: {result.absolute().as_posix()}")

red_teaming_config = get_red_teaming_config(
enabled=red_teaming,
scorer_dir=red_teaming_scorer_dir,
prompt_target=red_teaming_prompt_target,
max_turns=red_teaming_max_turns,
config=config,
target_url=target_url,
)

report_path = asyncio.run(
run_evaluation_from_config(
working_dir=EVALUATION_DIR,
config=config,
evaluation_config=evaluation_config,
red_teaming_config=red_teaming_config,
report_output=report_output,
)
)
if report_path:
typer.echo(f"Evaluation completed successfully, results saved to {report_path.absolute().as_posix()}")
else:
typer.echo("Evaluation failed: " + result)
typer.echo("Evaluation failed")


@app.command()
Expand Down Expand Up @@ -131,60 +168,5 @@ def generate_answers(
)


@app.command()
def red_teaming(
config: Path = typer.Option(
exists=True,
dir_okay=False,
file_okay=True,
help=(
"Path to the configuration JSON file."
" Edit the JSON file to specify the list of models to be evaluated/compared."
f" The available models are: {', '.join(available_models)}"
),
default=DEFAULT_CONFIG_PATH,
),
scorer_dir: Path = typer.Option(
exists=True,
dir_okay=True,
file_okay=False,
help="Path to the directory where the scorer YAML files are stored.",
default=DEFAULT_SCORER_DIR,
),
prompt_target: Optional[str] = typer.Option(
default="application",
help="Specify the target for the prompt. Must be one of: 'application', 'azureopenai', 'azureml'.",
),
target_url: Optional[str] = typer.Option(
help="URL of the target service to evaluate (defaults to the value of the BACKEND_URI environment variable).",
default=None,
parser=str_or_none,
),
max_turns: int = typer.Option(default=3, help="The maximum number of turns to apply the attack strategy for."),
):
config = load_config(config)
red_team = service_setup.get_openai_target()
if prompt_target == "application":
target = service_setup.get_app_target(config, target_url)
elif prompt_target == "azureopenai":
target = service_setup.get_openai_target()
elif prompt_target == "azureml":
target = service_setup.get_azure_ml_chat_target()
else:
raise ValueError(
f"Invalid prompt_target value: {prompt_target}. Must be one of 'application', 'azureopenai', 'azureml'"
)
asyncio.run(
run_red_teaming(
working_dir=EVALUATION_DIR,
scorer_dir=scorer_dir,
config=config,
red_teaming_llm=red_team,
prompt_target=target,
max_turns=max_turns,
)
)


def cli():
app()
Loading

0 comments on commit 6f2add5

Please sign in to comment.