# Full Model Evaluation — workflow definition (copied from the page for "Full Model Evaluation #65").
---
# Workflow header: display name, trigger, and token permissions.
name: Full Model Evaluation

# Runs only when started manually (workflow_dispatch). The single input lets
# the caller decide whether the deployment is torn down once the evaluation
# steps have finished.
on:
  workflow_dispatch:
    inputs:
      purge:
        description: 'Whether to purge the deployment after evaluation'
        required: true
        default: false
        type: boolean

permissions:
  # Lets jobs request an OIDC token; presumably needed by the azure/login
  # step, which is configured with client/tenant/subscription IDs only.
  id-token: write
  # Read-only access to the repository contents (enough for checkout).
  contents: read
jobs:
  # Calls the repository's reusable azure-dev workflow (the deployment,
  # judging by the job name) and forwards this workflow's secrets to it.
  deploy-azure:
    uses: ./.github/workflows/azure-dev.yml
    secrets: inherit

  # Runs the RAG and red-teaming evaluations after deploy-azure completes,
  # uploads the result files, and optionally purges the deployment.
  evaluate-models:
    needs: deploy-azure
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the version stays a string rather than a float.
        python_version: ["3.11"]
    env:
      # az/azd credentials
      AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
      AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
      AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
      AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }}
      AZURE_LOCATION: ${{ vars.AZURE_LOCATION }}
    steps:
      # -------------------------- Setup --------------------------------
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64

      - name: Install azd
        # NOTE(review): the version tag was unreadable in the source this file
        # was recovered from (e-mail obfuscation artefact); v1.0.0 is a
        # reconstruction — confirm against the repository history.
        uses: Azure/setup-azd@v1.0.0

      # -------------------------- Azure Login --------------------------------
      - name: Azure login
        uses: azure/login@v2
        with:
          client-id: ${{ env.AZURE_CLIENT_ID }}
          tenant-id: ${{ env.AZURE_TENANT_ID }}
          subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }}

      # -------------------------- Run Evaluation --------------------------------
      - name: Info debug
        # Prints the path of the runner's environment file (not its contents).
        run: |
          echo "$GITHUB_ENV"

      - name: Install dependencies
        working-directory: ./evaluation
        run: |
          python -m pip install --upgrade pip
          pip install -r scripts/requirements.txt
          pip install pyrit

      # Each variable below takes the repository variable (or secret) of the
      # same name and falls back to the value already present in the `env`
      # context when that variable is empty.
      - name: Run AI Rag Evaluations
        working-directory: ./evaluation
        run: |
          python -m scripts evaluate --config=./config.json --numquestions=10
          # Export the last "experiment*" entry listed under ./results so the
          # upload step can locate the result files (path is repo-relative).
          echo "EVALUATION_RESULTS=evaluation/results/$(ls ./results/ | grep "experiment" | tail -n 1)" >> "$GITHUB_ENV"
        env:
          OPENAI_HOST: ${{ vars.OPENAI_HOST == '' && env.OPENAI_HOST || vars.OPENAI_HOST }}
          AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE == '' && env.AZURE_OPENAI_SERVICE || vars.AZURE_OPENAI_SERVICE }}
          AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE == '' && env.AZURE_SEARCH_SERVICE || vars.AZURE_SEARCH_SERVICE }}
          OPENAI_GPT_MODEL: ${{ vars.OPENAI_GPT_MODEL == '' && env.OPENAI_GPT_MODEL || vars.OPENAI_GPT_MODEL }}
          AZURE_OPENAI_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT == '' && env.AZURE_OPENAI_EVAL_DEPLOYMENT || vars.AZURE_OPENAI_EVAL_DEPLOYMENT }}
          AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX == '' && env.AZURE_SEARCH_INDEX || vars.AZURE_SEARCH_INDEX }}
          AZURE_PRINCIPAL_ID: ${{ secrets.AZURE_PRINCIPAL_ID == '' && env.AZURE_PRINCIPAL_ID || secrets.AZURE_PRINCIPAL_ID }}

      - name: Run Red Teaming Evaluations
        working-directory: ./evaluation
        run: |
          python -m scripts red-teaming
        env:
          OPENAI_HOST: ${{ vars.OPENAI_HOST == '' && env.OPENAI_HOST || vars.OPENAI_HOST }}
          AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE == '' && env.AZURE_OPENAI_SERVICE || vars.AZURE_OPENAI_SERVICE }}
          AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE == '' && env.AZURE_SEARCH_SERVICE || vars.AZURE_SEARCH_SERVICE }}
          OPENAI_GPT_MODEL: ${{ vars.OPENAI_GPT_MODEL == '' && env.OPENAI_GPT_MODEL || vars.OPENAI_GPT_MODEL }}
          AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX == '' && env.AZURE_SEARCH_INDEX || vars.AZURE_SEARCH_INDEX }}
          AZURE_PRINCIPAL_ID: ${{ secrets.AZURE_PRINCIPAL_ID == '' && env.AZURE_PRINCIPAL_ID || secrets.AZURE_PRINCIPAL_ID }}
          # Endpoint and deployment settings each read the variable of the
          # same name; previously the chat endpoint was filled from deployment
          # names and the eval deployment from the chat endpoint.
          AZURE_OPENAI_CHAT_ENDPOINT: ${{ vars.AZURE_OPENAI_CHAT_ENDPOINT == '' && env.AZURE_OPENAI_CHAT_ENDPOINT || vars.AZURE_OPENAI_CHAT_ENDPOINT }}
          AZURE_OPENAI_CHAT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT == '' && env.AZURE_OPENAI_CHAT_DEPLOYMENT || vars.AZURE_OPENAI_CHAT_DEPLOYMENT }}
          # NOTE(review): the eval endpoint intentionally(?) reuses the chat
          # endpoint value — confirm whether a separate
          # AZURE_OPENAI_EVAL_ENDPOINT variable should be read instead.
          AZURE_OPENAI_EVAL_ENDPOINT: ${{ vars.AZURE_OPENAI_CHAT_ENDPOINT == '' && env.AZURE_OPENAI_CHAT_ENDPOINT || vars.AZURE_OPENAI_CHAT_ENDPOINT }}
          AZURE_OPENAI_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT == '' && env.AZURE_OPENAI_EVAL_DEPLOYMENT || vars.AZURE_OPENAI_EVAL_DEPLOYMENT }}

      - name: Dump results
        # Uploads the files from the directory exported as EVALUATION_RESULTS
        # by the RAG evaluation step, plus three PNG files under evaluation/.
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results
          path: |
            ${{ env.EVALUATION_RESULTS }}/summary.json
            ${{ env.EVALUATION_RESULTS }}/eval_results.jsonl
            ${{ env.EVALUATION_RESULTS }}/config.json
            ${{ env.EVALUATION_RESULTS }}/evaluate_parameters.json
            evaluation/eval.png
            evaluation/mean_score.png
            evaluation/passing_rate.png

      # -------------------------- Cleanup --------------------------------
      - name: Cleanup
        # Runs only when the `purge` input is true. NOTE(review): with no
        # status function in the condition, this step is skipped when an
        # earlier step fails — confirm that leaving the deployment in place
        # after a failed evaluation is intended.
        if: ${{ inputs.purge == true }}
        run: azd down --force --purge