Evaluator (#548)
* Evaluator

* Removing example file

* Python test creates a Json result

* Removing pycache

* Spotless apply

* Static Website

* Changes in evaluator module

* New evaluator-example module

* Removing pytest cache

* Creating static website for test results

* README updated

* Changes in the module structure

* README updated
javipacheco authored Nov 22, 2023
1 parent cfaa3ff commit 95de0d5
Showing 14 changed files with 549 additions and 0 deletions.
38 changes: 38 additions & 0 deletions evaluator-example/README.md
@@ -0,0 +1,38 @@
## Evaluator Example

This is an example of how to use the evaluator. The evaluator is a tool for
measuring the performance of a model on a dataset, or for comparing the
performance of multiple models on the same dataset.

This module contains an example that you can simply copy into your project and
adapt to your needs.

### Pre-requisites

You need to have the following installed:

- [Install Poetry](https://python-poetry.org/docs/#installing-with-pipx)
- **Python 3.10.0:** you can set it up with virtualenv:
```bash
virtualenv venv --python=python3.10.0
source venv/bin/activate
```

Once Poetry is installed, you can install the dependencies. Move to the
`evalTest` folder and run the following command:

```bash
poetry install
```

### Usage

To try this example, you can run the following command:

```bash
./gradlew evaluator
```

After running the command, the results are saved as a static website that you
can view by opening `evalTest/publish/index.html` in your browser.
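
In this example, `evalTest/data.json` is generated by the Kotlin `TestExample` class when you run the Gradle task and is then consumed by `py-evaluator/test_evaluator.py`. As a rough, hypothetical sketch of its expected shape (field names follow what the Python script reads; every value below is made up):

```python
import json

# Hypothetical data.json, shaped after the fields read by py-evaluator/test_evaluator.py.
# All values are illustrative; the real file is produced by the Kotlin TestExample class.
data = {
    "description": "Weather assistant",             # report title
    "outputs_description": ["model A", "model B"],  # one label per output variant under test
    "minimum_score": 0.7,                           # threshold for a test case to pass
    "metric": "AnswerRelevancyMetric",              # any other value falls back to factual consistency
    "items": [
        {
            "input": "Will it rain tomorrow in Madrid?",
            "context": ["Tomorrow's forecast for Madrid is sunny."],  # optional
            "actual_outputs": ["It should stay sunny.", "No rain is expected."]  # one per variant, same order
        }
    ]
}

with open("data.json", "w") as f:
    json.dump(data, f, indent=2)
```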
60 changes: 60 additions & 0 deletions evaluator-example/build.gradle.kts
@@ -0,0 +1,60 @@
import java.io.OutputStream

plugins {
  id(libs.plugins.kotlin.jvm.get().pluginId)
  id(libs.plugins.kotlinx.serialization.get().pluginId)
  alias(libs.plugins.spotless)
}

repositories { mavenCentral() }

java {
  sourceCompatibility = JavaVersion.VERSION_11
  targetCompatibility = JavaVersion.VERSION_11
  toolchain { languageVersion = JavaLanguageVersion.of(11) }
}

dependencies {
  implementation(projects.xefCore)
  implementation(projects.xefOpenai)
  implementation(projects.xefEvaluator)
  implementation(libs.suspendApp.core)
  implementation(libs.bundles.arrow)
}

spotless {
  kotlin {
    target("**/*.kt")
    ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) }
  }
}

// Runs TestExample to generate evalTest/data.json for the Python evaluator
tasks.create<JavaExec>("test-example") {
  dependsOn("compileKotlin")

  workingDir("./evalTest")

  group = "Execution"
  description = "Test example"
  classpath = sourceSets.main.get().runtimeClasspath
  mainClass = "com.xebia.funcional.xef.evaluator.examples.TestExample"

  doLast {
    println(">> data.json created!")
  }
}

// Runs the DeepEval test suite through Poetry and publishes results for the static website
tasks.create<Exec>("evaluator") {
  dependsOn("test-example")

  this.standardOutput = OutputStream.nullOutputStream()

  workingDir("./evalTest")

  commandLine("poetry", "run", "deepeval", "test", "run", "py-evaluator/test_evaluator.py")

  doLast {
    println(">> Open evalTest/publish/index.html in your browser")
  }
}

6 changes: 6 additions & 0 deletions evaluator-example/evalTest/.gitignore
@@ -0,0 +1,6 @@
__pycache__
results.json
data.json
publish/content.js
.pytest_cache
poetry.lock
13 changes: 13 additions & 0 deletions evaluator-example/evalTest/publish/index.html
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Tests</title>
    <link rel="stylesheet" href="styles.css">
    <script src="content.js"></script>
    <script src="script.js" defer></script>
</head>
<body>
    <div id="test-container"></div>
</body>
</html>
60 changes: 60 additions & 0 deletions evaluator-example/evalTest/publish/script.js
@@ -0,0 +1,60 @@
document.addEventListener('DOMContentLoaded', function() {

    const container = document.getElementById('test-container');
    const summaryDiv = document.createElement('div');
    summaryDiv.classList.add('test-summary');

    testData.results.forEach(block => {
        const blockDiv = document.createElement('div');
        blockDiv.classList.add('test-block');

        const title = document.createElement('h2');
        title.classList.add('test-title');
        title.textContent = block.description;
        blockDiv.appendChild(title);

        block.tests.forEach(test => {
            const inputDiv = document.createElement('div');
            inputDiv.classList.add(test.assert ? 'input-passed' : 'input-failed');
            inputDiv.textContent = 'Input: ' + test.input;
            blockDiv.appendChild(inputDiv);

            const outputDiv = document.createElement('div');
            outputDiv.classList.add('output');
            outputDiv.textContent = 'Output: ' + test.output;
            outputDiv.addEventListener('click', function() {
                this.classList.toggle('expanded');
            });
            blockDiv.appendChild(outputDiv);

            const scoreDiv = document.createElement('div');
            scoreDiv.classList.add('score', test.assert ? 'score-passed' : 'score-failed');
            scoreDiv.textContent = 'Score: ' + test.score.toFixed(3);
            blockDiv.appendChild(scoreDiv);
        });

        const avgScoreDiv = document.createElement('div');
        avgScoreDiv.classList.add('avg-score');
        avgScoreDiv.textContent = 'Average Score: ' + block.avg.toFixed(3);
        blockDiv.appendChild(avgScoreDiv);

        const testInfoDiv = document.createElement('div');
        testInfoDiv.classList.add('test-info');
        testInfoDiv.innerHTML = `
            Tests Passed: ${block.tests_successful} <br>
            Tests Failed: ${block.tests_failures} <br>
            Success Rate: ${block.success_rate.toFixed(2)}%
        `;
        blockDiv.appendChild(testInfoDiv);

        container.appendChild(blockDiv);

        summaryDiv.innerHTML += `
            <h3>${block.description}</h3>
            Average Score: ${block.avg.toFixed(3)} <br>
            Success Rate: ${block.success_rate.toFixed(2)}% <br><br>
        `;
    });

    container.appendChild(summaryDiv);
});
87 changes: 87 additions & 0 deletions evaluator-example/evalTest/publish/styles.css
@@ -0,0 +1,87 @@
body {
    font-family: Arial, sans-serif;
    margin: 0;
    padding: 0;
    background-color: #f4f4f4;
}

#test-container {
    width: 80%;
    margin: 20px auto;
    padding: 15px;
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.test-block {
    margin-bottom: 20px;
    border-bottom: 1px solid #eee;
    padding-bottom: 20px;
}

.test-title {
    font-size: 1.2em;
    color: #333;
}

.input, .output {
    margin: 5px 0;
}

.input-passed {
    margin-top: 25px;
    color: green;
    font-weight: bold;
}

.input-failed {
    margin-top: 25px;
    color: red;
    font-weight: bold;
}

.output {
    color: #666;
    cursor: pointer;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}

.output.expanded {
    white-space: normal;
}

.score {
    font-weight: bold;
}

.score-passed {
    margin-bottom: 25px;
    color: #008000;
}

.score-failed {
    margin-bottom: 25px;
    color: red;
}

.avg-score, .test-info {
    font-size: 1.2em;
    color: #d35400;
    margin-top: 10px;
}

.test-summary {
    background-color: #e7e7e7;
    padding: 15px;
    margin-top: 20px;
    border-radius: 8px;
}

.test-summary h3 {
    font-size: 1.1em;
    color: #555;
    margin-top: 0;
}
96 changes: 96 additions & 0 deletions evaluator-example/evalTest/py-evaluator/test_evaluator.py
@@ -0,0 +1,96 @@
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import execute_test
import json

# Load the evaluation data generated by the Gradle "test-example" task
f = open('data.json')
data = json.load(f)

appDescription = data['description']

outputs = data['outputs_description']

numberOfOutputs = len(outputs)
minimumScore = float(data['minimum_score'])
metric = data['metric']

print()
print()
print(appDescription)
print("================")
print()
print(f"Using {metric} metric with {numberOfOutputs} different outputs ({minimumScore} minimum score)")

currentOutput = 0

# Select the DeepEval metric; factual consistency is the default
metricObj = FactualConsistencyMetric(minimum_score=minimumScore)

if metric == "AnswerRelevancyMetric":
    metricObj = AnswerRelevancyMetric(minimum_score=minimumScore)

jsonResponse = {
    "description": appDescription,
}

jsonItemResultResponses = []

for x in range(numberOfOutputs):
    jsonItemResponse = {
        "description": outputs[x],
    }
    # Build the test cases for this output variant
    cases = []
    for item in data['items']:
        context = []
        if "context" in item:
            context = item['context']
        cases.append(LLMTestCase(input=item['input'], actual_output=item['actual_outputs'][x], context=context))

    print()
    results = execute_test(cases, [metricObj])
    print(f"Results: {outputs[x]}:")
    totalScore = 0

    jsonResultResponses = []

    # Collect per-test results and aggregate the block statistics
    numberTestSuccessful = 0
    for r in results:
        score = float(r.metrics[0].score)
        testSuccessful = score >= minimumScore
        jsonResultResponse = {
            "input": r.input,
            "output": r.actual_output,
            "score": score,
            "assert": testSuccessful
        }
        if testSuccessful:
            numberTestSuccessful += 1
        jsonResultResponses.append(jsonResultResponse)
        totalScore += r.metrics[0].score
        print(f"- {r.input} -> {r.metrics[0].score}")
    avg = totalScore / len(results)
    successRate = numberTestSuccessful * 100 / len(results)
    jsonItemResponse["tests"] = jsonResultResponses
    jsonItemResponse["avg"] = avg
    jsonItemResponse["tests_successful"] = numberTestSuccessful
    jsonItemResponse["tests_failures"] = len(results) - numberTestSuccessful
    jsonItemResponse["success_rate"] = successRate
    jsonItemResultResponses.append(jsonItemResponse)
    print()
    print(f"Average: {avg}")
    print(f"Success rate: {successRate}")
    print()

jsonResponse["results"] = jsonItemResultResponses

with open("results.json", "w") as outfile:
json.dump(jsonResponse, outfile)

with open("publish/content.js", "w") as outfile:
jsonStr = json.dumps(jsonResponse)
outfile.write(f"const testData = {jsonStr};")

print()

f.close()
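
For orientation, the script writes the same aggregated structure to `results.json` and, wrapped as `const testData = ...`, to `publish/content.js`, which the static page's `script.js` renders. A sketch of that structure with made-up values (key names mirror the assignments above):

```python
# Hypothetical shape of results.json / the testData object in publish/content.js.
# Key names mirror the assignments in test_evaluator.py; the values are made up.
example_results = {
    "description": "Weather assistant",
    "results": [
        {
            "description": "model B",     # one block per entry in outputs_description
            "tests": [
                {
                    "input": "Will it rain tomorrow in Madrid?",
                    "output": "No rain is expected.",
                    "score": 0.91,
                    "assert": True        # score >= minimum_score
                }
            ],
            "avg": 0.91,                  # average score over the block's tests
            "tests_successful": 1,
            "tests_failures": 0,
            "success_rate": 100.0
        }
    ]
}
```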
13 changes: 13 additions & 0 deletions evaluator-example/evalTest/pyproject.toml
@@ -0,0 +1,13 @@
[tool.poetry]
name = "py-evaluator"
version = "0.1.0"
description = "Python evaluator for DeepEval"
authors = ["Xef"]

[tool.poetry.dependencies]
python = "~3.10.0"
deepeval = "0.20.19"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"