-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Evaluator * Removing example file * Python test creates a Json result * Removing pycache * Spotlless aply * Static Website * Changes in evaluator module * New evaluator-example module * Removing pytest cache * Creating static website for test results * README updated * Changes in the module structure * README updated
- Loading branch information
1 parent
cfaa3ff
commit 95de0d5
Showing
14 changed files
with
549 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
## Evaluator Example | ||
|
||
This is an example of how to use the evaluator. The evaluator is a tool that
measures the performance of a model on a dataset, and it can also be used to
compare the performance of multiple models on the same dataset.
|
||
This module contains an example that you only have to copy to your project and | ||
adapt to your needs. | ||
|
||
### Prerequisites
|
||
You need to have the following installed: | ||
|
||
- [Install Poetry](https://python-poetry.org/docs/#installing-with-pipx) | ||
- **Python 3.10.0:** you can configure it with virtualenv | ||
```bash | ||
virtualenv venv --python=python3.10.0
source venv/bin/activate
``` | ||
|
||
When you have Poetry installed, you can install the dependencies. You have to | ||
move to `evalTest` folder and execute the following command: | ||
|
||
```bash | ||
poetry install | ||
``` | ||
|
||
### Usage | ||
|
||
To try this example, you can run the following command: | ||
|
||
```bash | ||
./gradlew evaluator | ||
``` | ||
|
||
After running the command, the results are saved as a static web page,
which you can view by opening the file: `evalTest/index.html`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Build script for the evaluator example: compiles and runs the Kotlin
// example that produces data.json, then runs the Python DeepEval suite on it.
import java.io.OutputStream

plugins {
  id(libs.plugins.kotlin.jvm.get().pluginId)
  id(libs.plugins.kotlinx.serialization.get().pluginId)
  alias(libs.plugins.spotless)
}

repositories { mavenCentral() }

// Target Java 11 for both source/target compatibility and toolchain resolution.
java {
  sourceCompatibility = JavaVersion.VERSION_11
  targetCompatibility = JavaVersion.VERSION_11
  toolchain { languageVersion = JavaLanguageVersion.of(11) }
}

dependencies {
  implementation(projects.xefCore)
  implementation(projects.xefOpenai)
  implementation(projects.xefEvaluator)
  implementation(libs.suspendApp.core)
  implementation(libs.bundles.arrow)
}

// Code formatting: ktfmt in Google style, with unused-import removal.
spotless {
  kotlin {
    target("**/*.kt")
    ktfmt().googleStyle().configure { it.setRemoveUnusedImport(true) }
  }
}

// Runs the Kotlin example that writes data.json into ./evalTest — the input
// consumed by the "evaluator" task below.
tasks.create<JavaExec>("test-example") {
  dependsOn("compileKotlin")

  workingDir("./evalTest")

  group = "Execution"
  description = "Test example"
  classpath = sourceSets.main.get().runtimeClasspath
  // NOTE(review): "funcional" looks like a typo for "functional" — confirm it
  // matches the example's actual package name before changing it.
  mainClass = "com.xebia.funcional.xef.evaluator.examples.TestExample"

  doLast {
    println(">> data.json created!")
  }
}

// Runs DeepEval (through Poetry) against the generated data.json. The child
// process's stdout is discarded, so the only console feedback is the doLast
// message below.
tasks.create<Exec>("evaluator") {
  dependsOn("test-example")

  this.standardOutput = OutputStream.nullOutputStream()

  workingDir("./evalTest")

  commandLine("poetry", "run", "deepeval", "test", "run", "py-evaluator/test_evaluator.py")

  doLast {
    println(">> Open evalTest/publish/index.html in your browser")
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Python bytecode caches
__pycache__
# Generated evaluation artifacts
results.json
data.json
publish/content.js
# pytest cache
.pytest_cache
# Poetry lockfile is not committed for this example
poetry.lock
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<!DOCTYPE html>
<!-- Static results page: content.js defines the global `testData` that
     script.js renders into #test-container after DOMContentLoaded. -->
<!-- Fixed: lang was "es" but all page content is English. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Tests</title>
    <link rel="stylesheet" href="styles.css">
    <script src="content.js"></script>
    <script src="script.js" defer></script>
</head>
<body>
<div id="test-container"></div>
</body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Renders the evaluation results (the global `testData` defined by
// content.js) into #test-container: one block per output description,
// followed by an overall summary panel.
//
// Fix: the original injected result strings via innerHTML (template
// literals), which lets model outputs/descriptions be parsed as HTML (XSS)
// and re-parsed the whole summary on every `innerHTML +=`. All text is now
// inserted with text nodes; line breaks are real <br> elements.
document.addEventListener('DOMContentLoaded', function () {
  const container = document.getElementById('test-container');
  const summaryDiv = document.createElement('div');
  summaryDiv.classList.add('test-summary');

  // Append `lines` of plain text to `el`, separated by <br> elements.
  function appendTextLines(el, lines) {
    lines.forEach((line, i) => {
      if (i > 0) el.appendChild(document.createElement('br'));
      el.appendChild(document.createTextNode(line));
    });
  }

  testData.results.forEach((block) => {
    const blockDiv = document.createElement('div');
    blockDiv.classList.add('test-block');

    const title = document.createElement('h2');
    title.classList.add('test-title');
    title.textContent = block.description;
    blockDiv.appendChild(title);

    block.tests.forEach((test) => {
      const inputDiv = document.createElement('div');
      inputDiv.classList.add(test.assert ? 'input-passed' : 'input-failed');
      inputDiv.textContent = 'Input: ' + test.input;
      blockDiv.appendChild(inputDiv);

      // Output is truncated by CSS; clicking toggles the full text.
      const outputDiv = document.createElement('div');
      outputDiv.classList.add('output');
      outputDiv.textContent = 'Output: ' + test.output;
      outputDiv.addEventListener('click', function () {
        this.classList.toggle('expanded');
      });
      blockDiv.appendChild(outputDiv);

      const scoreDiv = document.createElement('div');
      scoreDiv.classList.add('score', test.assert ? 'score-passed' : 'score-failed');
      scoreDiv.textContent = 'Score: ' + test.score.toFixed(3);
      blockDiv.appendChild(scoreDiv);
    });

    const avgScoreDiv = document.createElement('div');
    avgScoreDiv.classList.add('avg-score');
    avgScoreDiv.textContent = 'Average Score: ' + block.avg.toFixed(3);
    blockDiv.appendChild(avgScoreDiv);

    const testInfoDiv = document.createElement('div');
    testInfoDiv.classList.add('test-info');
    appendTextLines(testInfoDiv, [
      `Tests Passed: ${block.tests_successful}`,
      `Tests Failed: ${block.tests_failures}`,
      `Success Rate: ${block.success_rate.toFixed(2)}%`,
    ]);
    blockDiv.appendChild(testInfoDiv);

    container.appendChild(blockDiv);

    // Summary entry for this block (previously built with `innerHTML +=`).
    const summaryTitle = document.createElement('h3');
    summaryTitle.textContent = block.description;
    summaryDiv.appendChild(summaryTitle);
    appendTextLines(summaryDiv, [
      `Average Score: ${block.avg.toFixed(3)}`,
      `Success Rate: ${block.success_rate.toFixed(2)}%`,
    ]);
    summaryDiv.appendChild(document.createElement('br'));
    summaryDiv.appendChild(document.createElement('br'));
  });

  container.appendChild(summaryDiv);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* Styles for the static evaluation-results page (index.html). */

body {
  font-family: Arial, sans-serif;
  margin: 0;
  padding: 0;
  background-color: #f4f4f4;
}

/* Central card holding all test blocks and the summary. */
#test-container {
  width: 80%;
  margin: 20px auto;
  padding: 15px;
  background-color: white;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* One block per output description. */
.test-block {
  margin-bottom: 20px;
  border-bottom: 1px solid #eee;
  padding-bottom: 20px;
}

.test-title {
  font-size: 1.2em;
  color: #333;
}

.input, .output {
  margin: 5px 0;
}

/* Input lines are colored by pass/fail (see script.js `test.assert`). */
.input-passed {
  margin-top: 25px;
  color: green;
  font-weight: bold;
}

.input-failed {
  margin-top: 25px;
  color: red;
  font-weight: bold;
}

/* Outputs are truncated to one line; clicking toggles .expanded. */
.output {
  color: #666;
  cursor: pointer;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.output.expanded {
  white-space: normal;
}

.score {
  font-weight: bold;
}

.score-passed {
  margin-bottom: 25px;
  color: #008000;
}

.score-failed {
  margin-bottom: 25px;
  color: red;
}

/* Per-block aggregate figures. */
.avg-score, .test-info {
  font-size: 1.2em;
  color: #d35400;
  margin-top: 10px;
}

/* Overall summary panel appended after all blocks. */
.test-summary {
  background-color: #e7e7e7;
  padding: 15px;
  margin-top: 20px;
  border-radius: 8px;
}

.test-summary h3 {
  font-size: 1.1em;
  color: #555;
  margin-top: 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Runs DeepEval metrics over the model outputs described in data.json and
# writes the results both to results.json and to publish/content.js (the
# data file consumed by the static results website).
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import execute_test
import json

# Close the input file as soon as the JSON is parsed (the original kept it
# open until the end of the script).
with open('data.json') as f:
    data = json.load(f)

appDescription = data['description']
outputs = data['outputs_description']
numberOfOutputs = len(outputs)
minimumScore = float(data['minimum_score'])
metric = data['metric']

print()
print()
print(appDescription)
print("================")
print()
print(f"Using {metric} metric with {numberOfOutputs} different outputs ({minimumScore} minimum score)")

# FactualConsistencyMetric is the default; any value other than
# "AnswerRelevancyMetric" falls back to it.
metricObj = FactualConsistencyMetric(minimum_score=minimumScore)
if metric == "AnswerRelevancyMetric":
    metricObj = AnswerRelevancyMetric(minimum_score=minimumScore)

jsonResponse = {
    "description": appDescription,
}

jsonItemResultResponses = []

for x in range(numberOfOutputs):
    jsonItemResponse = {
        "description": outputs[x],
    }
    cases = []
    for item in data['items']:
        # "context" is optional per item.
        context = item.get('context', [])
        cases.append(LLMTestCase(input=item['input'], actual_output=item['actual_outputs'][x], context=context))

    print()
    results = execute_test(cases, [metricObj])
    print(f"Results: {outputs[x]}:")
    totalScore = 0

    jsonResultResponses = []

    numberTestSuccessful = 0
    for r in results:
        score = float(r.metrics[0].score)
        testsSuccessful = score >= minimumScore
        jsonResultResponse = {
            "input": r.input,
            "output": r.actual_output,
            "score": score,
            "assert": testsSuccessful
        }
        if testsSuccessful:
            numberTestSuccessful += 1
        jsonResultResponses.append(jsonResultResponse)
        # Accumulate the converted float so the sum stays numeric even when
        # the metric reports its score as a non-float type.
        totalScore += score
        print(f"- {r.input} -> {r.metrics[0].score}")
    # Guard against an empty item list — the original divided by zero here.
    avg = totalScore / len(results) if results else 0.0
    successRate = numberTestSuccessful * 100 / len(results) if results else 0.0
    jsonItemResponse["tests"] = jsonResultResponses
    jsonItemResponse["avg"] = avg
    jsonItemResponse["tests_successful"] = numberTestSuccessful
    jsonItemResponse["tests_failures"] = len(results) - numberTestSuccessful
    jsonItemResponse["success_rate"] = successRate
    jsonItemResultResponses.append(jsonItemResponse)
    print()
    print(f"Average: {avg}:")
    print(f"Success rate: {successRate}:")
    print()

jsonResponse["results"] = jsonItemResultResponses

with open("results.json", "w") as outfile:
    json.dump(jsonResponse, outfile)

# content.js exposes the results to the static site as a JS global.
with open("publish/content.js", "w") as outfile:
    jsonStr = json.dumps(jsonResponse)
    outfile.write(f"const testData = {jsonStr};")

print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Poetry project definition for the DeepEval-based evaluator example.
[tool.poetry]
name = "py-evaluator"
version = "0.1.0"
description = "Python evaluator for DeepEval"
authors = ["Xef"]

[tool.poetry.dependencies]
# Pinned to Python 3.10.x — matches the README's virtualenv instructions.
python = "~3.10.0"
deepeval = "0.20.19"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Oops, something went wrong.